├── html-extractor ├── .gitignore ├── src │ ├── main │ │ ├── resources │ │ │ ├── voa │ │ │ │ ├── PeopleInAmerica.txt │ │ │ │ ├── HowAmericaElects.txt │ │ │ │ ├── EverydayGrammarTV.txt │ │ │ │ ├── Let'sLearnEnglish.txt │ │ │ │ ├── America'sNationalParks.txt │ │ │ │ ├── PersonalTechnology.txt │ │ │ │ ├── HealthLifestyle.txt │ │ │ │ ├── ScienceintheNews.txt │ │ │ │ ├── ThisIsAmerica.txt │ │ │ │ ├── NewsWords.txt │ │ │ │ ├── EverydayGrammar.txt │ │ │ │ ├── EnglishAtTheMovies.txt │ │ │ │ └── EnglishInAMinute.txt │ │ │ └── logback.xml │ │ └── java │ │ │ └── org │ │ │ └── apdplat │ │ │ └── extractor │ │ │ └── html │ │ │ ├── HtmlFetcher.java │ │ │ ├── HtmlExtractor.java │ │ │ ├── model │ │ │ ├── ExtractResultItem.java │ │ │ ├── ExtractFunction.java │ │ │ ├── UrlPattern.java │ │ │ ├── HtmlTemplate.java │ │ │ ├── ExtractFailLog.java │ │ │ ├── CssPath.java │ │ │ └── ExtractResult.java │ │ │ ├── impl │ │ │ ├── SeleniumHtmlFetcher.java │ │ │ ├── HtmlUnitHtmlFetcher.java │ │ │ ├── JSoupHtmlFetcher.java │ │ │ └── ExtractFunctionExecutor.java │ │ │ └── demo │ │ │ └── Toutiao.java │ └── test │ │ └── java │ │ └── org │ │ └── apdplat │ │ └── extractor │ │ └── html │ │ └── impl │ │ ├── JSoupHtmlFetcherTest.java │ │ ├── HtmlUnitHtmlFetcherTest.java │ │ └── SeleniumHtmlFetcherTest.java └── pom.xml ├── html-extractor-web ├── .gitignore ├── src │ └── main │ │ ├── webapp │ │ ├── META-INF │ │ │ └── context.xml │ │ ├── WEB-INF │ │ │ └── web.xml │ │ └── api │ │ │ └── all_extract_regular.jsp │ │ ├── java │ │ └── org │ │ │ └── apdplat │ │ │ └── extractor │ │ │ └── html │ │ │ └── server │ │ │ ├── redis │ │ │ ├── RedisClient.java │ │ │ └── RedisListener.java │ │ │ ├── model │ │ │ ├── ExtractResultItem.java │ │ │ ├── ExtractFunction.java │ │ │ ├── UrlPattern.java │ │ │ ├── HtmlTemplate.java │ │ │ ├── ExtractResult.java │ │ │ ├── ExtractFailLog.java │ │ │ └── CssPath.java │ │ │ └── service │ │ │ └── JsonGenerator.java │ │ └── resources │ │ └── logback.xml └── pom.xml ├── .travis.yml ├── .gitignore 
├── pom.xml ├── mvnw.cmd ├── README.md ├── mvnw └── LICENSE.txt /html-extractor/.gitignore: -------------------------------------------------------------------------------- 1 | .settings/ 2 | .classpath 3 | .project 4 | target/ 5 | logs/ 6 | -------------------------------------------------------------------------------- /html-extractor-web/.gitignore: -------------------------------------------------------------------------------- 1 | .settings/ 2 | .classpath 3 | .project 4 | target/ 5 | logs/ 6 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: java 2 | 3 | jdk: 4 | - oraclejdk8 5 | 6 | install: 7 | - mvn -N io.takari:maven:wrapper -------------------------------------------------------------------------------- /html-extractor-web/src/main/webapp/META-INF/context.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | -------------------------------------------------------------------------------- /html-extractor/src/main/resources/voa/PeopleInAmerica.txt: -------------------------------------------------------------------------------- 1 | http://learningenglish.voanews.com/a/makers-row-made-in-america/3386962.html=Maker's Row Made in America 2 | http://learningenglish.voanews.com/a/painting-canvases-that-lives-and-breathes/3315141.html=Athena Zhe Painting a Canvas That Lives and Breathes 3 | http://learningenglish.voanews.com/a/i-didnt-have-to-come-here-i-chose-to-come-here/3308781.html=Andy Shallal 'I Didn't Have to Come Here, I Chose to Come Here' 4 | http://learningenglish.voanews.com/a/introducing-people-in-america/3337834.html=Introducing 'People in America' -------------------------------------------------------------------------------- /html-extractor/src/main/resources/voa/HowAmericaElects.txt: -------------------------------------------------------------------------------- 
1 | http://learningenglish.voanews.com/a/3395534.html=How America Elects Convention Rules 2 | http://learningenglish.voanews.com/a/3369131.html=How America Elects Becoming a Delegate 3 | http://learningenglish.voanews.com/a/3298650.html=How America Elects General Election Day 4 | http://learningenglish.voanews.com/a/3264801.html=How America Elects Conventions 5 | http://learningenglish.voanews.com/a/3254747.html=How America Elects US Political Parties 6 | http://learningenglish.voanews.com/a/3163339.html=How America Elects Caucuses Primaries 7 | http://learningenglish.voanews.com/a/3161858.html=How America Elects Polls Debates 8 | http://learningenglish.voanews.com/a/3158592.html=How America Elects How To Raise Money -------------------------------------------------------------------------------- /html-extractor-web/src/main/webapp/WEB-INF/web.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | org.apdplat.extractor.html.server.redis.RedisListener 6 | 7 | 8 | 9 | redis.host 10 | localhost 11 | 12 | 13 | redis.port 14 | 6379 15 | 16 | 17 | 18 | 30 19 | 20 | 21 | 22 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .settings/ 2 | .classpath 3 | .project 4 | target/ 5 | logs/ 6 | data/ 7 | .idea/ 8 | .gradle/ 9 | build/ 10 | HtmlExtractor.iml 11 | HtmlExtractor.ipr 12 | HtmlExtractor.iws 13 | html-extractor/.settings/ 14 | html-extractor/.classpath 15 | html-extractor/.project 16 | html-extractor/target/ 17 | html-extractor/logs/ 18 | html-extractor/data/ 19 | html-extractor/.idea/ 20 | html-extractor/.gradle/ 21 | html-extractor/build/ 22 | html-extractor/html-extractor.iml 23 | html-extractor/html-extractor.ipr 24 | html-extractor/html-extractor.iws 25 | html-extractor-web/.settings/ 26 | html-extractor-web/.classpath 27 | html-extractor-web/.project 28 | html-extractor-web/target/ 29 
| html-extractor-web/logs/ 30 | html-extractor-web/data/ 31 | html-extractor-web/.idea/ 32 | html-extractor-web/.gradle/ 33 | html-extractor-web/build/ 34 | html-extractor-web/html-extractor-web.iml 35 | html-extractor-web/html-extractor-web.ipr 36 | html-extractor-web/html-extractor-web.iws 37 | html-extractor-web/nb-configuration.xml -------------------------------------------------------------------------------- /html-extractor/src/main/java/org/apdplat/extractor/html/HtmlFetcher.java: -------------------------------------------------------------------------------- 1 | /** 2 | * 3 | * APDPlat - Application Product Development Platform 4 | * Copyright (c) 2013, 杨尚川, yang-shangchuan@qq.com 5 | * 6 | * This program is free software: you can redistribute it and/or modify 7 | * it under the terms of the GNU General Public License as published by 8 | * the Free Software Foundation, either version 3 of the License, or 9 | * (at your option) any later version. 10 | * 11 | * This program is distributed in the hope that it will be useful, 12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 | * GNU General Public License for more details. 15 | * 16 | * You should have received a copy of the GNU General Public License 17 | * along with this program. If not, see . 
18 | * 19 | */ 20 | 21 | package org.apdplat.extractor.html; 22 | 23 | /** 24 | * 25 | * 网页内容获取工具 26 | * @author 杨尚川 27 | */ 28 | public interface HtmlFetcher { 29 | public String fetch(String url); 30 | } 31 | -------------------------------------------------------------------------------- /html-extractor/src/main/resources/logback.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | %m%n 7 | 8 | 9 | 10 | logs/logback.log 11 | 12 | logs/logback_%i.log 13 | 1 14 | 10000 15 | 16 | 17 | 5MB 18 | 19 | 20 | %m%nj 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | -------------------------------------------------------------------------------- /html-extractor/src/main/java/org/apdplat/extractor/html/HtmlExtractor.java: -------------------------------------------------------------------------------- 1 | /** 2 | * 3 | * APDPlat - Application Product Development Platform 4 | * Copyright (c) 2013, 杨尚川, yang-shangchuan@qq.com 5 | * 6 | * This program is free software: you can redistribute it and/or modify 7 | * it under the terms of the GNU General Public License as published by 8 | * the Free Software Foundation, either version 3 of the License, or 9 | * (at your option) any later version. 10 | * 11 | * This program is distributed in the hope that it will be useful, 12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 | * GNU General Public License for more details. 15 | * 16 | * You should have received a copy of the GNU General Public License 17 | * along with this program. If not, see . 
18 | * 19 | */ 20 | 21 | package org.apdplat.extractor.html; 22 | 23 | import org.apdplat.extractor.html.model.ExtractResult; 24 | import java.util.List; 25 | 26 | /** 27 | * Web page extraction tool. 28 | * Extracts an HTML page according to URL patterns, page templates, CSS paths and extract functions. 29 | * 30 | * @author 杨尚川 31 | * 32 | */ 33 | public interface HtmlExtractor { 34 | /** 35 | * Extracts information from a page. 36 | * @param url URL of the page 37 | * @param html HTML content of the page 38 | * @return the extraction results 39 | */ 40 | public List<ExtractResult> extract(String url, String html); 41 | } 42 | -------------------------------------------------------------------------------- /html-extractor-web/src/main/java/org/apdplat/extractor/html/server/redis/RedisClient.java: -------------------------------------------------------------------------------- 1 | /** 2 | * 3 | * APDPlat - Application Product Development Platform 4 | * Copyright (c) 2013, 杨尚川, yang-shangchuan@qq.com 5 | * 6 | * This program is free software: you can redistribute it and/or modify 7 | * it under the terms of the GNU General Public License as published by 8 | * the Free Software Foundation, either version 3 of the License, or 9 | * (at your option) any later version. 10 | * 11 | * This program is distributed in the hope that it will be useful, 12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 | * GNU General Public License for more details. 15 | * 16 | * You should have received a copy of the GNU General Public License 17 | * along with this program. If not, see . 
18 | * 19 | */ 20 | 21 | package org.apdplat.extractor.html.server.redis; 22 | 23 | import static org.apdplat.extractor.html.server.redis.RedisListener.jedisPool; 24 | 25 | import redis.clients.jedis.Jedis; 26 | 27 | /** 28 | * Notifies slave nodes that the extraction rules have changed. 29 | * 30 | * @author 杨尚川 31 | */ 32 | public class RedisClient { 33 | /** 34 | * Call when the extraction rules change: 35 | * publishes the message CHANGE to the Redis channel "pr"; 36 | * per the original author's note, slave nodes then re-initialize their extraction rules. 37 | */ 38 | public void extractRegularChange() { 39 | String message = "CHANGE"; 40 | Jedis jedis = jedisPool.getResource(); 41 | try { 42 | jedis.publish("pr", message); 43 | } finally { 44 | // always hand the connection back to the pool, even if publish throws 45 | jedisPool.returnResource(jedis); 46 | } 47 | } 48 | } 49 | -------------------------------------------------------------------------------- /html-extractor/src/main/java/org/apdplat/extractor/html/model/ExtractResultItem.java: -------------------------------------------------------------------------------- 1 | /** 2 | * 3 | * APDPlat - Application Product Development Platform 4 | * Copyright (c) 2013, 杨尚川, yang-shangchuan@qq.com 5 | * 6 | * This program is free software: you can redistribute it and/or modify 7 | * it under the terms of the GNU General Public License as published by 8 | * the Free Software Foundation, either version 3 of the License, or 9 | * (at your option) any later version. 10 | * 11 | * This program is distributed in the hope that it will be useful, 12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 | * GNU General Public License for more details. 15 | * 16 | * You should have received a copy of the GNU General Public License 17 | * along with this program. If not, see . 
18 | * 19 | */ 20 | 21 | package org.apdplat.extractor.html.model; 22 | 23 | /** 24 | * 网页结构化信息抽取结果项 25 | * 26 | * @author 杨尚川 27 | * 28 | */ 29 | public class ExtractResultItem { 30 | /** 31 | * 抽取结果项保存到那个字段 32 | */ 33 | private String field; 34 | /** 35 | * 抽取结果项的值 36 | */ 37 | private String value; 38 | 39 | public String getField() { 40 | return field; 41 | } 42 | 43 | public void setField(String field) { 44 | this.field = field; 45 | } 46 | 47 | public String getValue() { 48 | return value; 49 | } 50 | 51 | public void setValue(String value) { 52 | this.value = value; 53 | } 54 | 55 | @Override 56 | public String toString() { 57 | return "ExtractResultItem [\nfield=" + field + ", \nvalue=" + value + "]"; 58 | } 59 | } 60 | -------------------------------------------------------------------------------- /html-extractor-web/src/main/java/org/apdplat/extractor/html/server/model/ExtractResultItem.java: -------------------------------------------------------------------------------- 1 | /** 2 | * 3 | * APDPlat - Application Product Development Platform 4 | * Copyright (c) 2013, 杨尚川, yang-shangchuan@qq.com 5 | * 6 | * This program is free software: you can redistribute it and/or modify 7 | * it under the terms of the GNU General Public License as published by 8 | * the Free Software Foundation, either version 3 of the License, or 9 | * (at your option) any later version. 10 | * 11 | * This program is distributed in the hope that it will be useful, 12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 | * GNU General Public License for more details. 15 | * 16 | * You should have received a copy of the GNU General Public License 17 | * along with this program. If not, see . 
18 | * 19 | */ 20 | 21 | package org.apdplat.extractor.html.server.model; 22 | 23 | /** 24 | * 网页结构化信息抽取结果项 25 | * 26 | * @author 杨尚川 27 | * 28 | */ 29 | public class ExtractResultItem { 30 | /** 31 | * 抽取结果项保存到那个字段 32 | */ 33 | private String field; 34 | /** 35 | * 抽取结果项的值 36 | */ 37 | private String value; 38 | 39 | public String getField() { 40 | return field; 41 | } 42 | 43 | public void setField(String field) { 44 | this.field = field; 45 | } 46 | 47 | public String getValue() { 48 | return value; 49 | } 50 | 51 | public void setValue(String value) { 52 | this.value = value; 53 | } 54 | 55 | @Override 56 | public String toString() { 57 | return "ExtractResultItem [\nfield=" + field + ", \nvalue=" + value + "]"; 58 | } 59 | } 60 | -------------------------------------------------------------------------------- /html-extractor/src/main/resources/voa/EverydayGrammarTV.txt: -------------------------------------------------------------------------------- 1 | http://learningenglish.voanews.com/a/3360403.html=Little vs A Little Few vs A Few 2 | http://learningenglish.voanews.com/a/3360402.html=Noncount Nouns 3 | http://learningenglish.voanews.com/a/3360400.html=Adverbs 4 | http://learningenglish.voanews.com/a/3360401.html=American English vs British English 5 | http://learningenglish.voanews.com/a/3255188.html=Should and Shall 6 | http://learningenglish.voanews.com/a/3255195.html=Present Progressive Tense 7 | http://learningenglish.voanews.com/a/3255190.html=Passive and Active Voice 8 | http://learningenglish.voanews.com/a/3255184.html=Causatives 9 | http://learningenglish.voanews.com/a/3255157.html=Present Unreal Conditionals 10 | http://learningenglish.voanews.com/a/3255156.html=Present And Future Real Conditionals 11 | http://learningenglish.voanews.com/a/3255168.html=Double Negatives 12 | http://learningenglish.voanews.com/a/3255174.html=Tag Questions 13 | http://learningenglish.voanews.com/a/3255152.html=Words That Are Coming And Going 14 | 
http://learningenglish.voanews.com/a/3255171.html=For and Since 15 | http://learningenglish.voanews.com/a/3137103.html=Pronouns and Gender 16 | http://learningenglish.voanews.com/a/3137100.html=Gerunds vs Infinitives 17 | http://learningenglish.voanews.com/a/3137104.html=Introducing Articles 18 | http://learningenglish.voanews.com/a/3137098.html=Understanding Fast Talkers 19 | http://learningenglish.voanews.com/a/3137093.html=Simple Past Present Perfect 20 | http://learningenglish.voanews.com/a/3137097.html=Modals for Asking Permission 21 | http://learningenglish.voanews.com/a/3137087.html=Irregular Plurals 22 | http://learningenglish.voanews.com/a/3137090.html=Onomatopoeia 23 | http://learningenglish.voanews.com/a/3137088.html=Pronouns I and Me 24 | http://learningenglish.voanews.com/a/3137091.html=Will vs Be Going to -------------------------------------------------------------------------------- /pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 4.0.0 3 | 4 | org.apdplat 5 | HtmlExtractor 6 | 1.1 7 | HtmlExtractor 8 | pom 9 | 10 | https://github.com/ysc/HtmlExtractor 11 | HtmlExtractor是一个Java实现的基于模板的网页结构化信息精准抽取组件。 12 | 13 | 14 | GNU GENERAL PUBLIC LICENSE, Version 3 15 | http://www.gnu.org/licenses/gpl.txt 16 | 17 | 18 | 19 | https://github.com/ysc/HtmlExtractor 20 | scm:git:git://github.com/ysc/HtmlExtractor.git 21 | scm:git:git://github.com/ysc/HtmlExtractor.git 22 | GITHUB HtmlExtractor 23 | 24 | 25 | https://github.com/ysc/HtmlExtractor/issues 26 | github.com 27 | 28 | 29 | 30 | 杨尚川 31 | ysc@apdplat.org 32 | 33 | 34 | 35 | 36 | 杨尚川 37 | ysc@apdplat.org 38 | http://yangshangchuan.iteye.com 39 | 40 | 41 | 42 | UTF-8 43 | 44 | 45 | html-extractor 46 | html-extractor-web 47 | 48 | 49 | -------------------------------------------------------------------------------- /html-extractor/src/main/java/org/apdplat/extractor/html/impl/SeleniumHtmlFetcher.java: 
-------------------------------------------------------------------------------- 1 | /* 2 | * APDPlat - Application Product Development Platform 3 | * Copyright (c) 2013, 杨尚川, yang-shangchuan@qq.com 4 | * 5 | * This program is free software: you can redistribute it and/or modify 6 | * it under the terms of the GNU General Public License as published by 7 | * the Free Software Foundation, either version 3 of the License, or 8 | * (at your option) any later version. 9 | * 10 | * This program is distributed in the hope that it will be useful, 11 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 12 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 | * GNU General Public License for more details. 14 | * 15 | * You should have received a copy of the GNU General Public License 16 | * along with this program. If not, see . 17 | */ 18 | 19 | package org.apdplat.extractor.html.impl; 20 | 21 | import org.apdplat.extractor.html.HtmlFetcher; 22 | import org.openqa.selenium.WebDriver; 23 | import org.openqa.selenium.firefox.FirefoxDriver; 24 | import org.slf4j.Logger; 25 | import org.slf4j.LoggerFactory; 26 | 27 | /** 28 | * 29 | * Install geckodriver: 30 | * brew install geckodriver 31 | * 32 | * Uses Selenium to fetch page content after the page's JS has been executed and rendered. 33 | * 34 | * @author 杨尚川 35 | */ 36 | public class SeleniumHtmlFetcher implements HtmlFetcher { 37 | private static final Logger LOGGER = LoggerFactory.getLogger(SeleniumHtmlFetcher.class); 38 | 39 | //Firefox browser driver, created once in the static initializer and shared by all instances 40 | private static final WebDriver WEB_DRIVER = new FirefoxDriver(); 41 | 42 | /** 43 | * Loads the URL in the shared Firefox WebDriver and returns the page source reported by the driver. 44 | * @param url address of the HTML page 45 | * @return the page source, or an empty string if any exception is thrown (the exception is logged) 46 | */ 47 | @Override 48 | public String fetch(String url) { 49 | try{ 50 | LOGGER.debug("url:"+url); 51 | WEB_DRIVER.get(url); 52 | String html = WEB_DRIVER.getPageSource(); 53 | LOGGER.debug("html:"+html); 54 | return html; 55 | }catch (Exception e) { 56 | LOGGER.error("获取URL:"+url+"页面出错", e); 57 | } 58 | return ""; 59 | } 60 | 61 | 
public static void main(String[] args) { 62 | HtmlFetcher htmlFetcher = new SeleniumHtmlFetcher(); 63 | String html = htmlFetcher.fetch("http://apdplat.org"); 64 | System.out.println(html); 65 | } 66 | } 67 | -------------------------------------------------------------------------------- /html-extractor/src/main/java/org/apdplat/extractor/html/impl/HtmlUnitHtmlFetcher.java: -------------------------------------------------------------------------------- 1 | /** 2 | * 3 | * APDPlat - Application Product Development Platform 4 | * Copyright (c) 2013, 杨尚川, yang-shangchuan@qq.com 5 | * 6 | * This program is free software: you can redistribute it and/or modify 7 | * it under the terms of the GNU General Public License as published by 8 | * the Free Software Foundation, either version 3 of the License, or 9 | * (at your option) any later version. 10 | * 11 | * This program is distributed in the hope that it will be useful, 12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 | * GNU General Public License for more details. 15 | * 16 | * You should have received a copy of the GNU General Public License 17 | * along with this program. If not, see . 
18 | * 19 | */ 20 | 21 | package org.apdplat.extractor.html.impl; 22 | 23 | import com.gargoylesoftware.htmlunit.BrowserVersion; 24 | import com.gargoylesoftware.htmlunit.WebClient; 25 | import com.gargoylesoftware.htmlunit.html.HtmlPage; 26 | import org.apdplat.extractor.html.HtmlFetcher; 27 | import org.slf4j.Logger; 28 | import org.slf4j.LoggerFactory; 29 | 30 | /** 31 | * 32 | * Fetches page content with HtmlUnit. HtmlUnit can execute JS and 33 | * render pages dynamically, but not every script renders correctly, so test against the target site. 34 | * @author 杨尚川 35 | */ 36 | public class HtmlUnitHtmlFetcher implements HtmlFetcher { 37 | private static final Logger LOGGER = LoggerFactory.getLogger(HtmlUnitHtmlFetcher.class); 38 | 39 | private static final WebClient WEB_CLIENT = new WebClient(BrowserVersion.INTERNET_EXPLORER_11); 40 | 41 | /** 42 | * Loads the page with the shared HtmlUnit WebClient and returns the XML serialization of its body element. 43 | * @param url address of the HTML page 44 | * @return the body as XML, or an empty string if any exception is thrown (the exception is logged) 45 | */ 46 | @Override 47 | public String fetch(String url) { 48 | try{ 49 | LOGGER.debug("url:"+url); 50 | HtmlPage htmlPage = WEB_CLIENT.getPage(url); 51 | String html = htmlPage.getBody().asXml(); 52 | LOGGER.debug("html:"+html); 53 | return html; 54 | }catch (Exception e) { 55 | LOGGER.error("获取URL:"+url+"页面出错", e); 56 | } 57 | return ""; 58 | } 59 | 60 | public static void main(String[] args) { 61 | HtmlFetcher htmlFetcher = new HtmlUnitHtmlFetcher(); 62 | String html = htmlFetcher.fetch("http://apdplat.org"); 63 | System.out.println(html); 64 | } 65 | } 66 | -------------------------------------------------------------------------------- /html-extractor-web/src/main/java/org/apdplat/extractor/html/server/redis/RedisListener.java: -------------------------------------------------------------------------------- 1 | /** 2 | * 3 | * APDPlat - Application Product Development Platform 4 | * Copyright (c) 2013, 杨尚川, yang-shangchuan@qq.com 5 | * 6 | * This program is free software: you can redistribute it and/or modify 7 | * it under the terms of the GNU General Public License as published by 8 | * the Free Software Foundation, 
either version 3 of the License, or 9 | * (at your option) any later version. 10 | * 11 | * This program is distributed in the hope that it will be useful, 12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 | * GNU General Public License for more details. 15 | * 16 | * You should have received a copy of the GNU General Public License 17 | * along with this program. If not, see . 18 | * 19 | */ 20 | 21 | package org.apdplat.extractor.html.server.redis; 22 | 23 | import javax.servlet.ServletContext; 24 | import javax.servlet.ServletContextEvent; 25 | import javax.servlet.ServletContextListener; 26 | import org.slf4j.Logger; 27 | import org.slf4j.LoggerFactory; 28 | import redis.clients.jedis.JedisPool; 29 | import redis.clients.jedis.JedisPoolConfig; 30 | 31 | /** 32 | * Redis监听器 33 | * 34 | * @author 杨尚川 35 | */ 36 | public class RedisListener implements ServletContextListener { 37 | private static final Logger LOGGER = LoggerFactory.getLogger(RedisListener.class); 38 | public static JedisPool jedisPool; 39 | 40 | @Override 41 | public void contextInitialized(ServletContextEvent sce) { 42 | ServletContext sc = sce.getServletContext(); 43 | String redisHost = sc.getInitParameter("redis.host"); 44 | String redisPort = sc.getInitParameter("redis.port"); 45 | LOGGER.info("redis.host: " + redisHost); 46 | LOGGER.info("redis.port: " + redisPort); 47 | LOGGER.info("开始初始化JedisPool"); 48 | try { 49 | JedisPoolConfig jedispool_config = new JedisPoolConfig(); 50 | jedisPool = new JedisPool(jedispool_config, redisHost, Integer.parseInt(redisPort)); 51 | LOGGER.info("初始化JedisPool成功"); 52 | } catch (Exception e) { 53 | LOGGER.error("初始化JedisPool失败", e); 54 | } 55 | } 56 | 57 | @Override 58 | public void contextDestroyed(ServletContextEvent sce) { 59 | jedisPool.destroy(); 60 | LOGGER.info("关闭JedisPool"); 61 | } 62 | } 63 | 
-------------------------------------------------------------------------------- /html-extractor-web/src/main/resources/logback.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | %m%n 7 | 8 | 9 | 10 | logs/logback.log 11 | 12 | logs/logback_%i.log 13 | 1 14 | 10000 15 | 16 | 17 | 5MB 18 | 19 | 20 | %m%nj 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | -------------------------------------------------------------------------------- /html-extractor/src/main/resources/voa/Let'sLearnEnglish.txt: -------------------------------------------------------------------------------- 1 | http://learningenglish.voanews.com/a/lets-learn-english-lesson-19-when-do-you-start/3357760.html=Lesson 19 When Do I Start 2 | http://learningenglish.voanews.com/a/lets-learn-english-lesson-18-she-always-does-that/3357748.html=Lesson 18 She Always Does That 3 | http://learningenglish.voanews.com/a/are-you-free-on-friday-lets-learn-english/3355785.html=Lesson 17 Are You Free on Friday 4 | http://learningenglish.voanews.com/a/lets-learn-english-lesson-16-where-are-you-from/3355849.html=Lesson 16 Where Are You From 5 | http://learningenglish.voanews.com/a/lets-learn-english-lesson-15-i-love-people-watching/3343720.html=Lesson 15 I Love People-Watching 6 | http://learningenglish.voanews.com/a/lets-learn-english-review-lessons-10-14/3329289.html=Let's Learn English A Review of Lessons 10 -14 7 | http://learningenglish.voanews.com/a/lets-learn-english-lesson-14-how-about-this/3323771.html=Lesson 14 How About This 8 | http://learningenglish.voanews.com/a/lets-learn-english-lesson-13-happy-birthday-william-shakespeare/3312239.html=Lesson 13 Happy Birthday William Shakespeare 9 | http://learningenglish.voanews.com/a/lets-learn-english-lesson-12-meet-my-family/3301733.html=Lesson 12 Meet My Family 10 | 
http://learningenglish.voanews.com/a/lets-learn-english-lesson-11-this-is-my-neighborhood/3293986.html=Lesson 11 This Is My Neighborhood 11 | http://learningenglish.voanews.com/a/lets-learn-english-lesson-10/3285228.html=Lesson 10 Come Over to My Place 12 | http://learningenglish.voanews.com/a/lets-learn-english-review-1-9/3276044.html=Let's Learn English A Review of Lessons 1 - 9 13 | http://learningenglish.voanews.com/a/lets-learn-english-lesson-9-is-it-cold/3261789.html=Lesson 9 Is It Cold 14 | http://learningenglish.voanews.com/a/lets-learn-english-lesson-8-are-you-busy/3253185.html=Lesson 8 Are You Busy 15 | http://learningenglish.voanews.com/a/lets-learn-english-lesson-7-what-are-you-doing/3240468.html=Lesson 7 What Are You Doing 16 | http://learningenglish.voanews.com/a/lets-learn-english-lesson-6-where-is-the-gym/3225958.html=Lesson 6 Where Is the Gym 17 | http://learningenglish.voanews.com/a/lets-learn-english-lesson-5-where-are-you/3168971.html=Lesson 5 Where Are You 18 | http://learningenglish.voanews.com/a/lets-learn-english-lesson-4/3168920.html=Lesson 4 What Is It 19 | http://learningenglish.voanews.com/a/lets-learn-english-lesson-3-i-am-here/3126527.html=Lesson 3 I'm Here 20 | http://learningenglish.voanews.com/a/lets-learn-english-lesson-2-hello/3113733.html=Lesson 2 Hello I'm Anna 21 | http://learningenglish.voanews.com/a/lets-learn-english-lesson-one/3111026.html=Lesson 1 Welcome -------------------------------------------------------------------------------- /html-extractor/src/main/java/org/apdplat/extractor/html/model/ExtractFunction.java: -------------------------------------------------------------------------------- 1 | /** 2 | * 3 | * APDPlat - Application Product Development Platform 4 | * Copyright (c) 2013, 杨尚川, yang-shangchuan@qq.com 5 | * 6 | * This program is free software: you can redistribute it and/or modify 7 | * it under the terms of the GNU General Public License as published by 8 | * the Free Software Foundation, either version 
3 of the License, or 9 | * (at your option) any later version. 10 | * 11 | * This program is distributed in the hope that it will be useful, 12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 | * GNU General Public License for more details. 15 | * 16 | * You should have received a copy of the GNU General Public License 17 | * along with this program. If not, see . 18 | * 19 | */ 20 | 21 | package org.apdplat.extractor.html.model; 22 | 23 | /** 24 | * 抽取函数 25 | * 抽取函数是页面模板的二级元素 26 | * 可以精准地控制抽取的内容 27 | * 28 | * @author 杨尚川 29 | * 30 | */ 31 | public class ExtractFunction { 32 | /** 33 | * 抽取函数对应的CSS路径 34 | */ 35 | private CssPath cssPath; 36 | /** 37 | * 抽取函数(只能使用系统内置支持的函数) 38 | */ 39 | private String extractExpression; 40 | /** 41 | * 抽取函数提取出的文本存储到哪个字段 42 | */ 43 | private String fieldName; 44 | /** 45 | * 抽取函数提取出的字段的中文含义,仅仅起注释作用,利于理解 46 | */ 47 | private String fieldDescription; 48 | 49 | public CssPath getCssPath() { 50 | return cssPath; 51 | } 52 | 53 | public void setCssPath(CssPath cssPath) { 54 | this.cssPath = cssPath; 55 | } 56 | 57 | public String getExtractExpression() { 58 | return extractExpression; 59 | } 60 | 61 | public void setExtractExpression(String extractExpression) { 62 | this.extractExpression = extractExpression; 63 | } 64 | 65 | public String getFieldName() { 66 | return fieldName; 67 | } 68 | 69 | public void setFieldName(String fieldName) { 70 | this.fieldName = fieldName; 71 | } 72 | 73 | public String getFieldDescription() { 74 | return fieldDescription; 75 | } 76 | 77 | public void setFieldDescription(String fieldDescription) { 78 | this.fieldDescription = fieldDescription; 79 | } 80 | 81 | @Override 82 | public String toString() { 83 | StringBuilder str = new StringBuilder(); 84 | str.append(this.extractExpression).append("\n"); 85 | str.append(this.fieldName).append("\n"); 86 | str.append(this.fieldDescription).append("\n"); 87 | return 
str.toString(); 88 | } 89 | } 90 | -------------------------------------------------------------------------------- /html-extractor-web/src/main/java/org/apdplat/extractor/html/server/model/ExtractFunction.java: -------------------------------------------------------------------------------- 1 | /** 2 | * 3 | * APDPlat - Application Product Development Platform 4 | * Copyright (c) 2013, 杨尚川, yang-shangchuan@qq.com 5 | * 6 | * This program is free software: you can redistribute it and/or modify 7 | * it under the terms of the GNU General Public License as published by 8 | * the Free Software Foundation, either version 3 of the License, or 9 | * (at your option) any later version. 10 | * 11 | * This program is distributed in the hope that it will be useful, 12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 | * GNU General Public License for more details. 15 | * 16 | * You should have received a copy of the GNU General Public License 17 | * along with this program. If not, see . 
18 | * 19 | */ 20 | 21 | package org.apdplat.extractor.html.server.model; 22 | 23 | import org.codehaus.jackson.annotate.JsonIgnore; 24 | 25 | /** 26 | * 抽取函数 27 | * 抽取函数是页面模板的二级元素 28 | * 可以精准地控制抽取的内容 29 | * 30 | * @author 杨尚川 31 | * 32 | */ 33 | public class ExtractFunction { 34 | /** 35 | * 抽取函数对应的CSS路径 36 | */ 37 | @JsonIgnore 38 | private CssPath cssPath; 39 | /** 40 | * 抽取函数(只能使用系统内置支持的函数) 41 | */ 42 | private String extractExpression; 43 | /** 44 | * 抽取函数提取出的文本存储到哪个字段 45 | */ 46 | private String fieldName; 47 | /** 48 | * 抽取函数提取出的字段的中文含义,仅仅起注释作用,利于理解 49 | */ 50 | private String fieldDescription; 51 | 52 | public CssPath getCssPath() { 53 | return cssPath; 54 | } 55 | 56 | public void setCssPath(CssPath cssPath) { 57 | this.cssPath = cssPath; 58 | } 59 | 60 | public String getExtractExpression() { 61 | return extractExpression; 62 | } 63 | 64 | public void setExtractExpression(String extractExpression) { 65 | this.extractExpression = extractExpression; 66 | } 67 | 68 | public String getFieldName() { 69 | return fieldName; 70 | } 71 | 72 | public void setFieldName(String fieldName) { 73 | this.fieldName = fieldName; 74 | } 75 | 76 | public String getFieldDescription() { 77 | return fieldDescription; 78 | } 79 | 80 | public void setFieldDescription(String fieldDescription) { 81 | this.fieldDescription = fieldDescription; 82 | } 83 | 84 | @Override 85 | public String toString() { 86 | StringBuilder str = new StringBuilder(); 87 | str.append(this.extractExpression).append("\n"); 88 | str.append(this.fieldName).append("\n"); 89 | str.append(this.fieldDescription).append("\n"); 90 | return str.toString(); 91 | } 92 | } 93 | -------------------------------------------------------------------------------- /html-extractor/src/main/resources/voa/America'sNationalParks.txt: -------------------------------------------------------------------------------- 1 | http://learningenglish.voanews.com/a/acadia-national-park-americas-national-parks-100/3391329.html=Acadia An East 
Coast Treasure 2 | http://learningenglish.voanews.com/a/whats-trending-today-obama-names-stonewall-inn-national-monument/3391152.html=Stonewall Inn Named LGBT National Monument 3 | http://learningenglish.voanews.com/a/young-traveler-hopes-to-visit-every-national-park-in-america-/3378000.html=Young Traveler Hopes to Visit Every National Park 4 | http://learningenglish.voanews.com/a/americas-national-parks-wrangell-st-elias-alaska/3381159.html=The Untouched Beauty of Wrangell-St. Elias National Park 5 | http://learningenglish.voanews.com/a/americas-national-parks-nps-100-yosemite-national-park/3370560.html=Yosemite A Park of Extremes 6 | http://learningenglish.voanews.com/a/everglades-national-park-liquid-heart-of-florida/3360425.html=Everglades National Park The Liquid Heart of Florida 7 | http://learningenglish.voanews.com/a/americas-national-parks-centennial-gettysburg-national-military-park/3348968.html=A Visit with History Gettysburg National Military Park 8 | http://learningenglish.voanews.com/a/americas-national-parks-carlsbad-caverns-national-park-new-mexico/3338983.html=An Underground World Carlsbad Caverns National Park 9 | http://learningenglish.voanews.com/a/americas-national-parks-great-smoky-mountains/3329159.html=Great Smoky Mountains Americas Most Popular National Park 10 | http://learningenglish.voanews.com/a/americans-national-parks-100-death-valley-national-park/3318946.html=Life in Death Valley National Park 11 | http://learningenglish.voanews.com/a/americas-national-parks-new-orleans-jazz-national-historical-park/3308628.html=National Park in New Orleans Celebrates Jazz 12 | http://learningenglish.voanews.com/a/americas-national-parks-mount-rainier-national-park-washington/3297148.html=The Glacial World of Mount Rainier 13 | http://learningenglish.voanews.com/a/americas-national-parks-mesa-verde-colorado/3287589.html=Mesa Verde National Park Protecting an Ancient Culture 14 | 
http://learningenglish.voanews.com/a/national-parks-week-free-entry/3285897.html=National Park Week Features Free Park Entry 15 | http://learningenglish.voanews.com/a/history-and-nature-at-dry-tortugas-national-park/3275242.html=History and Nature at Dry Tortugas National Park 16 | http://learningenglish.voanews.com/a/3261802.html=Hawaii Volcanoes National Park A Fiery World 17 | http://learningenglish.voanews.com/a/cherry-blossoms-signal-the-start-of-spring/3251616.html=Washingtons Cherry Blossoms Signal the Start of Spring 18 | http://learningenglish.voanews.com/a/national-parks-100-series-grand-canyon-national-park/3239133.html=The Grand Canyon Beyond Words 19 | http://learningenglish.voanews.com/a/3223506.html=US Park Honors Womens Rights -------------------------------------------------------------------------------- /html-extractor/src/main/java/org/apdplat/extractor/html/model/UrlPattern.java: -------------------------------------------------------------------------------- 1 | /** 2 | * 3 | * APDPlat - Application Product Development Platform 4 | * Copyright (c) 2013, 杨尚川, yang-shangchuan@qq.com 5 | * 6 | * This program is free software: you can redistribute it and/or modify 7 | * it under the terms of the GNU General Public License as published by 8 | * the Free Software Foundation, either version 3 of the License, or 9 | * (at your option) any later version. 10 | * 11 | * This program is distributed in the hope that it will be useful, 12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 | * GNU General Public License for more details. 15 | * 16 | * You should have received a copy of the GNU General Public License 17 | * along with this program. If not, see . 
18 | * 19 | */ 20 | 21 | package org.apdplat.extractor.html.model; 22 | 23 | import java.util.ArrayList; 24 | import java.util.List; 25 | import java.util.regex.Pattern; 26 | 27 | import org.slf4j.Logger; 28 | import org.slf4j.LoggerFactory; 29 | 30 | /** 31 | * URL模式(使用正则表达式实现) 32 | * 用正则表达式的方式来指定一组有共同页面布局的网页 33 | * 这样就可以对这组页面指定一套模板来抽取信息 34 | * 35 | * @author 杨尚川 36 | * 37 | */ 38 | public class UrlPattern { 39 | private static final Logger LOGGER = LoggerFactory.getLogger(UrlPattern.class); 40 | /** 41 | * URL模式(使用正则表达式实现) 42 | */ 43 | private String urlPattern; 44 | /** 45 | * URL模式(编译好的正则表达式) 46 | */ 47 | private Pattern regexPattern; 48 | /** 49 | * 多个网页模板 50 | */ 51 | private List htmlTemplates = new ArrayList<>(); 52 | 53 | public String getUrlPattern() { 54 | return urlPattern; 55 | } 56 | 57 | public void setUrlPattern(String urlPattern) { 58 | this.urlPattern = urlPattern; 59 | try { 60 | regexPattern = Pattern.compile(urlPattern, Pattern.CASE_INSENSITIVE); 61 | } catch (Exception e) { 62 | LOGGER.error("编译正则表达式["+urlPattern+"]失败:", e); 63 | } 64 | } 65 | 66 | public Pattern getRegexPattern() { 67 | return regexPattern; 68 | } 69 | 70 | public List getHtmlTemplates() { 71 | return htmlTemplates; 72 | } 73 | 74 | public void setHtmlTemplates(List htmlTemplates) { 75 | this.htmlTemplates = htmlTemplates; 76 | for (HtmlTemplate htmlTemplate : this.htmlTemplates) { 77 | htmlTemplate.setUrlPattern(this); 78 | } 79 | } 80 | 81 | public boolean hasHtmlTemplate() { 82 | return !htmlTemplates.isEmpty(); 83 | } 84 | 85 | public void addHtmlTemplate(HtmlTemplate htmlTemplate) { 86 | htmlTemplates.add(htmlTemplate); 87 | htmlTemplate.setUrlPattern(this); 88 | } 89 | } 90 | -------------------------------------------------------------------------------- /html-extractor-web/src/main/java/org/apdplat/extractor/html/server/model/UrlPattern.java: -------------------------------------------------------------------------------- 1 | /** 2 | * 3 | * APDPlat - Application 
Product Development Platform 4 | * Copyright (c) 2013, 杨尚川, yang-shangchuan@qq.com 5 | * 6 | * This program is free software: you can redistribute it and/or modify 7 | * it under the terms of the GNU General Public License as published by 8 | * the Free Software Foundation, either version 3 of the License, or 9 | * (at your option) any later version. 10 | * 11 | * This program is distributed in the hope that it will be useful, 12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 | * GNU General Public License for more details. 15 | * 16 | * You should have received a copy of the GNU General Public License 17 | * along with this program. If not, see . 18 | * 19 | */ 20 | 21 | package org.apdplat.extractor.html.server.model; 22 | 23 | import java.util.ArrayList; 24 | import java.util.List; 25 | import java.util.regex.Pattern; 26 | 27 | import org.slf4j.Logger; 28 | import org.slf4j.LoggerFactory; 29 | 30 | /** 31 | * URL模式(使用正则表达式实现) 32 | * 用正则表达式的方式来指定一组有共同页面布局的网页 33 | * 这样就可以对这组页面指定一套模板来抽取信息 34 | * 35 | * @author 杨尚川 36 | * 37 | */ 38 | public class UrlPattern { 39 | private static final Logger LOGGER = LoggerFactory.getLogger(UrlPattern.class); 40 | /** 41 | * URL模式(使用正则表达式实现) 42 | */ 43 | private String urlPattern; 44 | /** 45 | * URL模式(编译好的正则表达式) 46 | */ 47 | private Pattern regexPattern; 48 | /** 49 | * 多个网页模板 50 | */ 51 | private List htmlTemplates = new ArrayList<>(); 52 | 53 | public String getUrlPattern() { 54 | return urlPattern; 55 | } 56 | 57 | public void setUrlPattern(String urlPattern) { 58 | this.urlPattern = urlPattern; 59 | try { 60 | regexPattern = Pattern.compile(urlPattern, Pattern.CASE_INSENSITIVE); 61 | } catch (Exception e) { 62 | LOGGER.error("编译正则表达式["+urlPattern+"]失败:", e); 63 | } 64 | } 65 | 66 | public Pattern getRegexPattern() { 67 | return regexPattern; 68 | } 69 | 70 | public List getHtmlTemplates() { 71 | return htmlTemplates; 72 | } 73 | 74 | 
public void setHtmlTemplates(List htmlTemplates) { 75 | this.htmlTemplates = htmlTemplates; 76 | for (HtmlTemplate htmlTemplate : this.htmlTemplates) { 77 | htmlTemplate.setUrlPattern(this); 78 | } 79 | } 80 | 81 | public boolean hasHtmlTemplate() { 82 | return !htmlTemplates.isEmpty(); 83 | } 84 | 85 | public void addHtmlTemplate(HtmlTemplate htmlTemplate) { 86 | htmlTemplates.add(htmlTemplate); 87 | htmlTemplate.setUrlPattern(this); 88 | } 89 | } 90 | -------------------------------------------------------------------------------- /html-extractor/src/main/java/org/apdplat/extractor/html/demo/Toutiao.java: -------------------------------------------------------------------------------- 1 | /* 2 | * APDPlat - Application Product Development Platform 3 | * Copyright (c) 2013, 杨尚川, yang-shangchuan@qq.com 4 | * 5 | * This program is free software: you can redistribute it and/or modify 6 | * it under the terms of the GNU General Public License as published by 7 | * the Free Software Foundation, either version 3 of the License, or 8 | * (at your option) any later version. 9 | * 10 | * This program is distributed in the hope that it will be useful, 11 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 12 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 | * GNU General Public License for more details. 14 | * 15 | * You should have received a copy of the GNU General Public License 16 | * along with this program. If not, see . 17 | */ 18 | 19 | package org.apdplat.extractor.html.demo; 20 | 21 | import org.openqa.selenium.By; 22 | import org.openqa.selenium.WebDriver; 23 | import org.openqa.selenium.WebElement; 24 | import org.openqa.selenium.firefox.FirefoxDriver; 25 | 26 | import java.util.List; 27 | import java.util.Random; 28 | 29 | /** 30 | * 如何抓取Js动态生成数据且以滚动页面方式分页的网页 31 | * 以抓取今日头条为例说明:http://toutiao.com/ 32 | * Created by ysc on 10/13/15. 
33 | */ 34 | public class Toutiao { 35 | public static void main(String[] args) throws Exception{ 36 | 37 | //等待数据加载的时间 38 | //为了防止服务器封锁,这里的时间要模拟人的行为,随机且不能太短 39 | long waitLoadBaseTime = 3000; 40 | int waitLoadRandomTime = 3000; 41 | Random random = new Random(System.currentTimeMillis()); 42 | 43 | //火狐浏览器 44 | WebDriver driver = new FirefoxDriver(); 45 | //要抓取的网页 46 | driver.get("http://toutiao.com/"); 47 | 48 | //等待页面动态加载完毕 49 | Thread.sleep(waitLoadBaseTime+random.nextInt(waitLoadRandomTime)); 50 | 51 | //要加载多少页数据 52 | int pages=5; 53 | for(int i=0; i elements = driver.findElements(By.className("title")); 63 | int j=1; 64 | for(int i=0;i. 18 | * 19 | */ 20 | 21 | package org.apdplat.extractor.html.impl; 22 | 23 | import org.apdplat.extractor.html.HtmlFetcher; 24 | import org.jsoup.Connection; 25 | import org.jsoup.Jsoup; 26 | import org.slf4j.Logger; 27 | import org.slf4j.LoggerFactory; 28 | 29 | import java.net.URL; 30 | /** 31 | * 32 | * 使用JSoup获取网页内容 33 | * @author 杨尚川 34 | */ 35 | public class JSoupHtmlFetcher implements HtmlFetcher { 36 | private static final Logger LOGGER = LoggerFactory.getLogger(JSoupHtmlFetcher.class); 37 | 38 | private static final String ACCEPT = "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"; 39 | private static final String ENCODING = "gzip, deflate"; 40 | private static final String LANGUAGE = "zh-cn,zh;q=0.8,en-us;q=0.5,en;q=0.3"; 41 | private static final String CONNECTION = "keep-alive"; 42 | private static final String USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.10; rv:36.0) Gecko/20100101 Firefox/36.0"; 43 | 44 | @Override 45 | public String fetch(String url) { 46 | try { 47 | LOGGER.debug("url:"+url); 48 | String host = new URL(url).getHost(); 49 | Connection conn = Jsoup.connect(url) 50 | .timeout(60000) 51 | .header("Accept", ACCEPT) 52 | .header("Accept-Encoding", ENCODING) 53 | .header("Accept-Language", LANGUAGE) 54 | .header("Connection", CONNECTION) 55 | .header("Referer", "http://"+host) 
56 | .header("Host", host) 57 | .header("User-Agent", USER_AGENT) 58 | .ignoreContentType(true); 59 | String html = conn.get().html(); 60 | LOGGER.debug("html:"+html); 61 | return html; 62 | }catch (Exception e){ 63 | LOGGER.error("获取URL:"+url+"页面出错", e); 64 | } 65 | return ""; 66 | } 67 | 68 | public static void main(String[] args) { 69 | HtmlFetcher htmlFetcher = new JSoupHtmlFetcher(); 70 | String html = htmlFetcher.fetch("http://apdplat.org"); 71 | System.out.println(html); 72 | } 73 | } 74 | -------------------------------------------------------------------------------- /html-extractor/src/main/java/org/apdplat/extractor/html/model/HtmlTemplate.java: -------------------------------------------------------------------------------- 1 | /** 2 | * 3 | * APDPlat - Application Product Development Platform 4 | * Copyright (c) 2013, 杨尚川, yang-shangchuan@qq.com 5 | * 6 | * This program is free software: you can redistribute it and/or modify 7 | * it under the terms of the GNU General Public License as published by 8 | * the Free Software Foundation, either version 3 of the License, or 9 | * (at your option) any later version. 10 | * 11 | * This program is distributed in the hope that it will be useful, 12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 | * GNU General Public License for more details. 15 | * 16 | * You should have received a copy of the GNU General Public License 17 | * along with this program. If not, see . 
18 | * 19 | */ 20 | 21 | package org.apdplat.extractor.html.model; 22 | 23 | import java.util.ArrayList; 24 | import java.util.List; 25 | 26 | /** 27 | * 网页模板 28 | * 一个URL模式会有一到多个网页模板 29 | * 一套网页模板指定了如何精准地抽取网页信息 30 | * 31 | * @author 杨尚川 32 | * 33 | */ 34 | public class HtmlTemplate { 35 | /** 36 | * 网页模板名称,仅仅注释作用 37 | */ 38 | private String templateName; 39 | /** 40 | * 网页提取出的文本存储到哪个表 41 | */ 42 | private String tableName; 43 | /** 44 | * URL模式 45 | */ 46 | private UrlPattern urlPattern; 47 | /** 48 | * 多个CSS路径 49 | */ 50 | private List cssPaths = new ArrayList<>(); 51 | 52 | public String getTemplateName() { 53 | return templateName; 54 | } 55 | 56 | public void setTemplateName(String templateName) { 57 | this.templateName = templateName; 58 | } 59 | 60 | public String getTableName() { 61 | return tableName; 62 | } 63 | 64 | public void setTableName(String tableName) { 65 | this.tableName = tableName; 66 | } 67 | 68 | public UrlPattern getUrlPattern() { 69 | return urlPattern; 70 | } 71 | 72 | public void setUrlPattern(UrlPattern urlPattern) { 73 | this.urlPattern = urlPattern; 74 | } 75 | 76 | public List getCssPaths() { 77 | return cssPaths; 78 | } 79 | 80 | public void setCssPaths(List cssPaths) { 81 | this.cssPaths = cssPaths; 82 | for (CssPath cssPath : this.cssPaths) { 83 | cssPath.setPageTemplate(this); 84 | } 85 | } 86 | 87 | public boolean hasCssPath() { 88 | return !cssPaths.isEmpty(); 89 | } 90 | 91 | public void addCssPath(CssPath cssPath) { 92 | cssPaths.add(cssPath); 93 | cssPath.setPageTemplate(this); 94 | } 95 | 96 | @Override 97 | public String toString() { 98 | StringBuilder str = new StringBuilder(); 99 | str.append("网页模板:").append(this.templateName).append(",存储表:").append(this.tableName).append("\n\n"); 100 | int i = 1; 101 | for (CssPath cssPath : cssPaths) { 102 | str.append(i++).append("、").append(cssPath.toString()).append("\n"); 103 | } 104 | return str.toString(); 105 | } 106 | } 107 | 
-------------------------------------------------------------------------------- /html-extractor-web/src/main/java/org/apdplat/extractor/html/server/model/HtmlTemplate.java: -------------------------------------------------------------------------------- 1 | /** 2 | * 3 | * APDPlat - Application Product Development Platform 4 | * Copyright (c) 2013, 杨尚川, yang-shangchuan@qq.com 5 | * 6 | * This program is free software: you can redistribute it and/or modify 7 | * it under the terms of the GNU General Public License as published by 8 | * the Free Software Foundation, either version 3 of the License, or 9 | * (at your option) any later version. 10 | * 11 | * This program is distributed in the hope that it will be useful, 12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 | * GNU General Public License for more details. 15 | * 16 | * You should have received a copy of the GNU General Public License 17 | * along with this program. If not, see <http://www.gnu.org/licenses/>. 
18 | * 19 | */ 20 | 21 | package org.apdplat.extractor.html.server.model; 22 | 23 | import java.util.ArrayList; 24 | import java.util.List; 25 | import org.codehaus.jackson.annotate.JsonIgnore; 26 | 27 | /** 28 | * 网页模板 29 | * 一个URL模式会有一到多个网页模板 30 | * 一套网页模板指定了如何精准地抽取网页信息 31 | * 32 | * @author 杨尚川 33 | * 34 | */ 35 | public class HtmlTemplate { 36 | /** 37 | * 网页模板名称,仅仅注释作用 38 | */ 39 | private String templateName; 40 | /** 41 | * 网页提取出的文本存储到哪个表 42 | */ 43 | private String tableName; 44 | /** 45 | * URL模式 46 | */ 47 | @JsonIgnore 48 | private UrlPattern urlPattern; 49 | /** 50 | * 多个CSS路径 51 | */ 52 | private List cssPaths = new ArrayList<>(); 53 | 54 | public String getTemplateName() { 55 | return templateName; 56 | } 57 | 58 | public void setTemplateName(String templateName) { 59 | this.templateName = templateName; 60 | } 61 | 62 | public String getTableName() { 63 | return tableName; 64 | } 65 | 66 | public void setTableName(String tableName) { 67 | this.tableName = tableName; 68 | } 69 | 70 | public UrlPattern getUrlPattern() { 71 | return urlPattern; 72 | } 73 | 74 | public void setUrlPattern(UrlPattern urlPattern) { 75 | this.urlPattern = urlPattern; 76 | } 77 | 78 | public List getCssPaths() { 79 | return cssPaths; 80 | } 81 | 82 | public void setCssPaths(List cssPaths) { 83 | this.cssPaths = cssPaths; 84 | for (CssPath cssPath : this.cssPaths) { 85 | cssPath.setPageTemplate(this); 86 | } 87 | } 88 | 89 | public boolean hasCssPath() { 90 | return !cssPaths.isEmpty(); 91 | } 92 | 93 | public void addCssPath(CssPath cssPath) { 94 | cssPaths.add(cssPath); 95 | cssPath.setPageTemplate(this); 96 | } 97 | 98 | @Override 99 | public String toString() { 100 | StringBuilder str = new StringBuilder(); 101 | str.append("网页模板:").append(this.templateName).append(",存储表:").append(this.tableName).append("\n\n"); 102 | int i = 1; 103 | for (CssPath cssPath : cssPaths) { 104 | str.append(i++).append("、").append(cssPath.toString()).append("\n"); 105 | } 106 | return 
str.toString(); 107 | } 108 | } 109 | -------------------------------------------------------------------------------- /html-extractor-web/src/main/java/org/apdplat/extractor/html/server/model/ExtractResult.java: -------------------------------------------------------------------------------- 1 | /** 2 | * 3 | * APDPlat - Application Product Development Platform 4 | * Copyright (c) 2013, 杨尚川, yang-shangchuan@qq.com 5 | * 6 | * This program is free software: you can redistribute it and/or modify 7 | * it under the terms of the GNU General Public License as published by 8 | * the Free Software Foundation, either version 3 of the License, or 9 | * (at your option) any later version. 10 | * 11 | * This program is distributed in the hope that it will be useful, 12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 | * GNU General Public License for more details. 15 | * 16 | * You should have received a copy of the GNU General Public License 17 | * along with this program. If not, see . 
18 | * 19 | */ 20 | 21 | package org.apdplat.extractor.html.server.model; 22 | 23 | import java.util.ArrayList; 24 | import java.util.List; 25 | 26 | /** 27 | * 网页结构化信息抽取结果 28 | * 一个网页模板对应一个抽取结果 29 | * 如果一个网页有多个网页模板 30 | * 每个模板都抽取成功 31 | * 只要这些模板保存在不同的表中 32 | * URL作为主键就不会冲突 33 | * 34 | * @author 杨尚川 35 | * 36 | */ 37 | public class ExtractResult { 38 | /** 39 | * 网页对应的URL 40 | */ 41 | private String url; 42 | /** 43 | * 网页原始内容 44 | */ 45 | private byte[] content; 46 | /** 47 | * 网页编码 48 | */ 49 | private String encoding; 50 | /** 51 | * 网页关键词元数据 52 | */ 53 | private String keywords; 54 | /** 55 | * 网页描述元数据 56 | */ 57 | private String description; 58 | /** 59 | * 网页提取出的文本存储到哪个表 60 | */ 61 | private String tableName; 62 | /** 63 | * 一个网页可能有多个抽取结果项,至少要一个 64 | */ 65 | private final List extractResultItems = new ArrayList<>(); 66 | /** 67 | * 抽取失败日志 68 | */ 69 | private final List extractFailLogs = new ArrayList<>(); 70 | 71 | public boolean isSuccess() { 72 | return extractFailLogs.isEmpty() && !extractResultItems.isEmpty(); 73 | } 74 | 75 | public String getUrl() { 76 | return url; 77 | } 78 | 79 | public void setUrl(String url) { 80 | this.url = url; 81 | } 82 | 83 | public byte[] getContent() { 84 | return content; 85 | } 86 | 87 | public void setContent(byte[] content) { 88 | this.content = content; 89 | } 90 | 91 | public String getEncoding() { 92 | return encoding; 93 | } 94 | 95 | public void setEncoding(String encoding) { 96 | this.encoding = encoding; 97 | } 98 | 99 | public String getKeywords() { 100 | return keywords; 101 | } 102 | 103 | public void setKeywords(String keywords) { 104 | this.keywords = keywords; 105 | } 106 | 107 | public String getDescription() { 108 | return description; 109 | } 110 | 111 | public void setDescription(String description) { 112 | this.description = description; 113 | } 114 | 115 | public String getTableName() { 116 | return tableName; 117 | } 118 | 119 | public void setTableName(String tableName) { 120 | this.tableName = 
tableName; 121 | } 122 | 123 | public List getExtractResultItems() { 124 | return extractResultItems; 125 | } 126 | 127 | public void addExtractResultItem(ExtractResultItem extractResultItem) { 128 | this.extractResultItems.add(extractResultItem); 129 | } 130 | 131 | public List getExtractFailLogs() { 132 | return extractFailLogs; 133 | } 134 | 135 | public void addExtractFailLog(ExtractFailLog extractFailLog) { 136 | this.extractFailLogs.add(extractFailLog); 137 | extractFailLog.setExtractResult(this); 138 | } 139 | 140 | @Override 141 | public String toString() { 142 | return "ExtractResult [\nurl=" + url + ", \ntableName=" + tableName 143 | + ", \nextractResultItems=" + extractResultItems + ", \nextractFailLogs=" + extractFailLogs + "]"; 144 | } 145 | } 146 | -------------------------------------------------------------------------------- /html-extractor/src/main/java/org/apdplat/extractor/html/model/ExtractFailLog.java: -------------------------------------------------------------------------------- 1 | /** 2 | * 3 | * APDPlat - Application Product Development Platform 4 | * Copyright (c) 2013, 杨尚川, yang-shangchuan@qq.com 5 | * 6 | * This program is free software: you can redistribute it and/or modify 7 | * it under the terms of the GNU General Public License as published by 8 | * the Free Software Foundation, either version 3 of the License, or 9 | * (at your option) any later version. 10 | * 11 | * This program is distributed in the hope that it will be useful, 12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 | * GNU General Public License for more details. 15 | * 16 | * You should have received a copy of the GNU General Public License 17 | * along with this program. If not, see . 
18 | * 19 | */ 20 | 21 | package org.apdplat.extractor.html.model; 22 | 23 | /** 24 | * 网页结构化信息抽取失败日志 25 | * 26 | * @author 杨尚川 27 | * 28 | */ 29 | public class ExtractFailLog { 30 | /** 31 | * 网页结构化信息抽取结果 32 | */ 33 | private ExtractResult extractResult; 34 | /** 35 | * 网页的URL 36 | */ 37 | private String url; 38 | /** 39 | * 网页的URL模式 40 | */ 41 | private String urlPattern; 42 | /** 43 | * 网页模板 44 | */ 45 | private String templateName; 46 | /** 47 | * CSS路径 48 | */ 49 | private String cssPath; 50 | /** 51 | * CSS路径下的抽取函数 52 | */ 53 | private String extractExpression; 54 | /** 55 | * 抽取出的内容保存到的表的名称 56 | */ 57 | private String tableName; 58 | /** 59 | * 抽取出的内容保存到的字段名称 60 | */ 61 | private String fieldName; 62 | /** 63 | * 抽取出的内容保存到的字段描述,仅作注释使用 64 | */ 65 | private String fieldDescription; 66 | 67 | public ExtractResult getExtractResult() { 68 | return extractResult; 69 | } 70 | 71 | public void setExtractResult(ExtractResult extractResult) { 72 | this.extractResult = extractResult; 73 | } 74 | 75 | public String getUrl() { 76 | return url; 77 | } 78 | 79 | public void setUrl(String url) { 80 | this.url = url; 81 | } 82 | 83 | public String getUrlPattern() { 84 | return urlPattern; 85 | } 86 | 87 | public void setUrlPattern(String urlPattern) { 88 | this.urlPattern = urlPattern; 89 | } 90 | 91 | public String getTemplateName() { 92 | return templateName; 93 | } 94 | 95 | public void setTemplateName(String templateName) { 96 | this.templateName = templateName; 97 | } 98 | 99 | public String getCssPath() { 100 | return cssPath; 101 | } 102 | 103 | public void setCssPath(String cssPath) { 104 | this.cssPath = cssPath; 105 | } 106 | 107 | public String getExtractExpression() { 108 | return extractExpression; 109 | } 110 | 111 | public void setExtractExpression(String extractExpression) { 112 | this.extractExpression = extractExpression; 113 | } 114 | 115 | public String getTableName() { 116 | return tableName; 117 | } 118 | 119 | public void setTableName(String tableName) 
{ 120 | this.tableName = tableName; 121 | } 122 | 123 | public String getFieldName() { 124 | return fieldName; 125 | } 126 | 127 | public void setFieldName(String fieldName) { 128 | this.fieldName = fieldName; 129 | } 130 | 131 | public String getFieldDescription() { 132 | return fieldDescription; 133 | } 134 | 135 | public void setFieldDescription(String fieldDescription) { 136 | this.fieldDescription = fieldDescription; 137 | } 138 | 139 | @Override 140 | public String toString() { 141 | return "ExtractFailLog [\nurl=" + url + ", \nurlPattern=" + urlPattern 142 | + ", \ntemplateName=" + templateName + ", \ncssPath=" + cssPath 143 | + ", \nextractExpression=" + extractExpression + ", \ntableName=" 144 | + tableName + ", \nfieldName=" + fieldName 145 | + ", \nfieldDescription=" + fieldDescription + "]"; 146 | } 147 | } 148 | -------------------------------------------------------------------------------- /html-extractor-web/src/main/java/org/apdplat/extractor/html/server/model/ExtractFailLog.java: -------------------------------------------------------------------------------- 1 | /** 2 | * 3 | * APDPlat - Application Product Development Platform 4 | * Copyright (c) 2013, 杨尚川, yang-shangchuan@qq.com 5 | * 6 | * This program is free software: you can redistribute it and/or modify 7 | * it under the terms of the GNU General Public License as published by 8 | * the Free Software Foundation, either version 3 of the License, or 9 | * (at your option) any later version. 10 | * 11 | * This program is distributed in the hope that it will be useful, 12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 | * GNU General Public License for more details. 15 | * 16 | * You should have received a copy of the GNU General Public License 17 | * along with this program. If not, see . 
18 | * 19 | */ 20 | 21 | package org.apdplat.extractor.html.server.model; 22 | 23 | /** 24 | * 网页结构化信息抽取失败日志 25 | * 26 | * @author 杨尚川 27 | * 28 | */ 29 | public class ExtractFailLog { 30 | /** 31 | * 网页结构化信息抽取结果 32 | */ 33 | private ExtractResult extractResult; 34 | /** 35 | * 网页的URL 36 | */ 37 | private String url; 38 | /** 39 | * 网页的URL模式 40 | */ 41 | private String urlPattern; 42 | /** 43 | * 网页模板 44 | */ 45 | private String templateName; 46 | /** 47 | * CSS路径 48 | */ 49 | private String cssPath; 50 | /** 51 | * CSS路径下的抽取函数 52 | */ 53 | private String extractExpression; 54 | /** 55 | * 抽取出的内容保存到的表的名称 56 | */ 57 | private String tableName; 58 | /** 59 | * 抽取出的内容保存到的字段名称 60 | */ 61 | private String fieldName; 62 | /** 63 | * 抽取出的内容保存到的字段描述,仅作注释使用 64 | */ 65 | private String fieldDescription; 66 | 67 | public ExtractResult getExtractResult() { 68 | return extractResult; 69 | } 70 | 71 | public void setExtractResult(ExtractResult extractResult) { 72 | this.extractResult = extractResult; 73 | } 74 | 75 | public String getUrl() { 76 | return url; 77 | } 78 | 79 | public void setUrl(String url) { 80 | this.url = url; 81 | } 82 | 83 | public String getUrlPattern() { 84 | return urlPattern; 85 | } 86 | 87 | public void setUrlPattern(String urlPattern) { 88 | this.urlPattern = urlPattern; 89 | } 90 | 91 | public String getTemplateName() { 92 | return templateName; 93 | } 94 | 95 | public void setTemplateName(String templateName) { 96 | this.templateName = templateName; 97 | } 98 | 99 | public String getCssPath() { 100 | return cssPath; 101 | } 102 | 103 | public void setCssPath(String cssPath) { 104 | this.cssPath = cssPath; 105 | } 106 | 107 | public String getExtractExpression() { 108 | return extractExpression; 109 | } 110 | 111 | public void setExtractExpression(String extractExpression) { 112 | this.extractExpression = extractExpression; 113 | } 114 | 115 | public String getTableName() { 116 | return tableName; 117 | } 118 | 119 | public void setTableName(String 
tableName) { 120 | this.tableName = tableName; 121 | } 122 | 123 | public String getFieldName() { 124 | return fieldName; 125 | } 126 | 127 | public void setFieldName(String fieldName) { 128 | this.fieldName = fieldName; 129 | } 130 | 131 | public String getFieldDescription() { 132 | return fieldDescription; 133 | } 134 | 135 | public void setFieldDescription(String fieldDescription) { 136 | this.fieldDescription = fieldDescription; 137 | } 138 | 139 | @Override 140 | public String toString() { 141 | return "ExtractFailLog [\nurl=" + url + ", \nurlPattern=" + urlPattern 142 | + ", \ntemplateName=" + templateName + ", \ncssPath=" + cssPath 143 | + ", \nextractExpression=" + extractExpression + ", \ntableName=" 144 | + tableName + ", \nfieldName=" + fieldName 145 | + ", \nfieldDescription=" + fieldDescription + "]"; 146 | } 147 | } 148 | -------------------------------------------------------------------------------- /html-extractor/src/main/java/org/apdplat/extractor/html/model/CssPath.java: -------------------------------------------------------------------------------- 1 | /** 2 | * 3 | * APDPlat - Application Product Development Platform 4 | * Copyright (c) 2013, 杨尚川, yang-shangchuan@qq.com 5 | * 6 | * This program is free software: you can redistribute it and/or modify 7 | * it under the terms of the GNU General Public License as published by 8 | * the Free Software Foundation, either version 3 of the License, or 9 | * (at your option) any later version. 10 | * 11 | * This program is distributed in the hope that it will be useful, 12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 | * GNU General Public License for more details. 15 | * 16 | * You should have received a copy of the GNU General Public License 17 | * along with this program. If not, see . 
18 | * 19 | */ 20 | 21 | package org.apdplat.extractor.html.model; 22 | 23 | import java.util.ArrayList; 24 | import java.util.List; 25 | 26 | /** 27 | * CSS路径 28 | * CSS路径是页面模板的一级元素 29 | * CSS路径抽取到的内容如果还不满足要求 30 | * 需要使用二级元素即抽取函数来做进一步控制 31 | * 32 | * @author 杨尚川 33 | * 34 | */ 35 | public class CssPath { 36 | /** 37 | * CSS路径对应的网页模板 38 | */ 39 | private HtmlTemplate pageTemplate; 40 | /** 41 | * CSS路径 42 | */ 43 | private String cssPath; 44 | /** 45 | * 提取属性,如果不指定属性,则提取文本 46 | */ 47 | private String attr; 48 | /** 49 | * CSS路径对应的抽取函数列表 50 | */ 51 | private List extractFunctions = new ArrayList<>(); 52 | /** 53 | * CSS路径提取出的文本存储到哪个字段 54 | */ 55 | private String fieldName; 56 | /** 57 | * CSS路径提取出的字段的中文含义,仅仅起注释作用,利于理解 58 | */ 59 | private String fieldDescription; 60 | 61 | public HtmlTemplate getPageTemplate() { 62 | return pageTemplate; 63 | } 64 | 65 | public void setPageTemplate(HtmlTemplate pageTemplate) { 66 | this.pageTemplate = pageTemplate; 67 | } 68 | 69 | public String getCssPath() { 70 | return cssPath; 71 | } 72 | 73 | public void setCssPath(String cssPath) { 74 | this.cssPath = cssPath; 75 | } 76 | 77 | public String getAttr() { 78 | return attr; 79 | } 80 | 81 | public void setAttr(String attr) { 82 | this.attr = attr; 83 | } 84 | 85 | public List getExtractFunctions() { 86 | return extractFunctions; 87 | } 88 | 89 | public void setExtractFunctions(List extractFunctions) { 90 | this.extractFunctions = extractFunctions; 91 | for (ExtractFunction extractFunction : this.extractFunctions) { 92 | extractFunction.setCssPath(this); 93 | } 94 | } 95 | 96 | public boolean hasExtractFunction() { 97 | return !extractFunctions.isEmpty(); 98 | } 99 | 100 | public void addExtractFunction(ExtractFunction extractFunction) { 101 | extractFunctions.add(extractFunction); 102 | extractFunction.setCssPath(this); 103 | } 104 | 105 | public String getFieldName() { 106 | return fieldName; 107 | } 108 | 109 | public void setFieldName(String fieldName) { 110 | this.fieldName = 
fieldName; 111 | } 112 | 113 | public String getFieldDescription() { 114 | return fieldDescription; 115 | } 116 | 117 | public void setFieldDescription(String fieldDescription) { 118 | this.fieldDescription = fieldDescription; 119 | } 120 | 121 | @Override 122 | public String toString() { 123 | StringBuilder str = new StringBuilder(); 124 | str.append("CSS路径:").append(this.cssPath).append("\n"); 125 | str.append("字段名:").append(this.fieldName).append("\n"); 126 | str.append("字段含义:").append(this.fieldDescription).append("\n"); 127 | for (ExtractFunction ef : this.extractFunctions) { 128 | str.append("\t").append("抽取函数:").append(ef.getExtractExpression()).append("\n"); 129 | str.append("\t").append("字段名:").append(ef.getFieldName()).append("\n"); 130 | str.append("\t").append("字段含义:").append(ef.getFieldDescription()).append("\n"); 131 | } 132 | return str.toString(); 133 | } 134 | } 135 | -------------------------------------------------------------------------------- /html-extractor-web/src/main/java/org/apdplat/extractor/html/server/model/CssPath.java: -------------------------------------------------------------------------------- 1 | /** 2 | * 3 | * APDPlat - Application Product Development Platform 4 | * Copyright (c) 2013, 杨尚川, yang-shangchuan@qq.com 5 | * 6 | * This program is free software: you can redistribute it and/or modify 7 | * it under the terms of the GNU General Public License as published by 8 | * the Free Software Foundation, either version 3 of the License, or 9 | * (at your option) any later version. 10 | * 11 | * This program is distributed in the hope that it will be useful, 12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 | * GNU General Public License for more details. 15 | * 16 | * You should have received a copy of the GNU General Public License 17 | * along with this program. If not, see . 
18 | * 19 | */ 20 | 21 | package org.apdplat.extractor.html.server.model; 22 | 23 | import java.util.ArrayList; 24 | import java.util.List; 25 | import org.codehaus.jackson.annotate.JsonIgnore; 26 | 27 | /** 28 | * CSS路径 29 | * CSS路径是页面模板的一级元素 30 | * CSS路径抽取到的内容如果还不满足要求 31 | * 需要使用二级元素即抽取函数来做进一步控制 32 | * 33 | * @author 杨尚川 34 | * 35 | */ 36 | public class CssPath { 37 | /** 38 | * CSS路径对应的网页模板 39 | */ 40 | @JsonIgnore 41 | private HtmlTemplate pageTemplate; 42 | /** 43 | * CSS路径 44 | */ 45 | private String cssPath; 46 | /** 47 | * 提取属性,如果不指定属性,则提取文本 48 | */ 49 | private String attr; 50 | /** 51 | * CSS路径对应的抽取函数列表 52 | */ 53 | private List extractFunctions = new ArrayList<>(); 54 | /** 55 | * CSS路径提取出的文本存储到哪个字段 56 | */ 57 | private String fieldName; 58 | /** 59 | * CSS路径提取出的字段的中文含义,仅仅起注释作用,利于理解 60 | */ 61 | private String fieldDescription; 62 | 63 | public HtmlTemplate getPageTemplate() { 64 | return pageTemplate; 65 | } 66 | 67 | public void setPageTemplate(HtmlTemplate pageTemplate) { 68 | this.pageTemplate = pageTemplate; 69 | } 70 | 71 | public String getCssPath() { 72 | return cssPath; 73 | } 74 | 75 | public void setCssPath(String cssPath) { 76 | this.cssPath = cssPath; 77 | } 78 | 79 | public String getAttr() { 80 | return attr; 81 | } 82 | 83 | public void setAttr(String attr) { 84 | this.attr = attr; 85 | } 86 | 87 | public List getExtractFunctions() { 88 | return extractFunctions; 89 | } 90 | 91 | public void setExtractFunctions(List extractFunctions) { 92 | this.extractFunctions = extractFunctions; 93 | for (ExtractFunction extractFunction : this.extractFunctions) { 94 | extractFunction.setCssPath(this); 95 | } 96 | } 97 | 98 | public boolean hasExtractFunction() { 99 | return !extractFunctions.isEmpty(); 100 | } 101 | 102 | public void addExtractFunction(ExtractFunction extractFunction) { 103 | extractFunctions.add(extractFunction); 104 | extractFunction.setCssPath(this); 105 | } 106 | 107 | public String getFieldName() { 108 | return fieldName; 109 | } 
110 | 111 | public void setFieldName(String fieldName) { 112 | this.fieldName = fieldName; 113 | } 114 | 115 | public String getFieldDescription() { 116 | return fieldDescription; 117 | } 118 | 119 | public void setFieldDescription(String fieldDescription) { 120 | this.fieldDescription = fieldDescription; 121 | } 122 | 123 | @Override 124 | public String toString() { 125 | StringBuilder str = new StringBuilder(); 126 | str.append("CSS路径:").append(this.cssPath).append("\n"); 127 | str.append("字段名:").append(this.fieldName).append("\n"); 128 | str.append("字段含义:").append(this.fieldDescription).append("\n"); 129 | for (ExtractFunction ef : this.extractFunctions) { 130 | str.append("\t").append("抽取函数:").append(ef.getExtractExpression()).append("\n"); 131 | str.append("\t").append("字段名:").append(ef.getFieldName()).append("\n"); 132 | str.append("\t").append("字段含义:").append(ef.getFieldDescription()).append("\n"); 133 | } 134 | return str.toString(); 135 | } 136 | } 137 | -------------------------------------------------------------------------------- /html-extractor-web/src/main/java/org/apdplat/extractor/html/server/service/JsonGenerator.java: -------------------------------------------------------------------------------- 1 | /** 2 | * 3 | * APDPlat - Application Product Development Platform 4 | * Copyright (c) 2013, 杨尚川, yang-shangchuan@qq.com 5 | * 6 | * This program is free software: you can redistribute it and/or modify 7 | * it under the terms of the GNU General Public License as published by 8 | * the Free Software Foundation, either version 3 of the License, or 9 | * (at your option) any later version. 10 | * 11 | * This program is distributed in the hope that it will be useful, 12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 | * GNU General Public License for more details. 
15 | * 16 | * You should have received a copy of the GNU General Public License 17 | * along with this program. If not, see . 18 | * 19 | */ 20 | 21 | package org.apdplat.extractor.html.server.service; 22 | 23 | import java.io.IOException; 24 | import java.util.ArrayList; 25 | import java.util.List; 26 | import org.apdplat.extractor.html.server.model.CssPath; 27 | import org.apdplat.extractor.html.server.model.ExtractFunction; 28 | import org.apdplat.extractor.html.server.model.HtmlTemplate; 29 | import org.apdplat.extractor.html.server.model.UrlPattern; 30 | import org.codehaus.jackson.map.ObjectMapper; 31 | import org.slf4j.LoggerFactory; 32 | 33 | /** 34 | * JSON生成器 35 | * @author 杨尚川 36 | */ 37 | public class JsonGenerator { 38 | private static final org.slf4j.Logger LOGGER = LoggerFactory.getLogger(JsonGenerator.class); 39 | private static final ObjectMapper MAPPER = new ObjectMapper(); 40 | private JsonGenerator(){} 41 | 42 | public static String generateExtractRegular(List urlPatterns){ 43 | try { 44 | String value = MAPPER.writeValueAsString(urlPatterns); 45 | return value; 46 | } catch (IOException ex) { 47 | LOGGER.error("将抽取规则转换为JSON出错", ex); 48 | } 49 | return "[]"; 50 | } 51 | public static void main(String[] args) { 52 | List urlPatterns = new ArrayList<>(); 53 | 54 | UrlPattern urlPattern = new UrlPattern(); 55 | urlPattern.setUrlPattern("http://money.163.com/\\d{2}/\\d{4}/\\d{2}/[0-9A-Z]{16}.html"); 56 | 57 | urlPatterns.add(urlPattern); 58 | 59 | HtmlTemplate htmlTemplate = new HtmlTemplate(); 60 | htmlTemplate.setTemplateName("网易财经频道"); 61 | htmlTemplate.setTableName("finance"); 62 | 63 | urlPattern.addHtmlTemplate(htmlTemplate); 64 | 65 | CssPath cssPath = new CssPath(); 66 | cssPath.setCssPath("h1#h1title"); 67 | cssPath.setFieldName("title"); 68 | cssPath.setFieldDescription("标题"); 69 | 70 | htmlTemplate.addCssPath(cssPath); 71 | 72 | cssPath = new CssPath(); 73 | cssPath.setCssPath("div#endText"); 74 | cssPath.setFieldName("content"); 75 | 
cssPath.setFieldDescription("正文"); 76 | 77 | htmlTemplate.addCssPath(cssPath); 78 | 79 | urlPattern = new UrlPattern(); 80 | urlPattern.setUrlPattern("http://finance.qq.com/a/\\d{8}/\\d{6}.htm"); 81 | 82 | urlPatterns.add(urlPattern); 83 | 84 | htmlTemplate = new HtmlTemplate(); 85 | htmlTemplate.setTemplateName("腾讯财经频道"); 86 | htmlTemplate.setTableName("finance"); 87 | 88 | urlPattern.addHtmlTemplate(htmlTemplate); 89 | 90 | cssPath = new CssPath(); 91 | cssPath.setCssPath("div#C-Main-Article-QQ div.hd h1"); 92 | cssPath.setFieldName("title"); 93 | cssPath.setFieldDescription("标题"); 94 | 95 | htmlTemplate.addCssPath(cssPath); 96 | 97 | cssPath = new CssPath(); 98 | cssPath.setCssPath("div#Cnt-Main-Article-QQ"); 99 | cssPath.setFieldName("content"); 100 | cssPath.setFieldDescription("正文"); 101 | 102 | htmlTemplate.addCssPath(cssPath); 103 | 104 | ExtractFunction extractFunction = new ExtractFunction(); 105 | extractFunction.setFieldName("content"); 106 | extractFunction.setFieldDescription("正文"); 107 | extractFunction.setExtractExpression("deleteChild(“div.ep-source”)"); 108 | 109 | cssPath.addExtractFunction(extractFunction); 110 | 111 | System.out.println(generateExtractRegular(urlPatterns)); 112 | 113 | } 114 | } 115 | -------------------------------------------------------------------------------- /html-extractor/src/main/java/org/apdplat/extractor/html/model/ExtractResult.java: -------------------------------------------------------------------------------- 1 | /** 2 | * 3 | * APDPlat - Application Product Development Platform 4 | * Copyright (c) 2013, 杨尚川, yang-shangchuan@qq.com 5 | * 6 | * This program is free software: you can redistribute it and/or modify 7 | * it under the terms of the GNU General Public License as published by 8 | * the Free Software Foundation, either version 3 of the License, or 9 | * (at your option) any later version. 
10 | * 11 | * This program is distributed in the hope that it will be useful, 12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 | * GNU General Public License for more details. 15 | * 16 | * You should have received a copy of the GNU General Public License 17 | * along with this program. If not, see . 18 | * 19 | */ 20 | 21 | package org.apdplat.extractor.html.model; 22 | 23 | import java.util.ArrayList; 24 | import java.util.HashMap; 25 | import java.util.List; 26 | import java.util.Map; 27 | 28 | /** 29 | * 网页结构化信息抽取结果 30 | * 一个网页模板对应一个抽取结果 31 | * 如果一个网页有多个网页模板 32 | * 每个模板都抽取成功 33 | * 只要这些模板保存在不同的表中 34 | * URL作为主键就不会冲突 35 | * 36 | * @author 杨尚川 37 | * 38 | */ 39 | public class ExtractResult { 40 | /** 41 | * 网页对应的URL 42 | */ 43 | private String url; 44 | /** 45 | * 网页原始内容 46 | */ 47 | private byte[] content; 48 | /** 49 | * 网页编码 50 | */ 51 | private String encoding; 52 | /** 53 | * 网页关键词元数据 54 | */ 55 | private String keywords; 56 | /** 57 | * 网页描述元数据 58 | */ 59 | private String description; 60 | /** 61 | * 网页提取出的文本存储到哪个表 62 | */ 63 | private String tableName; 64 | /** 65 | * 一个网页可能有多个抽取结果项,至少要一个 66 | */ 67 | private final Map> extractResultItems = new HashMap<>(); 68 | /** 69 | * 抽取失败日志 70 | */ 71 | private final List extractFailLogs = new ArrayList<>(); 72 | 73 | public boolean isSuccess() { 74 | return extractFailLogs.isEmpty() && !extractResultItems.isEmpty(); 75 | } 76 | 77 | public String getUrl() { 78 | return url; 79 | } 80 | 81 | public void setUrl(String url) { 82 | this.url = url; 83 | } 84 | 85 | public byte[] getContent() { 86 | return content; 87 | } 88 | 89 | public void setContent(byte[] content) { 90 | this.content = content; 91 | } 92 | 93 | public String getEncoding() { 94 | return encoding; 95 | } 96 | 97 | public void setEncoding(String encoding) { 98 | this.encoding = encoding; 99 | } 100 | 101 | public String getKeywords() { 102 | return keywords; 103 
| } 104 | 105 | public void setKeywords(String keywords) { 106 | this.keywords = keywords; 107 | } 108 | 109 | public String getDescription() { 110 | return description; 111 | } 112 | 113 | public void setDescription(String description) { 114 | this.description = description; 115 | } 116 | 117 | public String getTableName() { 118 | return tableName; 119 | } 120 | 121 | public void setTableName(String tableName) { 122 | this.tableName = tableName; 123 | } 124 | 125 | public Map> getExtractResultItems() { 126 | return extractResultItems; 127 | } 128 | 129 | public void addExtractResultItem(ExtractResultItem extractResultItem) { 130 | List list = extractResultItems.get(extractResultItem.getField()); 131 | if(list == null){ 132 | list = new ArrayList<>(); 133 | extractResultItems.put(extractResultItem.getField(), list); 134 | } 135 | list.add(extractResultItem); 136 | } 137 | 138 | public List getExtractFailLogs() { 139 | return extractFailLogs; 140 | } 141 | 142 | public void addExtractFailLog(ExtractFailLog extractFailLog) { 143 | this.extractFailLogs.add(extractFailLog); 144 | extractFailLog.setExtractResult(this); 145 | } 146 | 147 | @Override 148 | public String toString() { 149 | return "ExtractResult [\nurl=" + url + ", \ntableName=" + tableName 150 | + ", \nextractResultItems=" + extractResultItems + ", \nextractFailLogs=" + extractFailLogs + "]"; 151 | } 152 | } 153 | -------------------------------------------------------------------------------- /html-extractor/src/main/resources/voa/PersonalTechnology.txt: -------------------------------------------------------------------------------- 1 | http://learningenglish.voanews.com/a/wwdc-2016-ios-10/3386601.html=New Updates to iPhone Software 2 | http://learningenglish.voanews.com/a/phone-listening/3377513.html=Your Phone Might Be Listening to You 3 | http://learningenglish.voanews.com/a/twitter-update-2016/3364301.html=Changes to Twitter Let You Say More in a Tweet 4 | 
http://learningenglish.voanews.com/a/travel-apps/3356673.html=Travel Apps Offer Direction Fun 5 | http://learningenglish.voanews.com/a/google-io-2016/3344946.html=Google Announces New Services and Products 6 | http://learningenglish.voanews.com/a/tech-tools-relax/3332930.html=Tech Tools Offer Relaxation Techniques 7 | http://learningenglish.voanews.com/a/blue-light-filters/3325898.html=Filter Blue Light for Better Sleep 8 | http://learningenglish.voanews.com/a/nanowires-battery/3313100.html=Batteries That Last Forever Could Be Near 9 | http://learningenglish.voanews.com/a/education-apps-infographic/3304443.html=Apps to Help Students Do Their Best Work 10 | http://learningenglish.voanews.com/a/facebook-hidden-messages/3293621.html=Where to Find Your Hidden Messages on Facebook 11 | http://learningenglish.voanews.com/a/apps-battery-use/3283650.html=Which Apps Drain Your Phone's Battery 12 | http://learningenglish.voanews.com/a/best-place-to-live-teleport/3269894.html=Find Your Best Place to Live 13 | http://learningenglish.voanews.com/a/reword-just-not-sorry/3261377.html=Want to Choose Better Words 14 | http://learningenglish.voanews.com/a/apple-annouces-small-iphone-return/3251714.html=Apple Announces the Return of a Smaller iPhone 15 | http://learningenglish.voanews.com/a/how-good-are-you-at-geography-games/3239633.html=How Good Are You at Geography Games 16 | http://learningenglish.voanews.com/a/facebook-reactions/3227759.html=Facebook Reactions Much More Than a Like 17 | http://learningenglish.voanews.com/a/apple-fbi-iphone/3216399.html=Apple vs FBI And Your Privacy 18 | http://learningenglish.voanews.com/a/fotor-photo-editing/3203863.html=Photo Editing App Improves Your Shots 19 | http://learningenglish.voanews.com/a/3194948.html=App Feels Earthquakes Through Mobile Phones 20 | http://learningenglish.voanews.com/a/facebook-new-features/3183507.html=Facebook Updates with New Features 21 | http://learningenglish.voanews.com/a/gmail-tools/3173145.html=Tools to Use 
with Gmail 22 | http://learningenglish.voanews.com/a/ces-2016/3163620.html=Electronics Show Displays Newest Gadgets 23 | http://learningenglish.voanews.com/a/american-concerts-available-worldwide/3152651.html=American Concerts Available Worldwide 24 | http://learningenglish.voanews.com/a/netflix-expands-to-190-countries/3144530.html=Netflix Expands to 190 Countries 25 | http://learningenglish.voanews.com/a/food-apps-infographic/3133599.html=Apps for Healthy Food Choices 26 | http://learningenglish.voanews.com/a/personal-tech-development-of-the-year/3107202.html=Personal Technology Trends of 2015 27 | http://learningenglish.voanews.com/a/tracking-santa-with-technology/3112564.html=Tracking Santa with Technology 28 | http://learningenglish.voanews.com/a/high-tech-devices-to-enjoy-holiday-music/3102745.html='High Tech' Devices to Enjoy Holiday Music 29 | http://learningenglish.voanews.com/a/facebook-quiz-privacy/3092280.html=Protect Your Privacy While Having Fun with Facebook Quizzes 30 | http://learningenglish.voanews.com/a/apple-tv-fire-tv-or-roku/3082835.html=Apple TV Fire TV or Roku 31 | http://learningenglish.voanews.com/a/tech-gifts-2015/3073944.html=15 Tech Gift Ideas for 2015 32 | http://learningenglish.voanews.com/a/self-driving-cars/3062082.html=Will Your Next Car Drive Itself 33 | http://learningenglish.voanews.com/a/apple-tv-review/3032591.html=Apple TV Review 34 | http://learningenglish.voanews.com/a/computer-ssd/3023674.html=Solid State Drive Gives New Life to Old Computer 35 | http://learningenglish.voanews.com/a/ten-tech-tools-to-teach-you-new-words/3015244.html=10 Tech Tools to Teach You New Words 36 | http://learningenglish.voanews.com/a/emoji-say-volumes-without-a-word-/3004375.html=Emojis Say Volumes Without a Word 37 | http://learningenglish.voanews.com/a/top-5-crowdfunding-projects-ever/2993888.html=Top 5 Crowdfunding Projects Ever 38 | http://learningenglish.voanews.com/a/cell-phone-use-among-friends-how-rude/2983035.html=Cell Phone Use Among 
Friends How Rude 39 | http://learningenglish.voanews.com/a/amazons-tool-for-reading-one-word-at-a-time/2974264.html=Amazon's Tool for Reading One Word at a Time 40 | http://learningenglish.voanews.com/a/longer-battery-life-among-apple-ios9-updates/2967811.html=Longer Battery Life Among Apple iOS9 Updates 41 | http://learningenglish.voanews.com/a/apple-iphone-ipad-appletv-watch/2954969.html=New iPhones Watches TVs Big iPads Offer Better Selfies -------------------------------------------------------------------------------- /html-extractor/src/main/java/org/apdplat/extractor/html/impl/ExtractFunctionExecutor.java: -------------------------------------------------------------------------------- 1 | /** 2 | * 3 | * APDPlat - Application Product Development Platform 4 | * Copyright (c) 2013, 杨尚川, yang-shangchuan@qq.com 5 | * 6 | * This program is free software: you can redistribute it and/or modify 7 | * it under the terms of the GNU General Public License as published by 8 | * the Free Software Foundation, either version 3 of the License, or 9 | * (at your option) any later version. 10 | * 11 | * This program is distributed in the hope that it will be useful, 12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 | * GNU General Public License for more details. 15 | * 16 | * You should have received a copy of the GNU General Public License 17 | * along with this program. If not, see . 
18 | * 19 | */ 20 | 21 | package org.apdplat.extractor.html.impl; 22 | 23 | import org.apache.commons.lang.StringUtils; 24 | import org.apdplat.extractor.html.model.CssPath; 25 | import org.jsoup.nodes.Document; 26 | import org.jsoup.nodes.Element; 27 | import org.jsoup.select.Elements; 28 | import org.slf4j.Logger; 29 | import org.slf4j.LoggerFactory; 30 | 31 | /** 32 | * 抽取函数执行引擎,提供的抽取函数有: 33 | * 1、deleteChild(div.ep-source) 34 | * 2、removeText(作者:) 35 | * 3、substring(0,19) 抽取函数的格式为:函数名称+(+逗号分割的参数+) 36 | * 37 | * @author 杨尚川 38 | * 39 | */ 40 | public class ExtractFunctionExecutor { 41 | public static final Logger LOGGER = LoggerFactory.getLogger(ExtractFunctionExecutor.class); 42 | 43 | /** 44 | * 执行抽取函数 45 | * 46 | * @param text CSS路径抽取出来的文本 47 | * @param doc 根文档 48 | * @param cssPath CSS路径对象 49 | * @param parseExpression 抽取函数 50 | * @return 抽取函数处理之后的文本 51 | */ 52 | public static String execute(String text, Document doc, CssPath cssPath, String parseExpression) { 53 | if (parseExpression.startsWith("deleteChild")) { 54 | return executeDeleteChild(text, doc, cssPath, parseExpression); 55 | } 56 | if (parseExpression.startsWith("removeText")) { 57 | return executeRemoveText(text, parseExpression); 58 | } 59 | if (parseExpression.startsWith("substring")) { 60 | return executeSubstring(text, parseExpression); 61 | } 62 | 63 | return null; 64 | } 65 | 66 | /** 67 | * 截取指定范围的文本 使用方法:substring(0,19) 68 | * 括号内的参数为2个,分别是字符索引下标,截取从0开始到19的字符串,索引包括0,不包括19,即[0 - 19) 69 | * 70 | * @param text CSS路径抽取出来的文本 71 | * @param parseExpression 抽取函数 72 | * @return 抽取函数处理之后的文本 73 | */ 74 | public static String executeSubstring(String text, String parseExpression) { 75 | LOGGER.debug("substring抽取函数之前:" + text); 76 | String parameter = parseExpression.replace("substring(", ""); 77 | parameter = parameter.substring(0, parameter.length() - 1); 78 | String[] attr = parameter.split(","); 79 | if (attr != null && attr.length == 2) { 80 | int beginIndex = Integer.parseInt(attr[0]); 81 | int 
endIndex = Integer.parseInt(attr[1]); 82 | text = text.substring(beginIndex, endIndex); 83 | } 84 | LOGGER.debug("substring抽取函数之后:" + text); 85 | return text; 86 | } 87 | 88 | /** 89 | * 删除指定的文本 使用方法:removeText(作者:) 括号内的参数为文本字符,从CSS路径匹配的文本中删除参数文本 90 | * 91 | * @param text CSS路径抽取出来的文本 92 | * @param parseExpression 抽取函数 93 | * @return 抽取函数处理之后的文本 94 | */ 95 | public static String executeRemoveText(String text, String parseExpression) { 96 | LOGGER.debug("removeText抽取函数之前:" + text); 97 | String parameter = parseExpression.replace("removeText(", ""); 98 | parameter = parameter.substring(0, parameter.length() - 1); 99 | text = text.replace(parameter, ""); 100 | LOGGER.debug("removeText抽取函数之后:" + text); 101 | return text; 102 | } 103 | 104 | /** 105 | * 删除子CSS路径的内容 使用方法:deleteChild(div.ep-source) 106 | * 括号内的参数为相对CSS路径的子路径,从CSS路径匹配的文本中删除子路径匹配的文本 107 | * 108 | * @param text CSS路径抽取出来的文本 109 | * @param doc 根文档 110 | * @param cssPath CSS路径对象 111 | * @param parseExpression 抽取函数 112 | * @return 抽取函数处理之后的文本 113 | */ 114 | public static String executeDeleteChild(String text, Document doc, CssPath cssPath, String parseExpression) { 115 | LOGGER.debug("deleteChild抽取函数之前:" + text); 116 | String parameter = parseExpression.replace("deleteChild(", ""); 117 | parameter = parameter.substring(0, parameter.length() - 1); 118 | Elements elements = doc.select(cssPath.getCssPath() + " " + parameter); 119 | for (Element element : elements) { 120 | String t = element.text(); 121 | if (StringUtils.isNotBlank(t)) { 122 | LOGGER.debug("deleteChild抽取函数删除:" + t); 123 | text = text.replace(t, ""); 124 | } 125 | } 126 | LOGGER.debug("deleteChild抽取函数之后:" + text); 127 | return text; 128 | } 129 | } 130 | -------------------------------------------------------------------------------- /mvnw.cmd: -------------------------------------------------------------------------------- 1 | @REM ---------------------------------------------------------------------------- 2 | @REM Licensed to the Apache Software 
@REM ----------------------------------------------------------------------------
@REM Licensed to the Apache Software Foundation (ASF) under one
@REM or more contributor license agreements.  See the NOTICE file
@REM distributed with this work for additional information
@REM regarding copyright ownership.  The ASF licenses this file
@REM to you under the Apache License, Version 2.0 (the
@REM "License"); you may not use this file except in compliance
@REM with the License.  You may obtain a copy of the License at
@REM
@REM    http://www.apache.org/licenses/LICENSE-2.0
@REM
@REM Unless required by applicable law or agreed to in writing,
@REM software distributed under the License is distributed on an
@REM "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
@REM KIND, either express or implied.  See the License for the
@REM specific language governing permissions and limitations
@REM under the License.
@REM ----------------------------------------------------------------------------

@REM ----------------------------------------------------------------------------
@REM Maven2 Start Up Batch script
@REM
@REM Required ENV vars:
@REM JAVA_HOME - location of a JDK home dir
@REM
@REM Optional ENV vars
@REM M2_HOME - location of maven2's installed home dir
@REM MAVEN_BATCH_ECHO - set to 'on' to enable the echoing of the batch commands
@REM MAVEN_BATCH_PAUSE - set to 'on' to wait for a key stroke before ending
@REM MAVEN_OPTS - parameters passed to the Java VM when running Maven
@REM     e.g. to debug Maven itself, use
@REM set MAVEN_OPTS=-Xdebug -Xrunjdwp:transport=dt_socket,server=y,suspend=y,address=8000
@REM MAVEN_SKIP_RC - flag to disable loading of mavenrc files
@REM ----------------------------------------------------------------------------

@REM Begin all REM lines with '@' in case MAVEN_BATCH_ECHO is 'on'
@echo off
@REM enable echoing by setting MAVEN_BATCH_ECHO to 'on'
@if "%MAVEN_BATCH_ECHO%" == "on"  echo %MAVEN_BATCH_ECHO%

@REM set %HOME% to equivalent of $HOME
if "%HOME%" == "" (set "HOME=%HOMEDRIVE%%HOMEPATH%")

@REM Execute a user defined script before this one
if not "%MAVEN_SKIP_RC%" == "" goto skipRcPre
@REM check for pre script, once with legacy .bat ending and once with .cmd ending
if exist "%HOME%\mavenrc_pre.bat" call "%HOME%\mavenrc_pre.bat"
if exist "%HOME%\mavenrc_pre.cmd" call "%HOME%\mavenrc_pre.cmd"
:skipRcPre

@setlocal

set ERROR_CODE=0

@REM To isolate internal variables from possible post scripts, we use another setlocal
@setlocal

@REM ==== START VALIDATION ====
if not "%JAVA_HOME%" == "" goto OkJHome

echo.
echo Error: JAVA_HOME not found in your environment. >&2
echo Please set the JAVA_HOME variable in your environment to match the >&2
echo location of your Java installation. >&2
echo.
goto error

:OkJHome
if exist "%JAVA_HOME%\bin\java.exe" goto init

echo.
echo Error: JAVA_HOME is set to an invalid directory. >&2
echo JAVA_HOME = "%JAVA_HOME%" >&2
echo Please set the JAVA_HOME variable in your environment to match the >&2
echo location of your Java installation. >&2
echo.
goto error

@REM ==== END VALIDATION ====

:init

set MAVEN_CMD_LINE_ARGS=%*

@REM Find the project base dir, i.e. the directory that contains the folder ".mvn".
@REM Fallback to current working directory if not found.

set MAVEN_PROJECTBASEDIR=%MAVEN_BASEDIR%
IF NOT "%MAVEN_PROJECTBASEDIR%"=="" goto endDetectBaseDir

@REM Walk up from the current directory until a ".mvn" folder is found or the
@REM directory stops changing (drive root reached).
set EXEC_DIR=%CD%
set WDIR=%EXEC_DIR%
:findBaseDir
IF EXIST "%WDIR%"\.mvn goto baseDirFound
cd ..
IF "%WDIR%"=="%CD%" goto baseDirNotFound
set WDIR=%CD%
goto findBaseDir

:baseDirFound
set MAVEN_PROJECTBASEDIR=%WDIR%
cd "%EXEC_DIR%"
goto endDetectBaseDir

:baseDirNotFound
set MAVEN_PROJECTBASEDIR=%EXEC_DIR%
cd "%EXEC_DIR%"

:endDetectBaseDir

IF NOT EXIST "%MAVEN_PROJECTBASEDIR%\.mvn\jvm.config" goto endReadAdditionalConfig

@REM Append every line of .mvn\jvm.config to JVM_CONFIG_MAVEN_PROPS.
@setlocal EnableExtensions EnableDelayedExpansion
for /F "usebackq delims=" %%a in ("%MAVEN_PROJECTBASEDIR%\.mvn\jvm.config") do set JVM_CONFIG_MAVEN_PROPS=!JVM_CONFIG_MAVEN_PROPS! %%a
@endlocal & set JVM_CONFIG_MAVEN_PROPS=%JVM_CONFIG_MAVEN_PROPS%

:endReadAdditionalConfig

SET MAVEN_JAVA_EXE="%JAVA_HOME%\bin\java.exe"

@REM Resolve the wrapper jar against the detected project base dir rather than
@REM the current directory: the working directory is restored to EXEC_DIR above,
@REM which may be a subdirectory that has no .mvn folder of its own.
set WRAPPER_JAR="%MAVEN_PROJECTBASEDIR%\.mvn\wrapper\maven-wrapper.jar"
set WRAPPER_LAUNCHER=org.apache.maven.wrapper.MavenWrapperMain

%MAVEN_JAVA_EXE% %JVM_CONFIG_MAVEN_PROPS% %MAVEN_OPTS% %MAVEN_DEBUG_OPTS% -classpath %WRAPPER_JAR% "-Dmaven.multiModuleProjectDirectory=%MAVEN_PROJECTBASEDIR%" %WRAPPER_LAUNCHER% %MAVEN_CMD_LINE_ARGS%
if ERRORLEVEL 1 goto error
goto end

:error
set ERROR_CODE=1

:end
@REM Carry ERROR_CODE out of the inner setlocal scope.
@endlocal & set ERROR_CODE=%ERROR_CODE%

if not "%MAVEN_SKIP_RC%" == "" goto skipRcPost
@REM check for post script, once with legacy .bat ending and once with .cmd ending
if exist "%HOME%\mavenrc_post.bat" call "%HOME%\mavenrc_post.bat"
if exist "%HOME%\mavenrc_post.cmd" call "%HOME%\mavenrc_post.cmd"
:skipRcPost

@REM pause the script if MAVEN_BATCH_PAUSE is set to 'on'
if "%MAVEN_BATCH_PAUSE%" == "on" pause

if "%MAVEN_TERMINATE_CMD%" == "on" exit %ERROR_CODE%

exit /B %ERROR_CODE%
http://learningenglish.voanews.com/a/blood-test-shows-when-a-person-has-a-concussion/3308652.html=Blood Test Shows Concussion Injury 14 | http://learningenglish.voanews.com/a/plants-help-women-live-longer/3311928.html=Study Plants May Help Women Live Longer 15 | http://learningenglish.voanews.com/a/four-healthy-tasty-spices/3305474.html=Can You Name Four of the Healthiest Tastiest Spices 16 | http://learningenglish.voanews.com/a/suicide-rates-increase-in-us/3304646.html=Suicide Rates Increase in US 17 | http://learningenglish.voanews.com/a/one-minute-excercise/3306957.html=One Minute of Exercise May Be All You Need 18 | http://learningenglish.voanews.com/a/contagious-virues-sickens-million/3306881.html=Highly Contagious Virus Sickens Millions Costs Billions 19 | http://learningenglish.voanews.com/a/why-its-harder-to-sleep-new-place/3301774.html=Why It's Harder to Sleep in a New Place 20 | http://learningenglish.voanews.com/a/who-drowning-among-top-causes-of-deaths-in-children/3294288.html=WHO Drowning Among Top Causes of Death in Children 21 | http://learningenglish.voanews.com/a/3258485.html=Mentally Ill Indonesians 'Living in Hell' 22 | http://learningenglish.voanews.com/a/3283010.html=US Health Officials Zika 'Scarier Than We Thought' 23 | http://learningenglish.voanews.com/a/study-says-there-are-more-obese-people-in-the-world-than-underweight-people/3271436.html=Study More Obese People Than Underweight 24 | http://learningenglish.voanews.com/a/want-to-lose-weight-listen-to-yourself-eat/3244250.html=Want to Lose Weight Listen to Yourself Eat 25 | http://learningenglish.voanews.com/a/asian-kids-face-rising-hunger-and-obesity-rates/3258511.html=Asian Children Face Hunger and Obesity 26 | http://learningenglish.voanews.com/a/3242332.html=Mummies Contain Clues to Colon Cancer 27 | http://learningenglish.voanews.com/a/is-email-stressing-you-out/3209644.html=Is Email Stressing You Out 28 | http://learningenglish.voanews.com/a/3236261.html=Social Media Is Keeping Young 
Adults Awake 29 | http://learningenglish.voanews.com/a/3236302.html=Happy National Napping Day 30 | http://learningenglish.voanews.com/a/3223698.html=Ever Feel Like You'll Die of A Broken Heart 31 | http://learningenglish.voanews.com/a/blood-cancer-therapy-is-last-chance-for-some/3208279.html=Blood Cancer Therapy Is Last Chance for Some 32 | http://learningenglish.voanews.com/a/helicopter-parenting/3194736.html=Helping Children Too Much Is Hurting Them 33 | http://learningenglish.voanews.com/a/us-14-new-reports-of-sexual-transmission-of-zika-virus/3206754.html=US 14 New Reports of Sexual Transmission of Zika Virus 34 | http://learningenglish.voanews.com/a/spotlight-on-zika-helps-other-neglected-tropical-diseases/3196431.html=Spotlight on Zika Helps Neglected Tropical Diseases 35 | http://learningenglish.voanews.com/a/ebola-zika-clues/3193225.html=Ebola Crisis Might Hold Clues for Fighting Zika Virus 36 | http://learningenglish.voanews.com/a/willpower-do-you-think-you-can/3159018.html=Do You Think You Can 37 | http://learningenglish.voanews.com/a/power-of-touch/3179086.html=The Power of Touch 38 | http://learningenglish.voanews.com/a/zika-virus-olympic-concerns/3187238.html=Olympic Athletes Concerned About Zika Virus 39 | http://learningenglish.voanews.com/a/researchers-find-link-between-zika-virus-and-birth-defects/3187097.html=Researchers Link Between Zika Virus and Birth Defects 40 | http://learningenglish.voanews.com/a/white-house-seeks-1point-8-billion-to-combat-zika/3182160.html=Obama Seeks $1.8 Billion to Combat Zika 41 | http://learningenglish.voanews.com/a/zika-virus-detected-in-body-fluids/3178780.html=Zika Virus Detected in Body Fluids 42 | http://learningenglish.voanews.com/a/person-infected-with-zika-virus-through-sexual-relations/3175064.html=Person Infected With Zika Virus Through Sexual Contact 43 | http://learningenglish.voanews.com/a/who-declares-zika-virus-a-global-health-emergency/3172096.html=WHO Declares Zika A Global Health Emergency 44 | 
http://learningenglish.voanews.com/a/copper-kills-viruses-on-contact/3147962.html=Copper Metal Kills Viruses on Contact 45 | http://learningenglish.voanews.com/a/urgent-call-to-stop-the-spread-of-zika-virus/3165458.html=Urgent Call to Stop Zika Virus -------------------------------------------------------------------------------- /html-extractor/src/main/resources/voa/ScienceintheNews.txt: -------------------------------------------------------------------------------- 1 | http://learningenglish.voanews.com/a/bones-of-hobbit-ancester-found-in-indonesia/3377930.html='Hobbit' Ancestor Bones Found 2 | http://learningenglish.voanews.com/a/australian-spider-worlds-fastest/3387065.html=Australian Spider May Be World's Fastest 3 | http://learningenglish.voanews.com/a/scientists-say-gene-editing-should-stay-in-laboratories/3390454.html=Scientists Gene Editing Should Stay in Laboratories 4 | http://learningenglish.voanews.com/a/how-3d-printing-can-create-tissue-from-stem-cells/3374074.html=How 3-D Printing Can Create Tissue from Stem Cells 5 | http://learningenglish.voanews.com/a/was-mars-less-red-long-ago/3355973.html=Was Mars -- The Red Planet -- Once White 6 | http://learningenglish.voanews.com/a/what-it-takes-to-be-an-astronaut/3367765.html=What It Takes to Be an Astronaut 7 | http://learningenglish.voanews.com/a/largest-sponge-found-off-hawaii-coast/3360778.html=Minivan-sized Sponge Found off Hawaii Coast 8 | http://learningenglish.voanews.com/a/knife-cuts-on-mastodon-bone-reveal-earlier-humans/3347233.html=Knife Cuts on Mastodon Bone Reveal Earlier Humans 9 | http://learningenglish.voanews.com/a/artificial-intelligence-helpful-and-dangerous/3334422.html=Artificial Intelligence Helpful and Dangerous 10 | http://learningenglish.voanews.com/a/solar-cooker-helps-reduce-smoke-pollution-deaths/3313364.html=Solar Cooker Reduces Smoke Pollution Deaths 11 | http://learningenglish.voanews.com/a/nasa-training-for-commercial-flights-to-space/3318170.html=NASA Astronauts Train for 
Commercial Space Flights 12 | http://learningenglish.voanews.com/a/nasa-discovers-nearly-1300-planets/3326050.html=NASA Discovers Nearly 1300 New Planets 13 | http://learningenglish.voanews.com/a/amazing-amazon-hides-coral-reef/3314107.html=Amazing Amazon Hides Atlantic's Coral Reef 14 | http://learningenglish.voanews.com/a/dinosaurs-already-decreasing-before-asteroid-hit/3298965.html=Dinosaurs Already Decreasing Before Asteroid Hit 15 | http://learningenglish.voanews.com/a/some-birds-as-smarts-as-apes/3298714.html=Some Birds as Smart as Apes 16 | http://learningenglish.voanews.com/a/monkeys-raft-to-panama/3298794.html=Scientists Monkeys Used Raft to Cross Ocean 17 | http://learningenglish.voanews.com/a/polio-virus-used-to-kill-brain-tumors/3196863.html=Polio Virus Used to Kill Brain Tumors 18 | http://learningenglish.voanews.com/a/researchers-look-for-ways-to-heal-brains/3109209.html=Researchers Repair Brain Injuries With New Cells 19 | http://learningenglish.voanews.com/a/hawking-zuckerberg-help-launch-search-for-life-in-the-universe/3287925.html=Hawking Zuckerberg to Search for Life in the Universe 20 | http://learningenglish.voanews.com/a/marine-biologist-fight-for-coral-reefs-makes-science-fun/3232146.html=Scientist Fights for Coral Reefs Makes Science Fun 21 | http://learningenglish.voanews.com/a/scientists-could-be-closer-to-aids-cure/3265201.html=Scientists Could be Closer to AIDS Cure 22 | http://learningenglish.voanews.com/a/will-the-plain-of-jars-mystery-be-solved/3250131.html=Scientists Closer to Solving Plain of Jars Mystery 23 | http://learningenglish.voanews.com/a/making-world-better-tech/3247753.html=Four Young People Who Make the World Better 24 | http://learningenglish.voanews.com/a/climate-change-sea-levels-and-arctic-temperatures-rise/3203794.html=Sea Levels and Arctic Temperatures Rise 25 | http://learningenglish.voanews.com/a/astronomers-eclipse-window-seat/3229536.html=Astronomers Get Window Seat for Eclipse 26 | 
http://learningenglish.voanews.com/a/smart-bandages-could-heal-wounds-quickly/3188911.html=Smart Bandages to Heal Wounds More Quickly 27 | http://learningenglish.voanews.com/a/oceans-could-hold-more-plastics-than-fish-2050/3166848.html=Oceans To Hold More Plastic Than Fish by 2050 28 | http://learningenglish.voanews.com/a/scott-kelly-about-a-year-in-space/3208316.html=Astronaut Scott Kelly Talks about His Year in Space 29 | http://learningenglish.voanews.com/a/volcanic-clays-kill-bacteria/3183724.html=Some Volcanic Clays Kill Bacteria 30 | http://learningenglish.voanews.com/a/eistein-is-proven-right/3188629.html=Gravitational Waves Detected Einstein Is Right 31 | http://learningenglish.voanews.com/a/nasa-asteroid/3183344.html=NASA Big Asteroid Could Pass Near Earth Next Month 32 | http://learningenglish.voanews.com/a/rarely-seen-bush-dogs-pictures-panama/3161723.html=Cameras Capture Pictures of Mysterious Bush Dog 33 | http://learningenglish.voanews.com/a/ninth-planet-may-be-in-solar-system/3156472.html=Is There a Ninth Planet in Our Solar System 34 | http://learningenglish.voanews.com/a/device-shows-human-body-as-never-seen-before/3066410.html=New Device Shows Human Body As Never Seen Before 35 | http://learningenglish.voanews.com/a/how-do-dogs-drink-water/3116091.html=Think You Know How Dogs Drink Water 36 | http://learningenglish.voanews.com/a/habitat-loss-endangers-migrating-birds/3109229.html=Migrating Birds Can't Find Their Way Home 37 | http://learningenglish.voanews.com/a/plant-your-mobile-charger-in-the-dirt/3057157.html=Plant Your Mobile Charger in the Dirt 38 | http://learningenglish.voanews.com/a/look-back-at-science-in-2015/3115846.html=A Look Back at 2015 The Year in Science and Technology 39 | http://learningenglish.voanews.com/a/spacex-rocket-launches-and-returns/3115778.html=SpaceX Celebrates Historic Rocket Launch Landing 40 | http://learningenglish.voanews.com/a/apollo-11-spacecraft-soon-to-be-in-3d/3109269.html=Apollo 11 Spacecraft Lands in Your 
Smartphone 41 | http://learningenglish.voanews.com/a/could-a-jolt-to-the-brain-stop-motion-sickness/3060134.html=Could an Electric Current Stop Motion Sickness 42 | http://learningenglish.voanews.com/a/researchers-use-lasers-to-chill-water/3072071.html=Lasers Used to Chill Instead of Heat 43 | http://learningenglish.voanews.com/a/nasa-paints-pluto-wild-with-color/3065467.html=NASA Paints Pluto Wild With Color 44 | http://learningenglish.voanews.com/a/whisky-tested-as-alternative-fuel-for-vehicles/3065592.html=Scottish Whisky Tested as Alternative to Fossil Fuels 45 | http://learningenglish.voanews.com/a/melting-glaciers-and-climate-talks/3056901.html=Melting Glaciers on Climate Talk List in Paris -------------------------------------------------------------------------------- /html-extractor/src/main/resources/voa/ThisIsAmerica.txt: -------------------------------------------------------------------------------- 1 | http://learningenglish.voanews.com/a/young-traveler-hopes-to-visit-every-national-park-in-america-/3378000.html=Young Traveler Hopes to Visit Every National Park 2 | http://learningenglish.voanews.com/a/should-insensitive-place-names-be-changed/3370802.html=Should Offensive Place Names Be Changed 3 | http://learningenglish.voanews.com/a/the-new-sanctuary-movement-seeks-to-help-immigrants-to-us/3361063.html=Sanctuary Movement Helps Immigrants at Risk 4 | http://learningenglish.voanews.com/a/hallowed-ground-arlington-national-cemetery/2780835.html=Memorial Day Arlington National Cemetery 5 | http://learningenglish.voanews.com/a/us-golf-courses-try-new-ways-to-get-more-people-to-play/3347618.html=How Can Golf Appeal to More People 6 | http://learningenglish.voanews.com/a/rockford-peaches-baseball-girls-a-league-of-their-own/3344028.html=All-Girls Baseball Team in Seattle Makes History 7 | http://learningenglish.voanews.com/a/death-judge-scalia-slows-top-american-court/3337624.html=Death of Judge Scalia Slows Top American Court 8 | 
http://learningenglish.voanews.com/a/american-schools-remain-divided-by-race/3336002.html=American Schools Remain Divided by Race 9 | http://learningenglish.voanews.com/a/real-mermaids-not-really-but/3313412.html=Real Mermaids Not Really But 10 | http://learningenglish.voanews.com/a/ex-prisoners-ask-for-2nd-chance-and-job/3318669.html=Ex-Prisoners Ask for Second Chance and a Job 11 | http://learningenglish.voanews.com/a/top-democratic-and-republican-senators-disagree-on-next-steps-for-supreme-court-nomination/3306320.html=Democrats Republicans Disagree on Supreme Court Nomination 12 | http://learningenglish.voanews.com/a/what-bathroom-should-transgender-people-use/3308562.html=What Bathroom Should Transgender People Use 13 | http://learningenglish.voanews.com/a/high-demand-from-foreign-investors-for-special-us-visas/3298194.html=High Demand from Investors for Special US Visas 14 | http://learningenglish.voanews.com/a/us-immigration-case/3292826.html=Will US Let Some Undocumented Immigrants Stay 15 | http://learningenglish.voanews.com/a/pope-message-on-family-in-line-with-many-us-catholics/3287739.html=Pope's Message on Family in Line with US Catholics 16 | http://learningenglish.voanews.com/a/twelve-places-often-mispronounced/3282903.html=Twelve Mispronounced US Place Names 17 | http://learningenglish.voanews.com/a/supreme-court-helps-define-one-person-one-vote/3276158.html=Supreme Court Helps Define One Person One Vote 18 | http://learningenglish.voanews.com/a/large-cars-new-york-auto-show/3265049.html=Large Cars on Top At New York Auto Show 19 | http://learningenglish.voanews.com/a/are-16-17-mature-enough-to-vote/3247882.html=Are Teenagers Mature Enough to Vote 20 | http://learningenglish.voanews.com/a/some-foreign-leaders-taking-sides-against-trump/3255022.html=Foreign Leaders Taking Sides Against Donald Trump 21 | http://learningenglish.voanews.com/a/americans-take-80-world-opioid-supply/3244249.html=Americans Take 80 of World's Opioid Supply 22 | 
http://learningenglish.voanews.com/a/record-number-americans-giving-up-us-citizenship/3219840.html=Americans Who Gave Up Passports Tell Why 23 | http://learningenglish.voanews.com/a/trappist-monastery-in-virginia-countryside-adapts-to-the-21st-century/3203507.html=Silent Monks Learn to Speak for Revenue 24 | http://learningenglish.voanews.com/a/is-there-more-lying-in-this-election/3208230.html=Is There More Lying In This Election 25 | http://learningenglish.voanews.com/a/a-rages-to-riches-story/3209830.html=From Child of Freed Slaves to Millionaire 26 | http://learningenglish.voanews.com/a/smartest-americans/3186794.html=Where Are the Best Educated Americans 27 | http://learningenglish.voanews.com/a/presidents-day-2016-presidential-firsts-/3186922.html=Presidential Facts and 'Firsts' 28 | http://learningenglish.voanews.com/a/americas-destinations-on-the-rise/3185649.html=America's Destinations 'On the Rise' 29 | http://learningenglish.voanews.com/a/us-gives-less-foreign-aid-than-americans-think/3172204.html=US Gives Less Foreign Aid than Americans Think 30 | http://learningenglish.voanews.com/a/more-cases-of-brain-disease-from-football-blows/3175188.html=More Cases of Brain Disease from Football Blows 31 | http://learningenglish.voanews.com/a/white-house-protester-dies-after-longest-vigil/3166054.html=White House Protester Dies After Longest Vigil 32 | http://learningenglish.voanews.com/a/3158747.html=Water Crisis in US City a 'Disaster Not Just an Emergency' 33 | http://learningenglish.voanews.com/a/bulgarians-biking-from-alaska-to-argentina/3143338.html=Bulgarians Biking From Alaska to Argentina 34 | http://learningenglish.voanews.com/a/sean-penn-guzman-el-chapo-propaganda/3148549.html=Sean Penn Journalist or Propagandist 35 | http://learningenglish.voanews.com/a/voa-study-questions/3133320.html=Americans Jobs More Important Than Terrorism 36 | http://learningenglish.voanews.com/a/new-years-music/1813304.html=Memories and Hopes Meet in New Year's Music 37 | 
http://learningenglish.voanews.com/a/what-does-that-mean-what-did-she-say-expressions-american-presidential-candidates-use-in-debates/3106823.html=Political Idioms What Did He Say 38 | http://learningenglish.voanews.com/a/floridas-key-west-close-to-perfect-far-from-normal/3102410.html=Florida's Key West Close to Perfect Far From Normal 39 | http://learningenglish.voanews.com/a/hold-the-butter-modern-diets-meet-holiday-traditions/3117341.html=Hold The Butter Modern Diets Meet Holiday Traditions 40 | http://learningenglish.voanews.com/a/route-66-end-of-the-trail-california/3109251.html=Route 66 California The End of the Trail 41 | http://learningenglish.voanews.com/a/whay-are-americans-so-angry/3099245.html=Why Are Americans So Angry 42 | http://learningenglish.voanews.com/a/arizona-spirit-route-66-voa/3099375.html=Arizona The Spirit of Route 66 43 | http://learningenglish.voanews.com/a/shootings-leads-questions-police-tactics/3085017.html=Shootings Lead to Questions About Police Tactics 44 | http://learningenglish.voanews.com/a/route-66-voa-santa-fe-city-different/3088211.html=Santa Fe The City Different 45 | http://learningenglish.voanews.com/a/four-famous-foods-on-route-66/3073335.html=Route 66 Serves Up Pizza Burgers 'Horseshoes' and More -------------------------------------------------------------------------------- /html-extractor-web/src/main/webapp/api/all_extract_regular.jsp: -------------------------------------------------------------------------------- 1 | <%@ page language="java" contentType="text/html; charset=UTF-8" pageEncoding="UTF-8"%> 2 | [ 3 | { 4 | "urlPattern": "http://money.163.com/\\d{2}/\\d{4}/\\d{2}/[0-9A-Z]{16}.html", 5 | "regexPattern": "/\\d{2}/\\d{4}/\\d{2}/[0-9A-Z]{16}.html", 6 | "pageTemplates": [ 7 | { 8 | "templateName": "网易财经频道1", 9 | "tableName": "finance", 10 | "cssPaths": [ 11 | { 12 | "fieldName": "title", 13 | "cssPath": "h1", 14 | "fieldDescription": "标题", 15 | "extractFunctions": [] 16 | }, 17 | { 18 | "fieldName": 
"content", 19 | "cssPath": "div#endText", 20 | "fieldDescription": "正文", 21 | "extractFunctions": [] 22 | } 23 | ] 24 | }, 25 | { 26 | "templateName": "网易财经频道2", 27 | "tableName": "finance", 28 | "cssPaths": [ 29 | { 30 | "fieldName": "title", 31 | "cssPath": "h1", 32 | "fieldDescription": "标题", 33 | "extractFunctions": [] 34 | }, 35 | { 36 | "fieldName": "publishTime", 37 | "cssPath": "html body div#js-epContent.ep-content div.ep-content-bg div#epContentLeft.ep-content-main div.ep-main-bg div.clearfix div.ep-info div.left", 38 | "fieldDescription": "发表时间", 39 | "extractFunctions": [ 40 | { 41 | "fieldName": "publishTime", 42 | "fieldDescription": "发表时间", 43 | "extractExpression": "substring(0,19)" 44 | } 45 | ] 46 | }, 47 | { 48 | "fieldName": "content", 49 | "cssPath": "div#endText", 50 | "fieldDescription": "正文", 51 | "extractFunctions": [] 52 | } 53 | ] 54 | }, 55 | { 56 | "templateName": "网易财经栏目", 57 | "tableName": "finance", 58 | "cssPaths": [ 59 | { 60 | "fieldName": "title", 61 | "cssPath": "html body div#money_wrap.money_wrap div.common_wrap div.area div.w_main div.col_l h1", 62 | "fieldDescription": "标题", 63 | "extractFunctions": [] 64 | }, 65 | { 66 | "fieldName": "content", 67 | "cssPath": "html body div#money_wrap.money_wrap div.common_wrap div.area div.w_main div.col_l div.w_text", 68 | "fieldDescription": "正文", 69 | "extractFunctions": [] 70 | }, 71 | { 72 | "fieldName": "author", 73 | "cssPath": "html body div#money_wrap.money_wrap div.common_wrap div.area div.w_main div.col_l div.author span.name", 74 | "fieldDescription": "作者", 75 | "extractFunctions": [ 76 | { 77 | "fieldName": "author", 78 | "fieldDescription": "作者", 79 | "extractExpression": "removeText(作者:)" 80 | } 81 | ] 82 | }, 83 | { 84 | "fieldName": "introduction", 85 | "cssPath": "html body div#money_wrap.money_wrap div.common_wrap div.area div.w_main div.col_l div.introduction p", 86 | "fieldDescription": "导语", 87 | "extractFunctions": [] 88 | }, 89 | { 90 | "fieldName": "followers", 91 
| "cssPath": "html body div#money_wrap.money_wrap div.common_wrap div.area div.w_main div.col_l div.words_bbs div#tieArea.tie-area div#tiePostBox.tie-post div.tie-titlebar span.tie-info a.js-bactCount", 92 | "fieldDescription": "跟贴人数", 93 | "extractFunctions": [] 94 | }, 95 | { 96 | "fieldName": "tieTotalCount", 97 | "cssPath": "html body div#money_wrap.money_wrap div.common_wrap div.area div.w_main div.col_l div.author a.discuss span.tieTotalCount tieTotalCount", 98 | "fieldDescription": "参与讨论人数", 99 | "extractFunctions": [] 100 | } 101 | ] 102 | } 103 | ] 104 | }, 105 | { 106 | "urlPattern": "http://finance.qq.com/a/\\d{8}/\\d{6}.htm", 107 | "regexPattern": "/a/\\d{8}/\\d{6}.htm", 108 | "pageTemplates": [ 109 | { 110 | "templateName": "腾讯财经频道", 111 | "tableName": "finance", 112 | "cssPaths": [ 113 | { 114 | "fieldName": "title", 115 | "cssPath": "div#C-Main-Article-QQ div.hd h1", 116 | "fieldDescription": "标题", 117 | "extractFunctions": [] 118 | }, 119 | { 120 | "fieldName": "content", 121 | "cssPath": "div#Cnt-Main-Article-QQ", 122 | "fieldDescription": "正文", 123 | "extractFunctions": [ 124 | { 125 | "fieldName": "content", 126 | "fieldDescription": "正文", 127 | "extractExpression": "deleteChild(div.ep-source)" 128 | } 129 | ] 130 | } 131 | ] 132 | } 133 | ] 134 | } 135 | ] -------------------------------------------------------------------------------- /html-extractor/src/main/resources/voa/NewsWords.txt: -------------------------------------------------------------------------------- 1 | http://learningenglish.voanews.com/a/3317431.html=Resilient 2 | http://learningenglish.voanews.com/a/3317432.html=Restoration 3 | http://learningenglish.voanews.com/a/3317434.html=Kickoff 4 | http://learningenglish.voanews.com/a/3317428.html=Revolutionary 5 | http://learningenglish.voanews.com/a/3317427.html=Cronyism 6 | http://learningenglish.voanews.com/a/3317430.html=Atrocities 7 | http://learningenglish.voanews.com/a/3317429.html=Rhetoric 8 | 
http://learningenglish.voanews.com/a/3210134.html=Transplant 9 | http://learningenglish.voanews.com/a/3210131.html=Birthright Citizenship 10 | http://learningenglish.voanews.com/a/3210133.html=Holy Grail 11 | http://learningenglish.voanews.com/a/3210129.html=Doping 12 | http://learningenglish.voanews.com/a/3210128.html=Emissions 13 | http://learningenglish.voanews.com/a/3210130.html=Tears of Joy 14 | http://learningenglish.voanews.com/a/3210123.html=Exonerated 15 | http://learningenglish.voanews.com/a/3210122.html=Drought 16 | http://learningenglish.voanews.com/a/3210127.html=Transgender 17 | http://learningenglish.voanews.com/a/3210124.html=Migrants 18 | http://learningenglish.voanews.com/a/2705670.html=Global 19 | http://learningenglish.voanews.com/a/2705669.html=Crude 20 | http://learningenglish.voanews.com/a/2705671.html=Recycling 21 | http://learningenglish.voanews.com/a/2705668.html=Viable 22 | http://learningenglish.voanews.com/a/2705673.html=Incarcerated 23 | http://learningenglish.voanews.com/a/2667269.html=Confidential 24 | http://learningenglish.voanews.com/a/2667266.html=Chaos 25 | http://learningenglish.voanews.com/a/2667265.html=Scope 26 | http://learningenglish.voanews.com/a/2601328.html=Combat 27 | http://learningenglish.voanews.com/a/2601326.html=Glaucoma 28 | http://learningenglish.voanews.com/a/2670607.html=Infrastructure 29 | http://learningenglish.voanews.com/a/2670605.html=Gridlock 30 | http://learningenglish.voanews.com/a/2601327.html=Museum 31 | http://learningenglish.voanews.com/a/2601324.html=Turmoil 32 | http://learningenglish.voanews.com/a/2601325.html=Secure 33 | http://learningenglish.voanews.com/a/2553596.html=Unleashed 34 | http://learningenglish.voanews.com/a/2553581.html=Crippling 35 | http://learningenglish.voanews.com/a/2553587.html=Strategy 36 | http://learningenglish.voanews.com/a/2553567.html=Obligation 37 | http://learningenglish.voanews.com/a/2553576.html=Agenda 38 | http://learningenglish.voanews.com/a/2533606.html=Delicate 
39 | http://learningenglish.voanews.com/a/2533607.html=Hawkish 40 | http://learningenglish.voanews.com/a/2533608.html=Looting 41 | http://learningenglish.voanews.com/a/2461984.html=Referendum 42 | http://learningenglish.voanews.com/a/2461990.html=Apprehended 43 | http://learningenglish.voanews.com/a/2461986.html=Ringleader 44 | http://learningenglish.voanews.com/a/2461985.html=Sovereign 45 | http://learningenglish.voanews.com/a/2533611.html=Objective 46 | http://learningenglish.voanews.com/a/2461987.html=Assets 47 | http://learningenglish.voanews.com/a/2670609.html=Significant 48 | http://learningenglish.voanews.com/a/2670610.html=Facilitate 49 | http://learningenglish.voanews.com/a/2511974.html=National Guard 50 | http://learningenglish.voanews.com/a/2533610.html=Symptoms 51 | http://learningenglish.voanews.com/a/2511969.html=Undocumented Workers 52 | http://learningenglish.voanews.com/a/2511971.html=Defiant 53 | http://learningenglish.voanews.com/a/2511972.html=Expedite 54 | http://learningenglish.voanews.com/a/2494865.html=Civil War 55 | http://learningenglish.voanews.com/a/2494868.html=Reconnaisance 56 | http://learningenglish.voanews.com/a/2494867.html=Confine 57 | http://learningenglish.voanews.com/a/2494866.html=Potential 58 | http://learningenglish.voanews.com/a/2494864.html=Analyst 59 | http://learningenglish.voanews.com/a/2479301.html=Rank 60 | http://learningenglish.voanews.com/a/2479300.html=Emergence 61 | http://learningenglish.voanews.com/a/2479297.html=Runoff 62 | http://learningenglish.voanews.com/a/2479298.html=Sporadic 63 | http://learningenglish.voanews.com/a/2479296.html=Fraud 64 | http://learningenglish.voanews.com/a/2438621.html=Solution 65 | http://learningenglish.voanews.com/a/2438625.html=Recall 66 | http://learningenglish.voanews.com/a/2511968.html=Malaria 67 | http://learningenglish.voanews.com/a/2438620.html=Irreversible 68 | http://learningenglish.voanews.com/a/2438618.html=Consensus 69 | 
http://learningenglish.voanews.com/a/2438617.html=Revision 70 | http://learningenglish.voanews.com/a/2438614.html=Stabilize 71 | http://learningenglish.voanews.com/a/2438612.html=Convoy 72 | http://learningenglish.voanews.com/a/2438615.html=Maritime 73 | http://learningenglish.voanews.com/a/2438613.html=Critical 74 | http://learningenglish.voanews.com/a/2438657.html=Biofuel 75 | http://learningenglish.voanews.com/a/1903494.html=Sectarian 76 | http://learningenglish.voanews.com/a/1903491.html=Regime 77 | http://learningenglish.voanews.com/a/1903493.html=Startup 78 | http://learningenglish.voanews.com/a/1903490.html=Mainstream 79 | http://learningenglish.voanews.com/a/1903492.html=Legislation 80 | http://learningenglish.voanews.com/a/1899356.html=Chemical Weapons 81 | http://learningenglish.voanews.com/a/1899359.html=Accord 82 | http://learningenglish.voanews.com/a/1899357.html=Obamacare 83 | http://learningenglish.voanews.com/a/1899355.html=Pragmatic 84 | http://learningenglish.voanews.com/a/1899354.html=Turmoil 85 | http://learningenglish.voanews.com/a/1893953.html=Casualty 86 | http://learningenglish.voanews.com/a/1893949.html=Unilateral 87 | http://learningenglish.voanews.com/a/1893955.html=Credibility 88 | http://learningenglish.voanews.com/a/1893947.html=Humanitarian 89 | http://learningenglish.voanews.com/a/1893946.html=Investigation 90 | http://learningenglish.voanews.com/a/1890651.html=Insurgency 91 | http://learningenglish.voanews.com/a/1890630.html=Fiscal 92 | http://learningenglish.voanews.com/a/1890653.html=Verification 93 | http://learningenglish.voanews.com/a/1890628.html=Partisan 94 | http://learningenglish.voanews.com/a/1894815.html=Sanctions 95 | http://learningenglish.voanews.com/a/1884224.html=Paparazzi 96 | http://learningenglish.voanews.com/a/1884223.html=Dissident 97 | http://learningenglish.voanews.com/a/1884221.html=NATO 98 | http://learningenglish.voanews.com/a/1884222.html=Censorship 99 | 
http://learningenglish.voanews.com/a/1884220.html=Consumer 100 | http://learningenglish.voanews.com/a/1879109.html=Furloughed 101 | http://learningenglish.voanews.com/a/1878855.html=Exchange Rate 102 | http://learningenglish.voanews.com/a/1878853.html=Candidate 103 | http://learningenglish.voanews.com/a/1878949.html=Drone 104 | http://learningenglish.voanews.com/a/1878887.html=Bipartisanship 105 | http://learningenglish.voanews.com/a/1874349.html=Recession 106 | http://learningenglish.voanews.com/a/1874348.html=Mortgage 107 | http://learningenglish.voanews.com/a/1874354.html=Supreme Court 108 | http://learningenglish.voanews.com/a/1874352.html=Bilateral 109 | http://learningenglish.voanews.com/a/1874346.html=Summit 110 | http://learningenglish.voanews.com/a/1868773.html=Espionage 111 | http://learningenglish.voanews.com/a/1868824.html=Investment 112 | http://learningenglish.voanews.com/a/1868826.html=House of Representatives 113 | http://learningenglish.voanews.com/a/1868816.html=Indictment 114 | http://learningenglish.voanews.com/a/1868817.html=Encouraging 115 | http://learningenglish.voanews.com/a/1864638.html=Dialogue 116 | http://learningenglish.voanews.com/a/1864614.html=Bankruptcy 117 | http://learningenglish.voanews.com/a/1864598.html=Senate 118 | http://learningenglish.voanews.com/a/1864166.html=Presidential 119 | http://learningenglish.voanews.com/a/1864123.html=Surveillance 120 | http://learningenglish.voanews.com/a/1864102.html=Stock Market 121 | http://learningenglish.voanews.com/a/1859189.html=Embargo -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ## HtmlExtractor是一个Java实现的基于模板的网页结构化信息精准抽取组件,本身并不包含爬虫功能,但可被爬虫或其他程序调用以便更精准地对网页结构化信息进行抽取。 2 | 3 | ## HtmlExtractor是为大规模分布式环境设计的,采用主从架构,主节点负责维护抽取规则,从节点向主节点请求抽取规则,当抽取规则发生变化,主节点主动通知从节点,从而能实现抽取规则变化之后的实时动态生效。 4 | 5 | ## [捐赠致谢](https://github.com/ysc/QuestionAnsweringSystem/wiki/donation) 
6 | 7 | ## 如何使用? 8 | 9 | 使用SeleniumHtmlFetcher需要安装驱动: 10 | FirefoxDriver: brew install geckodriver 11 | ChromeDriver: brew install chromedriver 12 | 13 | HtmlExtractor由2个子项目构成,html-extractor和html-extractor-web。 14 | html-extractor实现了数据抽取逻辑,是从节点,html-extractor-web提供web界面来维护抽取规则,是主节点。 15 | html-extractor是一个jar包,可通过maven引用: 16 | <dependency> 17 | <groupId>org.apdplat</groupId> 18 | <artifactId>html-extractor</artifactId> 19 | <version>1.1</version> 20 | </dependency> 21 | html-extractor-web是一个war包,需要部署到Servlet/Jsp容器上。 22 | 在html-extractor-web目录下运行mvn jetty:run就可以启动Servlet/Jsp容器jetty,之后打开浏览器访问: 23 | http://localhost:8080/html-extractor-web/api/ 查看自己定义的规则。 24 | 25 | 注意:页面模板中定义的所有CSS路径和抽取表达式全部抽取成功,才算抽取成功, 26 | 只要有一个CSS路径或抽取表达式失败,就是抽取失败。 27 | 28 | [如何使用HtmlExtractor实现基于模板的网页结构化信息精准抽取?](http://my.oschina.net/apdplat/blog/402149) 29 | 30 | ## 单机集中式使用方法: 31 | 32 | //1、构造抽取规则 33 | List<UrlPattern> urlPatterns = new ArrayList<>(); 34 | //1.1、构造URL模式 35 | UrlPattern urlPattern = new UrlPattern(); 36 | urlPattern.setUrlPattern("http://money.163.com/\\d{2}/\\d{4}/\\d{2}/[0-9A-Z]{16}.html"); 37 | //1.2、构造HTML模板 38 | HtmlTemplate htmlTemplate = new HtmlTemplate(); 39 | htmlTemplate.setTemplateName("网易财经频道"); 40 | htmlTemplate.setTableName("finance"); 41 | //1.3、将URL模式和HTML模板建立关联 42 | urlPattern.addHtmlTemplate(htmlTemplate); 43 | //1.4、构造CSS路径 44 | CssPath cssPath = new CssPath(); 45 | cssPath.setCssPath("h1"); 46 | cssPath.setFieldName("title"); 47 | cssPath.setFieldDescription("标题"); 48 | //1.5、将CSS路径和模板建立关联 49 | htmlTemplate.addCssPath(cssPath); 50 | //1.6、构造CSS路径 51 | cssPath = new CssPath(); 52 | cssPath.setCssPath("div#endText"); 53 | cssPath.setFieldName("content"); 54 | cssPath.setFieldDescription("正文"); 55 | //1.7、将CSS路径和模板建立关联 56 | htmlTemplate.addCssPath(cssPath); 57 | //可像上面那样构造多个URL模式 58 | urlPatterns.add(urlPattern); 59 | 60 | //2、获取抽取规则对象 61 | ExtractRegular extractRegular = ExtractRegular.getInstance(urlPatterns); 62 | //注意:可通过如下3个方法动态地改变抽取规则 63 | //extractRegular.addUrlPatterns(urlPatterns); 64 | //extractRegular.addUrlPattern(urlPattern); 65 |
//extractRegular.removeUrlPattern(urlPattern.getUrlPattern()); 66 | 67 | //3、获取HTML抽取工具 68 | HtmlExtractor htmlExtractor = new DefaultHtmlExtractor(extractRegular); 69 | 70 | //4、抽取网页 71 | String url = "http://money.163.com/08/1219/16/4THR2TMP002533QK.html"; 72 | HtmlFetcher htmlFetcher = new JSoupHtmlFetcher(); 73 | String html = htmlFetcher.fetch(url); 74 | List<ExtractResult> extractResults = htmlExtractor.extract(url, html); 75 | 76 | //5、输出结果 77 | int i = 1; 78 | for (ExtractResult extractResult : extractResults) { 79 | System.out.println((i++) + "、网页 " + extractResult.getUrl() + " 的抽取结果"); 80 | if(!extractResult.isSuccess()){ 81 | System.out.println("抽取失败:"); 82 | for(ExtractFailLog extractFailLog : extractResult.getExtractFailLogs()){ 83 | System.out.println("\turl:"+extractFailLog.getUrl()); 84 | System.out.println("\turlPattern:"+extractFailLog.getUrlPattern()); 85 | System.out.println("\ttemplateName:"+extractFailLog.getTemplateName()); 86 | System.out.println("\tfieldName:"+extractFailLog.getFieldName()); 87 | System.out.println("\tfieldDescription:"+extractFailLog.getFieldDescription()); 88 | System.out.println("\tcssPath:"+extractFailLog.getCssPath()); 89 | if(extractFailLog.getExtractExpression()!=null) { 90 | System.out.println("\textractExpression:" + extractFailLog.getExtractExpression()); 91 | } 92 | } 93 | continue; 94 | } 95 | Map<String, List<ExtractResultItem>> extractResultItems = extractResult.getExtractResultItems(); 96 | for(String field : extractResultItems.keySet()){ 97 | List<ExtractResultItem> values = extractResultItems.get(field); 98 | if(values.size() > 1){ 99 | int j=1; 100 | System.out.println("\t多值字段:"+field); 101 | for(ExtractResultItem item : values){ 102 | System.out.println("\t\t"+(j++)+"、"+field+" = "+item.getValue()); 103 | } 104 | }else{ 105 | System.out.println("\t"+field+" = "+values.get(0).getValue()); 106 | } 107 | } 108 | System.out.println("\tdescription = "+extractResult.getDescription()); 109 | System.out.println("\tkeywords = "+extractResult.getKeywords()); 110 | } 111 | 112 | 
## 多机分布式使用方法: 113 | 114 | 1、运行主节点,负责维护抽取规则: 115 | 方法一:在html-extractor-web目录下运行mvn jetty:run 。 116 | 方法二:在html-extractor-web目录下运行mvn install , 117 | 然后将target/html-extractor-web-1.1.war部署到Tomcat。 118 | 119 | 2、获取一个HtmlExtractor的实例(从节点),示例代码如下: 120 | String allExtractRegularUrl = "http://localhost:8080/html-extractor-web/api/all_extract_regular.jsp"; 121 | String redisHost = "localhost"; 122 | int redisPort = 6379; 123 | ExtractRegular extractRegular = ExtractRegular.getInstance(allExtractRegularUrl, redisHost, redisPort); 124 | HtmlExtractor htmlExtractor = new DefaultHtmlExtractor(extractRegular); 125 | 126 | 3、抽取信息,示例代码如下: 127 | String url = "http://money.163.com/08/1219/16/4THR2TMP002533QK.html"; 128 | HtmlFetcher htmlFetcher = new JSoupHtmlFetcher(); 129 | String html = htmlFetcher.fetch(url); 130 | List<ExtractResult> extractResults = htmlExtractor.extract(url, html); 131 | 132 | int i = 1; 133 | for (ExtractResult extractResult : extractResults) { 134 | System.out.println((i++) + "、网页 " + extractResult.getUrl() + " 的抽取结果"); 135 | if(!extractResult.isSuccess()){ 136 | System.out.println("抽取失败:"); 137 | for(ExtractFailLog extractFailLog : extractResult.getExtractFailLogs()){ 138 | System.out.println("\turl:"+extractFailLog.getUrl()); 139 | System.out.println("\turlPattern:"+extractFailLog.getUrlPattern()); 140 | System.out.println("\ttemplateName:"+extractFailLog.getTemplateName()); 141 | System.out.println("\tfieldName:"+extractFailLog.getFieldName()); 142 | System.out.println("\tfieldDescription:"+extractFailLog.getFieldDescription()); 143 | System.out.println("\tcssPath:"+extractFailLog.getCssPath()); 144 | if(extractFailLog.getExtractExpression()!=null) { 145 | System.out.println("\textractExpression:" + extractFailLog.getExtractExpression()); 146 | } 147 | } 148 | continue; 149 | } 150 | for(ExtractResultItem extractResultItem : extractResult.getExtractResultItems()){ 151 | System.out.print("\t"+extractResultItem.getField()+" = "+extractResultItem.getValue()); 152 | } 
153 | System.out.println("\tdescription = "+extractResult.getDescription()); 154 | System.out.println("\tkeywords = "+extractResult.getKeywords()); 155 | } 156 | 157 | [https://travis-ci.org/ysc/HtmlExtractor](https://travis-ci.org/ysc/HtmlExtractor) 158 | -------------------------------------------------------------------------------- /html-extractor/src/main/resources/voa/EverydayGrammar.txt: -------------------------------------------------------------------------------- 1 | http://learningenglish.voanews.com/a/everyday-grammar-relative-pronouns/2793920.html=Relative Pronouns 2 | http://learningenglish.voanews.com/a/everyday-grammar-may-might-must-modals-certainty/2887387.html=May Might Must - Modals of Certainty 3 | http://learningenglish.voanews.com/a/modals-permission-everyday-grammar/3355585.html=Modals for Asking Permission 4 | http://learningenglish.voanews.com/a/you-really-should-learn-modals-everyday-gramamr/3355517.html=You Really Should Learn Modals 5 | http://learningenglish.voanews.com/a/grammar-demonstrative-pronouns-determiners/3347315.html=Demonstrating How to Use Demonstratives 6 | http://learningenglish.voanews.com/a/are-you-how-you-talk/3337552.html=Are You How You Talk 7 | http://learningenglish.voanews.com/a/who-makes-grammar-rules/3325780.html=Who Makes Grammar Rules 8 | http://learningenglish.voanews.com/a/everyday-grammar-commonly-confused-words-part-three-homophones/3317204.html=Commonly Confused Words Part 3 Homophones 9 | http://learningenglish.voanews.com/a/everyday-grammar-commonly-confused-words-part-two/3307049.html=Commonly Confused Words Part Two 10 | http://learningenglish.voanews.com/a/everyday-grammar-commonly-confused-words-week-one/3294436.html=Commonly Confused Words Part One 11 | http://learningenglish.voanews.com/a/they-say-reported-speech-is-easy/3280282.html=They Say That Reported Speech Is Easy 12 | http://learningenglish.voanews.com/a/identify-with-relative-pronouns/3261879.html=Identify With Relative Pronouns 13 | 
http://learningenglish.voanews.com/a/changing-prepositions-with-provide/3259606.html=Changing Prepositions With Provide 14 | http://learningenglish.voanews.com/a/using-the-passive-voice/3247545.html=Using the Passive Voice 15 | http://learningenglish.voanews.com/a/everyday-grammar-double-negatives/2743416.html=The Story of the Double Negative 16 | http://learningenglish.voanews.com/a/using-right-article-everyday-grammar/2819461.html=Using the Right Article 17 | http://learningenglish.voanews.com/a/everyday-grammar-making-wishes/3218288.html=Do You Wish You Knew Better Grammar 18 | http://learningenglish.voanews.com/a/how-much-do-you-know-about-quantifiers/3206680.html=How Much Do You Know about Quantifiers 19 | http://learningenglish.voanews.com/a/everyday-grammar-understanding-noncount-nouns/3193621.html=Understanding Noncount Nouns 20 | http://learningenglish.voanews.com/a/past-unreal-conditionals/3181755.html=Past Unreal Conditionals 21 | http://learningenglish.voanews.com/a/if-you-learn-conditionals-be-glad/3173342.html=If You Learn Conditionals You ll Be Glad You Did 22 | http://learningenglish.voanews.com/a/improve-writing-contrast-concession/3163659.html=Improve Your Writing with Contrast and Concession 23 | http://learningenglish.voanews.com/a/perfect-progressive-tenses-everyday-grammar/3141901.html=The Perfect Progressive Tenses 24 | http://learningenglish.voanews.com/a/everyday-grammar-have-you-perfected-the-perfect-tenses/3137265.html=Have You Perfected the Perfect Tenses 25 | http://learningenglish.voanews.com/a/everyday-grammar-are-you-progressing-with-the-progressive--tenses/3131962.html=Are You Progressing with Progressive Tenses 26 | http://learningenglish.voanews.com/a/introduction-to-verb-tenses-everyday-grammar/3123576.html=An Introduction to Verb Tenses 27 | http://learningenglish.voanews.com/a/getting-to-know-gerunds-and-infinitives/3111996.html=Getting to Know Gerunds and Infinitives 28 | 
http://learningenglish.voanews.com/a/everyday-grammar-should-vs-shall/3107315.html=The Should vs Shall Debate 29 | http://learningenglish.voanews.com/a/for-or-since-what-is-the-difference/3097366.html=For or Since What Is the Difference 30 | http://learningenglish.voanews.com/a/phrasal-verbs-to-help-you-with-technology/3085650.html=30 Phrasal Verbs to Help You With Technology 31 | http://learningenglish.voanews.com/a/learn-prepositions-in-on-at/3073690.html=Are You In On or At Prepositions that Tell of Time and Place 32 | http://learningenglish.voanews.com/a/six-difference-between-britsh-and-american-english/3063743.html=Six Differences Between British and American English 33 | http://learningenglish.voanews.com/a/everyday-grammar-three-grammar-rules-that-are-dying/3053579.html=Everyday Grammar 3 Grammar Rules That Are Dying 34 | http://learningenglish.voanews.com/a/top-10-separable-phrasal-verbs/3041841.html=Everyday Grammar Our Top 10 Separable Phrasal Verbs 35 | http://learningenglish.voanews.com/a/everyday-grammar-using-transitions-for-smoother-writing/3029586.html=Everyday Grammar Make Your Writing Smoother with Transitions 36 | http://learningenglish.voanews.com/a/everyday-grammar-pow-whizz-what-are-onomatopoeia/3018658.html=Pow Whizz What Are Onomatopoeia 37 | http://learningenglish.voanews.com/a/everyday-grammar-introducing-phrasal-verbs/3010251.html=Everyday Grammar Introducing Phrasal Verbs 38 | http://learningenglish.voanews.com/a/everyday-grammar-when-nouns-act-like-adjectives/2998821.html=Everyday Grammar When Nouns Act Like Adjectives 39 | http://learningenglish.voanews.com/a/everyday-grammar-comparatives-superlatives/2989386.html=Everyday Grammar Comparatives and Superlatives 40 | http://learningenglish.voanews.com/a/everyday-grammar-do-does-you-understand-subject-verb-agreement/2977592.html=Everyday Grammar Do/Does You Understand Subject-Verb Agreement 41 | http://learningenglish.voanews.com/a/unusual-plurals-everyday-grammar/2968871.html=Everyday 
Grammar Unusual Plurals 42 | http://learningenglish.voanews.com/a/tag-questions-are-easy-arent-they-everyday-grammar/2956417.html=Everyday Grammar Tag Questions Are Easy Aren't They 43 | http://learningenglish.voanews.com/a/everyday-grammar-relative-adverbs/2944081.html=Everyday Grammar Three Reasons to Learn Relative Adverbs 44 | http://learningenglish.voanews.com/a/everyday-grammar-fun-with-future-tenses/2935173.html=Fun with Future Tenses 45 | http://learningenglish.voanews.com/a/everyday-grammar-we-suggest-that-you-learn-the-subjunctive/2925403.html=Everyday Grammar We Suggest That You Learn the Subjunctive 46 | http://learningenglish.voanews.com/a/everyday-grammar-the-sounds-of-grammar-betty-azar/2916335.html=Everyday Grammar The Sounds of Grammar with Betty Azar 47 | http://learningenglish.voanews.com/a/are-causatives-making-you-crazy-everyday-grammar/2903050.html=Are Causatives Making You Crazy 48 | http://learningenglish.voanews.com/a/everyday-grammar-modals-permission-can-may/2877141.html=Can I Could I May I 49 | http://learningenglish.voanews.com/a/everyday-grammar-you-had-better-learn-modals/2865365.html=Everyday Grammar You Had Better Learn Modals 50 | http://learningenglish.voanews.com/a/everyday-grammar-reported-speech/2856671.html=Everyday Grammar Mastering Reported Speech 51 | http://learningenglish.voanews.com/a/beating-problems-with-adverbs-everyday-grammar/2843494.html=Everyday Grammar Beating Problems with Adverbs 52 | http://learningenglish.voanews.com/a/everyday-grammar-words-come-and-go-in-english/2832644.html=Words Come and Go in English 53 | http://learningenglish.voanews.com/a/everyday-grammar-when-passive-is-better-than-active/2825976.html=Everyday Grammar When Passive Is Better than Active 54 | http://learningenglish.voanews.com/a/everyday-grammar-past-unreal-conditional-mixed-conditional/2809016.html=Everyday Grammar Advanced Conditionals 55 | 
http://learningenglish.voanews.com/a/everyday-grammar-introducing-conditionals/2778457.html=Everyday Grammar Introducing Conditionals 56 | http://learningenglish.voanews.com/a/problems-with-pronouns-and-gender/2770727.html=Problems with Pronouns and Gender 57 | http://learningenglish.voanews.com/a/everyday-grammar-simple-past-and-present-perfect/2752310.html=Simple Past and Present Perfect 58 | http://learningenglish.voanews.com/a/prepositions-time-place-everyday-grammar-in-on-at/2732061.html=Everyday Grammar In On and At 59 | http://learningenglish.voanews.com/a/everyday-grammar-gerunds-infinitives/2722827.html=Everyday Grammar Gerunds and Infinitives 60 | http://learningenglish.voanews.com/a/everyday-grammar-prepositions-provide/2701412.html=Everyday Grammar Put Prepositions in Their Place 61 | http://learningenglish.voanews.com/a/everyday-grammar-subject-object-pronouns/2674867.html=Can You Correct Her and I -------------------------------------------------------------------------------- /mvnw: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | # ---------------------------------------------------------------------------- 3 | # Licensed to the Apache Software Foundation (ASF) under one 4 | # or more contributor license agreements. See the NOTICE file 5 | # distributed with this work for additional information 6 | # regarding copyright ownership. The ASF licenses this file 7 | # to you under the Apache License, Version 2.0 (the 8 | # "License"); you may not use this file except in compliance 9 | # with the License. You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, 14 | # software distributed under the License is distributed on an 15 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 16 | # KIND, either express or implied. 
See the License for the 17 | # specific language governing permissions and limitations 18 | # under the License. 19 | # ---------------------------------------------------------------------------- 20 | 21 | # ---------------------------------------------------------------------------- 22 | # Maven2 Start Up Batch script 23 | # 24 | # Required ENV vars: 25 | # ------------------ 26 | # JAVA_HOME - location of a JDK home dir 27 | # 28 | # Optional ENV vars 29 | # ----------------- 30 | # M2_HOME - location of maven2's installed home dir 31 | # MAVEN_OPTS - parameters passed to the Java VM when running Maven 32 | # e.g. to debug Maven itself, use 33 | # set MAVEN_OPTS=-Xdebug -Xrunjdwp:transport=dt_socket,server=y,suspend=y,address=8000 34 | # MAVEN_SKIP_RC - flag to disable loading of mavenrc files 35 | # ---------------------------------------------------------------------------- 36 | 37 | if [ -z "$MAVEN_SKIP_RC" ] ; then 38 | 39 | if [ -f /etc/mavenrc ] ; then 40 | . /etc/mavenrc 41 | fi 42 | 43 | if [ -f "$HOME/.mavenrc" ] ; then 44 | . "$HOME/.mavenrc" 45 | fi 46 | 47 | fi 48 | 49 | # OS specific support. $var _must_ be set to either true or false. 50 | cygwin=false; 51 | darwin=false; 52 | mingw=false 53 | case "`uname`" in 54 | CYGWIN*) cygwin=true ;; 55 | MINGW*) mingw=true;; 56 | Darwin*) darwin=true 57 | # 58 | # Look for the Apple JDKs first to preserve the existing behaviour, and then look 59 | # for the new JDKs provided by Oracle. 
60 | # 61 | if [ -z "$JAVA_HOME" ] && [ -L /System/Library/Frameworks/JavaVM.framework/Versions/CurrentJDK ] ; then 62 | # 63 | # Apple JDKs 64 | # 65 | export JAVA_HOME=/System/Library/Frameworks/JavaVM.framework/Versions/CurrentJDK/Home 66 | fi 67 | 68 | if [ -z "$JAVA_HOME" ] && [ -L /System/Library/Java/JavaVirtualMachines/CurrentJDK ] ; then 69 | # 70 | # Apple JDKs 71 | # 72 | export JAVA_HOME=/System/Library/Java/JavaVirtualMachines/CurrentJDK/Contents/Home 73 | fi 74 | 75 | if [ -z "$JAVA_HOME" ] && [ -L "/Library/Java/JavaVirtualMachines/CurrentJDK" ] ; then 76 | # 77 | # Oracle JDKs 78 | # 79 | export JAVA_HOME=/Library/Java/JavaVirtualMachines/CurrentJDK/Contents/Home 80 | fi 81 | 82 | if [ -z "$JAVA_HOME" ] && [ -x "/usr/libexec/java_home" ]; then 83 | # 84 | # Apple JDKs 85 | # 86 | export JAVA_HOME=`/usr/libexec/java_home` 87 | fi 88 | ;; 89 | esac 90 | 91 | if [ -z "$JAVA_HOME" ] ; then 92 | if [ -r /etc/gentoo-release ] ; then 93 | JAVA_HOME=`java-config --jre-home` 94 | fi 95 | fi 96 | 97 | if [ -z "$M2_HOME" ] ; then 98 | ## resolve links - $0 may be a link to maven's home 99 | PRG="$0" 100 | 101 | # need this for relative symlinks 102 | while [ -h "$PRG" ] ; do 103 | ls=`ls -ld "$PRG"` 104 | link=`expr "$ls" : '.*-> \(.*\)$'` 105 | if expr "$link" : '/.*' > /dev/null; then 106 | PRG="$link" 107 | else 108 | PRG="`dirname "$PRG"`/$link" 109 | fi 110 | done 111 | 112 | saveddir=`pwd` 113 | 114 | M2_HOME=`dirname "$PRG"`/.. 
115 | 116 | # make it fully qualified 117 | M2_HOME=`cd "$M2_HOME" && pwd` 118 | 119 | cd "$saveddir" 120 | # echo Using m2 at $M2_HOME 121 | fi 122 | 123 | # For Cygwin, ensure paths are in UNIX format before anything is touched 124 | if $cygwin ; then 125 | [ -n "$M2_HOME" ] && 126 | M2_HOME=`cygpath --unix "$M2_HOME"` 127 | [ -n "$JAVA_HOME" ] && 128 | JAVA_HOME=`cygpath --unix "$JAVA_HOME"` 129 | [ -n "$CLASSPATH" ] && 130 | CLASSPATH=`cygpath --path --unix "$CLASSPATH"` 131 | fi 132 | 133 | # For Mingw, ensure paths are in UNIX format before anything is touched 134 | if $mingw ; then 135 | [ -n "$M2_HOME" ] && 136 | M2_HOME="`(cd "$M2_HOME"; pwd)`" 137 | [ -n "$JAVA_HOME" ] && 138 | JAVA_HOME="`(cd "$JAVA_HOME"; pwd)`" 139 | # TODO classpath? 140 | fi 141 | 142 | if [ -z "$JAVA_HOME" ]; then 143 | javaExecutable="`which javac`" 144 | if [ -n "$javaExecutable" ] && ! [ "`expr \"$javaExecutable\" : '\([^ ]*\)'`" = "no" ]; then 145 | # readlink(1) is not available as standard on Solaris 10. 146 | readLink=`which readlink` 147 | if [ ! `expr "$readLink" : '\([^ ]*\)'` = "no" ]; then 148 | if $darwin ; then 149 | javaHome="`dirname \"$javaExecutable\"`" 150 | javaExecutable="`cd \"$javaHome\" && pwd -P`/javac" 151 | else 152 | javaExecutable="`readlink -f \"$javaExecutable\"`" 153 | fi 154 | javaHome="`dirname \"$javaExecutable\"`" 155 | javaHome=`expr "$javaHome" : '\(.*\)/bin'` 156 | JAVA_HOME="$javaHome" 157 | export JAVA_HOME 158 | fi 159 | fi 160 | fi 161 | 162 | if [ -z "$JAVACMD" ] ; then 163 | if [ -n "$JAVA_HOME" ] ; then 164 | if [ -x "$JAVA_HOME/jre/sh/java" ] ; then 165 | # IBM's JDK on AIX uses strange locations for the executables 166 | JAVACMD="$JAVA_HOME/jre/sh/java" 167 | else 168 | JAVACMD="$JAVA_HOME/bin/java" 169 | fi 170 | else 171 | JAVACMD="`which java`" 172 | fi 173 | fi 174 | 175 | if [ ! -x "$JAVACMD" ] ; then 176 | echo "Error: JAVA_HOME is not defined correctly." 
>&2 177 | echo " We cannot execute $JAVACMD" >&2 178 | exit 1 179 | fi 180 | 181 | if [ -z "$JAVA_HOME" ] ; then 182 | echo "Warning: JAVA_HOME environment variable is not set." 183 | fi 184 | 185 | CLASSWORLDS_LAUNCHER=org.codehaus.plexus.classworlds.launcher.Launcher 186 | 187 | # For Cygwin, switch paths to Windows format before running java 188 | if $cygwin; then 189 | [ -n "$M2_HOME" ] && 190 | M2_HOME=`cygpath --path --windows "$M2_HOME"` 191 | [ -n "$JAVA_HOME" ] && 192 | JAVA_HOME=`cygpath --path --windows "$JAVA_HOME"` 193 | [ -n "$CLASSPATH" ] && 194 | CLASSPATH=`cygpath --path --windows "$CLASSPATH"` 195 | fi 196 | 197 | # traverses directory structure from process work directory to filesystem root 198 | # first directory with .mvn subdirectory is considered project base directory 199 | find_maven_basedir() { 200 | local basedir=$(pwd) 201 | local wdir=$(pwd) 202 | while [ "$wdir" != '/' ] ; do 203 | if [ -d "$wdir"/.mvn ] ; then 204 | basedir=$wdir 205 | break 206 | fi 207 | wdir=$(cd "$wdir/.."; pwd) 208 | done 209 | echo "${basedir}" 210 | } 211 | 212 | # concatenates all lines of a file 213 | concat_lines() { 214 | if [ -f "$1" ]; then 215 | echo "$(tr -s '\n' ' ' < "$1")" 216 | fi 217 | } 218 | 219 | export MAVEN_PROJECTBASEDIR=${MAVEN_BASEDIR:-$(find_maven_basedir)} 220 | MAVEN_OPTS="$(concat_lines "$MAVEN_PROJECTBASEDIR/.mvn/jvm.config") $MAVEN_OPTS" 221 | 222 | # Provide a "standardized" way to retrieve the CLI args that will 223 | # work with both Windows and non-Windows executions. 
224 | MAVEN_CMD_LINE_ARGS="$MAVEN_CONFIG $@" 225 | export MAVEN_CMD_LINE_ARGS 226 | 227 | WRAPPER_LAUNCHER=org.apache.maven.wrapper.MavenWrapperMain 228 | 229 | exec "$JAVACMD" \ 230 | $MAVEN_OPTS \ 231 | -classpath "$MAVEN_PROJECTBASEDIR/.mvn/wrapper/maven-wrapper.jar" \ 232 | "-Dmaven.home=${M2_HOME}" "-Dmaven.multiModuleProjectDirectory=${MAVEN_PROJECTBASEDIR}" \ 233 | ${WRAPPER_LAUNCHER} "$@" 234 | -------------------------------------------------------------------------------- /html-extractor-web/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4.0.0 4 | 5 | org.apdplat 6 | html-extractor-web 7 | 1.1 8 | war 9 | 10 | 11 | html-extractor-web 12 | https://github.com/ysc/HtmlExtractor 13 | 14 | html-extractor-web是html-extractor的web接口,负责维护抽取规则。 15 | 16 | 17 | APDPlat 18 | http://apdplat.org/ 19 | 20 | 21 | 22 | GNU GENERAL PUBLIC LICENSE, Version 3 23 | http://www.gnu.org/licenses/gpl.html 24 | 25 | 26 | 2014 27 | 28 | https://github.com/ysc/HtmlExtractor 29 | scm:git:git://github.com/ysc/HtmlExtractor.git 30 | scm:git:ssh://git@github.com/ysc/HtmlExtractor.git 31 | HEAD 32 | 33 | 34 | 35 | 杨尚川 36 | ysc@apdplat.org 37 | http://yangshangchuan.iteye.com 38 | 39 | 40 | 41 | 42 | 43 | 44 | org.apache.maven.plugins 45 | maven-war-plugin 46 | ${maven-war-plugin.version} 47 | 48 | 49 | 50 | org.apache.maven.plugins 51 | maven-compiler-plugin 52 | ${maven-compiler-plugin.version} 53 | 54 | ${project.build.sourceEncoding} 55 | ${java.version} 56 | ${java.version} 57 | true 58 | true 59 | true 60 | 61 | 62 | 63 | 64 | org.apache.maven.plugins 65 | maven-jar-plugin 66 | ${maven-jar-plugin.version} 67 | 68 | 69 | 70 | org.apache.maven.plugins 71 | maven-surefire-plugin 72 | ${maven-surefire-plugin.version} 73 | 74 | true 75 | 76 | 77 | 78 | 79 | org.apache.maven.plugins 80 | maven-resources-plugin 81 | ${maven-resources-plugin.version} 82 | 83 | ${project.build.sourceEncoding} 84 | 85 | 86 | 87 | 88 | 
org.apache.maven.plugins 89 | maven-javadoc-plugin 90 | ${maven-javadoc-plugin.version} 91 | 92 | 93 | attach-docs 94 | 95 | jar 96 | 97 | 98 | 99 | 100 | 101 | 102 | maven-source-plugin 103 | ${maven-source-plugin.version} 104 | 105 | 106 | attach-sources 107 | 108 | jar 109 | 110 | 111 | 112 | 113 | 114 | org.mortbay.jetty 115 | maven-jetty-plugin 116 | ${maven-jetty-plugin.version} 117 | 118 | 119 | 120 | 121 | org.codehaus.sonar 122 | sonar-maven3-plugin 123 | ${sonar-maven3-plugin.version} 124 | 125 | 126 | 127 | 128 | 1.8 129 | UTF-8 130 | 131 | 2.1.1 132 | 3.0 133 | 2.4 134 | 2.14 135 | 2.6 136 | 2.9.1 137 | 2.2.1 138 | 3.5 139 | 6.1.26 140 | 141 | 4.11 142 | 7.0 143 | 5.1.18 144 | 1.6.4 145 | 0.9.28 146 | 2.5.1 147 | 2.6 148 | 1.9.13 149 | 150 | 151 | 152 | 153 | junit 154 | junit 155 | ${junit.version} 156 | test 157 | 158 | 159 | 160 | javax 161 | javaee-api 162 | ${javaee-api.version} 163 | provided 164 | 165 | 166 | 167 | mysql 168 | mysql-connector-java 169 | ${mysql.version} 170 | 171 | 172 | 173 | org.slf4j 174 | slf4j-api 175 | ${slf4j-api.version} 176 | 177 | 178 | 179 | ch.qos.logback 180 | logback-classic 181 | ${logback-classic.version} 182 | 183 | 184 | commons-logging 185 | commons-logging 186 | 187 | 188 | runtime 189 | 190 | 191 | 192 | redis.clients 193 | jedis 194 | ${jedis.version} 195 | 196 | 197 | commons-lang 198 | commons-lang 199 | ${commons-lang.version} 200 | 201 | 202 | org.codehaus.jackson 203 | jackson-mapper-asl 204 | ${jackson.version} 205 | 206 | 207 | 208 | -------------------------------------------------------------------------------- /html-extractor/src/main/resources/voa/EnglishAtTheMovies.txt: -------------------------------------------------------------------------------- 1 | http://learningenglish.voanews.com/a/4515733.html=Put Yourself Out There 2 | http://learningenglish.voanews.com/a/4515732.html=Cast a Very Long Shadow 3 | http://learningenglish.voanews.com/a/4515734.html=Shut This Whole Thing Down 4 | 
http://learningenglish.voanews.com/a/4403351.html=Holding Your Breath 5 | http://learningenglish.voanews.com/a/4403350.html=Hold You In Contempt 6 | http://learningenglish.voanews.com/a/4403346.html=Scared Out Of Her Mind 7 | http://learningenglish.voanews.com/a/4403348.html=Damaged Goods 8 | http://learningenglish.voanews.com/a/4403347.html=Not My Cup Of Tea 9 | http://learningenglish.voanews.com/a/4403345.html=You In? 10 | http://learningenglish.voanews.com/a/4403340.html=Took Out 11 | http://learningenglish.voanews.com/a/4403339.html=Down The Tubes 12 | http://learningenglish.voanews.com/a/4403341.html=Felt Closer To 13 | http://learningenglish.voanews.com/a/4403342.html=Rust Bucket 14 | http://learningenglish.voanews.com/a/4278055.html=Up A Notch 15 | http://learningenglish.voanews.com/a/4278047.html=Pick Up Where He Left Off 16 | http://learningenglish.voanews.com/a/4278052.html=My Head Is Pounding 17 | http://learningenglish.voanews.com/a/4278048.html=Leaving To Chance 18 | http://learningenglish.voanews.com/a/4278046.html=Hold Them Accountable 19 | http://learningenglish.voanews.com/a/4278039.html=Get Me 20 | http://learningenglish.voanews.com/a/4278043.html=Even The Playing Field 21 | http://learningenglish.voanews.com/a/4278041.html=Cut It Out 22 | http://learningenglish.voanews.com/a/4278040.html=Bounce Back 23 | http://learningenglish.voanews.com/a/4218427.html=Suicide Mission 24 | http://learningenglish.voanews.com/a/4218437.html=See You On The Other Side 25 | http://learningenglish.voanews.com/a/4194269.html=Master Of Disguise 26 | http://learningenglish.voanews.com/a/4194274.html=Let You Down 27 | http://learningenglish.voanews.com/a/4194268.html=I've Lost My Way 28 | http://learningenglish.voanews.com/a/4194280.html=I Dig It 29 | http://learningenglish.voanews.com/a/4194259.html=Give Him A Break 30 | http://learningenglish.voanews.com/a/4194261.html=Dump Me 31 | http://learningenglish.voanews.com/a/4194264.html=Blast From The Past 32 | 
http://learningenglish.voanews.com/a/4194262.html=Act Out 33 | http://learningenglish.voanews.com/a/3093779.html=Cooked Up 34 | http://learningenglish.voanews.com/a/3207722.html=Gear Up 35 | http://learningenglish.voanews.com/a/3207728.html=Waste Of Time 36 | http://learningenglish.voanews.com/a/3093784.html=Scaredy-cat 37 | http://learningenglish.voanews.com/a/3093783.html=Put Your Foot Down 38 | http://learningenglish.voanews.com/a/3093775.html=A Few Tricks Up Her Sleeve 39 | http://learningenglish.voanews.com/a/3207737.html=Pick A Fight 40 | http://learningenglish.voanews.com/a/4072135.html=Don't Jinx Me 41 | http://learningenglish.voanews.com/a/4072139.html=We're Broke 42 | http://learningenglish.voanews.com/a/4072137.html=A Little Off 43 | http://learningenglish.voanews.com/a/4072134.html=Take Back 44 | http://learningenglish.voanews.com/a/4072136.html=Come Along 45 | http://learningenglish.voanews.com/a/4017445.html=Hand-outs 46 | http://learningenglish.voanews.com/a/4017452.html=Stand A Chance 47 | http://learningenglish.voanews.com/a/4017446.html=A Keeper 48 | http://learningenglish.voanews.com/a/4017448.html=Getting Ahead of Myself 49 | http://learningenglish.voanews.com/a/4017449.html=Crushing It 50 | http://learningenglish.voanews.com/a/3930642.html=Wait For Backup 51 | http://learningenglish.voanews.com/a/3930638.html=Fire Burning In Our Bellies 52 | http://learningenglish.voanews.com/a/3930647.html=Where Do You See Yourself In Five Years? 
53 | http://learningenglish.voanews.com/a/3930639.html=Fit In 54 | http://learningenglish.voanews.com/a/3930641.html=Takes Your Breath Away 55 | http://learningenglish.voanews.com/a/3930646.html=You Got This 56 | http://learningenglish.voanews.com/a/3930645.html=You Will Soon Pay 57 | http://learningenglish.voanews.com/a/3930644.html=Go Undercover 58 | http://learningenglish.voanews.com/a/3930643.html=See Fit 59 | http://learningenglish.voanews.com/a/3930640.html=It's Shady 60 | http://learningenglish.voanews.com/a/3818369.html=Time Flies When You're Having fun 61 | http://learningenglish.voanews.com/a/3823000.html=Something Fishy 62 | http://learningenglish.voanews.com/a/3823001.html=Turn Back The Clock 63 | http://learningenglish.voanews.com/a/3822997.html=Jack Up Our Price 64 | http://learningenglish.voanews.com/a/3822994.html=Gifted 65 | http://learningenglish.voanews.com/a/3822998.html=Practice Run 66 | http://learningenglish.voanews.com/a/3822995.html=Let's Roll 67 | http://learningenglish.voanews.com/a/3822999.html=Set Things Right 68 | http://learningenglish.voanews.com/a/3822996.html=Fishing Around 69 | http://learningenglish.voanews.com/a/3818370.html=Got Our Work Cut Out For Us 70 | http://learningenglish.voanews.com/a/3725681.html=Piece Of Cake 71 | http://learningenglish.voanews.com/a/3725689.html=Crooked Cops 72 | http://learningenglish.voanews.com/a/3725680.html=I'm Really Freaking Out 73 | http://learningenglish.voanews.com/a/3725688.html=Ahead Of Your Time 74 | http://learningenglish.voanews.com/a/3725679.html=Power Nap 75 | http://learningenglish.voanews.com/a/3725674.html=I'm From A Different Planet 76 | http://learningenglish.voanews.com/a/3725682.html=To Take Charge 77 | http://learningenglish.voanews.com/a/3725678.html=It's Revolutionary 78 | http://learningenglish.voanews.com/a/3725675.html=Hitting Rock Bottom 79 | http://learningenglish.voanews.com/a/3725683.html=Write Your Own Rules 80 | 
http://learningenglish.voanews.com/a/3624832.html=You're A Piece Of Work 81 | http://learningenglish.voanews.com/a/3624830.html=My Heart Was Broken 82 | http://learningenglish.voanews.com/a/3624829.html=Lost Their Way 83 | http://learningenglish.voanews.com/a/3624834.html=You Are Having Visions 84 | http://learningenglish.voanews.com/a/3624825.html=Get Yourself Lawyered Up 85 | http://learningenglish.voanews.com/a/3624824.html=A Chain Reaction 86 | http://learningenglish.voanews.com/a/3624823.html=Fresh Start 87 | http://learningenglish.voanews.com/a/3624831.html=There's Something Off About Them 88 | http://learningenglish.voanews.com/a/3624826.html=Brace For Impact 89 | http://learningenglish.voanews.com/a/3624828.html=I am Dead Meat 90 | http://learningenglish.voanews.com/a/3529894.html=It's Crunch Time 91 | http://learningenglish.voanews.com/a/3529901.html=Not Everyone Is Wired 92 | http://learningenglish.voanews.com/a/3529902.html=The Cost of Doing Business 93 | http://learningenglish.voanews.com/a/3529904.html=We've Been Hacked 94 | http://learningenglish.voanews.com/a/3529896.html=Clean Record 95 | http://learningenglish.voanews.com/a/3529898.html=Give Me A Couple Pointers 96 | http://learningenglish.voanews.com/a/3529899.html=Make Things Right 97 | http://learningenglish.voanews.com/a/3529903.html=They Tracked You 98 | http://learningenglish.voanews.com/a/3529900.html=It's Not Cool 99 | http://learningenglish.voanews.com/a/3529897.html=I've Lost Direction 100 | http://learningenglish.voanews.com/a/3428747.html=More To You Than Meets The Eye 101 | http://learningenglish.voanews.com/a/3428759.html=You're A Legend 102 | http://learningenglish.voanews.com/a/3428746.html=I'm Just Ordinary 103 | http://learningenglish.voanews.com/a/3428743.html=What These Newcomers' Intentions Are 104 | http://learningenglish.voanews.com/a/3428758.html=Word Travels 105 | http://learningenglish.voanews.com/a/3428764.html=White Collar Crime 106 | 
http://learningenglish.voanews.com/a/3428749.html=Somebody Got The Better Of Us 107 | http://learningenglish.voanews.com/a/3428742.html=Hold His Feet To The Fire 108 | http://learningenglish.voanews.com/a/3428752.html=Watch Their Back 109 | http://learningenglish.voanews.com/a/3428741.html=There's A Lot At Stake 110 | http://learningenglish.voanews.com/a/3318586.html=Stepping Up 111 | http://learningenglish.voanews.com/a/3318577.html=Get Her Digits 112 | http://learningenglish.voanews.com/a/3318576.html=Signed Up For 113 | http://learningenglish.voanews.com/a/3318591.html=Swallow Your Pride 114 | http://learningenglish.voanews.com/a/3318585.html=Playing with Fire 115 | http://learningenglish.voanews.com/a/3318588.html=Showing Off 116 | http://learningenglish.voanews.com/a/3318578.html=Make Them Pay 117 | http://learningenglish.voanews.com/a/3318579.html=Justice is About to be Served 118 | http://learningenglish.voanews.com/a/3318584.html=Not Giving Up 119 | http://learningenglish.voanews.com/a/3318592.html=Stop at Nothing 120 | http://learningenglish.voanews.com/a/3207730.html=Make Your Case 121 | http://learningenglish.voanews.com/a/3207721.html=Come To Your Senses 122 | http://learningenglish.voanews.com/a/3207723.html=Make It Count 123 | http://learningenglish.voanews.com/a/3207729.html=Sounds Screwy 124 | http://learningenglish.voanews.com/a/3207731.html=So Long 125 | http://learningenglish.voanews.com/a/3207737.html=Pick A Fight 126 | http://learningenglish.voanews.com/a/3207728.html=Waste Of Time 127 | http://learningenglish.voanews.com/a/3207725.html=Party Foul 128 | http://learningenglish.voanews.com/a/3207724.html=All Hands On Deck 129 | http://learningenglish.voanews.com/a/3207722.html=Gear Up 130 | http://learningenglish.voanews.com/a/3093798.html=Time Bomb 131 | http://learningenglish.voanews.com/a/3093802.html=Shaking Us Down 132 | http://learningenglish.voanews.com/a/3093795.html=Felt Like I Belonged 133 | 
http://learningenglish.voanews.com/a/3093785.html=Stand On its Own 134 | http://learningenglish.voanews.com/a/3093789.html=Bigger Than They Are 135 | http://learningenglish.voanews.com/a/3093783.html=Put Your Foot Down 136 | http://learningenglish.voanews.com/a/3093779.html=Cooked Up 137 | http://learningenglish.voanews.com/a/3093784.html=Scaredy-cat 138 | http://learningenglish.voanews.com/a/3093775.html=A Few Tricks Up Her Sleeve 139 | http://learningenglish.voanews.com/a/3093780.html=Clean Slate -------------------------------------------------------------------------------- /html-extractor/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4.0.0 4 | 5 | org.apdplat 6 | html-extractor 7 | 1.1 8 | jar 9 | 10 | 11 | html-extractor 12 | https://github.com/ysc/HtmlExtractor 13 | 14 | html-extractor是一个Java实现的基于模板的通用的网页结构化信息精准抽取组件。 15 | 16 | 17 | APDPlat 18 | http://apdplat.org/ 19 | 20 | 21 | 22 | GNU GENERAL PUBLIC LICENSE, Version 3 23 | http://www.gnu.org/licenses/gpl.html 24 | 25 | 26 | 2014 27 | 28 | https://github.com/ysc/HtmlExtractor 29 | scm:git:git://github.com/ysc/HtmlExtractor.git 30 | scm:git:ssh://git@github.com/ysc/HtmlExtractor.git 31 | HEAD 32 | 33 | 34 | 35 | 杨尚川 36 | ysc@apdplat.org 37 | http://yangshangchuan.iteye.com 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | org.apache.maven.plugins 46 | maven-compiler-plugin 47 | ${maven-compiler-plugin.version} 48 | 49 | ${project.build.sourceEncoding} 50 | ${java.version} 51 | ${java.version} 52 | true 53 | true 54 | true 55 | 56 | 57 | 58 | 59 | org.apache.maven.plugins 60 | maven-jar-plugin 61 | ${maven-jar-plugin.version} 62 | 63 | 64 | **/logback.xml 65 | 66 | 67 | 68 | 69 | 70 | org.apache.maven.plugins 71 | maven-surefire-plugin 72 | ${maven-surefire-plugin.version} 73 | 74 | true 75 | 76 | 77 | 78 | 79 | org.apache.maven.plugins 80 | maven-resources-plugin 81 | ${maven-resources-plugin.version} 82 | 83 | ${project.build.sourceEncoding} 84 | 85 | 86 | 87 | 
88 | org.apache.maven.plugins 89 | maven-javadoc-plugin 90 | ${maven-javadoc-plugin.version} 91 | 92 | 93 | attach-docs 94 | 95 | jar 96 | 97 | 98 | 99 | 100 | 101 | 102 | maven-source-plugin 103 | ${maven-source-plugin.version} 104 | 105 | 106 | attach-sources 107 | 108 | jar 109 | 110 | 111 | 112 | 113 | 114 | 115 | 116 | org.codehaus.sonar 117 | sonar-maven3-plugin 118 | ${sonar-maven3-plugin.version} 119 | 120 | 121 | 122 | 123 | 1.8 124 | UTF-8 125 | 126 | 3.0 127 | 2.4 128 | 2.14 129 | 2.6 130 | 2.9.1 131 | 2.2.1 132 | 3.5 133 | 134 | 4.11 135 | 1.6.4 136 | 0.9.28 137 | 2.5.1 138 | 1.7.2 139 | 2.6 140 | 1.9.13 141 | 3.1 142 | 1.6.4 143 | 2.24 144 | 3.0.1 145 | 146 | 147 | 148 | 149 | junit 150 | junit 151 | 4.11 152 | test 153 | 154 | 155 | 156 | org.slf4j 157 | slf4j-api 158 | ${slf4j-api.version} 159 | 160 | 161 | 162 | ch.qos.logback 163 | logback-classic 164 | ${logback-classic.version} 165 | 166 | 167 | commons-logging 168 | commons-logging 169 | 170 | 171 | runtime 172 | 173 | 174 | 175 | org.slf4j 176 | jcl-over-slf4j 177 | ${jcl-over-slf4j.version} 178 | runtime 179 | 180 | 181 | 182 | redis.clients 183 | jedis 184 | ${jedis.version} 185 | 186 | 187 | org.jsoup 188 | jsoup 189 | ${jsoup.version} 190 | 191 | 192 | commons-lang 193 | commons-lang 194 | ${commons-lang.version} 195 | 196 | 197 | org.codehaus.jackson 198 | jackson-mapper-asl 199 | ${jackson.version} 200 | 201 | 202 | commons-httpclient 203 | commons-httpclient 204 | ${commons-httpclient.version} 205 | 206 | 207 | commons-logging 208 | commons-logging 209 | 210 | 211 | 212 | 213 | net.sourceforge.htmlunit 214 | htmlunit 215 | ${htmlunit.version} 216 | 217 | 218 | commons-logging 219 | commons-logging 220 | 221 | 222 | 223 | 224 | org.seleniumhq.selenium 225 | selenium-java 226 | ${selenium.version} 227 | 228 | 229 | 230 | -------------------------------------------------------------------------------- /LICENSE.txt: 
-------------------------------------------------------------------------------- 1 | 2 | Apache License 3 | Version 2.0, January 2004 4 | http://www.apache.org/licenses/ 5 | 6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 7 | 8 | 1. Definitions. 9 | 10 | "License" shall mean the terms and conditions for use, reproduction, 11 | and distribution as defined by Sections 1 through 9 of this document. 12 | 13 | "Licensor" shall mean the copyright owner or entity authorized by 14 | the copyright owner that is granting the License. 15 | 16 | "Legal Entity" shall mean the union of the acting entity and all 17 | other entities that control, are controlled by, or are under common 18 | control with that entity. For the purposes of this definition, 19 | "control" means (i) the power, direct or indirect, to cause the 20 | direction or management of such entity, whether by contract or 21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 22 | outstanding shares, or (iii) beneficial ownership of such entity. 23 | 24 | "You" (or "Your") shall mean an individual or Legal Entity 25 | exercising permissions granted by this License. 26 | 27 | "Source" form shall mean the preferred form for making modifications, 28 | including but not limited to software source code, documentation 29 | source, and configuration files. 30 | 31 | "Object" form shall mean any form resulting from mechanical 32 | transformation or translation of a Source form, including but 33 | not limited to compiled object code, generated documentation, 34 | and conversions to other media types. 35 | 36 | "Work" shall mean the work of authorship, whether in Source or 37 | Object form, made available under the License, as indicated by a 38 | copyright notice that is included in or attached to the work 39 | (an example is provided in the Appendix below). 
40 | 41 | "Derivative Works" shall mean any work, whether in Source or Object 42 | form, that is based on (or derived from) the Work and for which the 43 | editorial revisions, annotations, elaborations, or other modifications 44 | represent, as a whole, an original work of authorship. For the purposes 45 | of this License, Derivative Works shall not include works that remain 46 | separable from, or merely link (or bind by name) to the interfaces of, 47 | the Work and Derivative Works thereof. 48 | 49 | "Contribution" shall mean any work of authorship, including 50 | the original version of the Work and any modifications or additions 51 | to that Work or Derivative Works thereof, that is intentionally 52 | submitted to Licensor for inclusion in the Work by the copyright owner 53 | or by an individual or Legal Entity authorized to submit on behalf of 54 | the copyright owner. For the purposes of this definition, "submitted" 55 | means any form of electronic, verbal, or written communication sent 56 | to the Licensor or its representatives, including but not limited to 57 | communication on electronic mailing lists, source code control systems, 58 | and issue tracking systems that are managed by, or on behalf of, the 59 | Licensor for the purpose of discussing and improving the Work, but 60 | excluding communication that is conspicuously marked or otherwise 61 | designated in writing by the copyright owner as "Not a Contribution." 62 | 63 | "Contributor" shall mean Licensor and any individual or Legal Entity 64 | on behalf of whom a Contribution has been received by Licensor and 65 | subsequently incorporated within the Work. 66 | 67 | 2. Grant of Copyright License. 
Subject to the terms and conditions of 68 | this License, each Contributor hereby grants to You a perpetual, 69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 70 | copyright license to reproduce, prepare Derivative Works of, 71 | publicly display, publicly perform, sublicense, and distribute the 72 | Work and such Derivative Works in Source or Object form. 73 | 74 | 3. Grant of Patent License. Subject to the terms and conditions of 75 | this License, each Contributor hereby grants to You a perpetual, 76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 77 | (except as stated in this section) patent license to make, have made, 78 | use, offer to sell, sell, import, and otherwise transfer the Work, 79 | where such license applies only to those patent claims licensable 80 | by such Contributor that are necessarily infringed by their 81 | Contribution(s) alone or by combination of their Contribution(s) 82 | with the Work to which such Contribution(s) was submitted. If You 83 | institute patent litigation against any entity (including a 84 | cross-claim or counterclaim in a lawsuit) alleging that the Work 85 | or a Contribution incorporated within the Work constitutes direct 86 | or contributory patent infringement, then any patent licenses 87 | granted to You under this License for that Work shall terminate 88 | as of the date such litigation is filed. 89 | 90 | 4. Redistribution. 
You may reproduce and distribute copies of the 91 | Work or Derivative Works thereof in any medium, with or without 92 | modifications, and in Source or Object form, provided that You 93 | meet the following conditions: 94 | 95 | (a) You must give any other recipients of the Work or 96 | Derivative Works a copy of this License; and 97 | 98 | (b) You must cause any modified files to carry prominent notices 99 | stating that You changed the files; and 100 | 101 | (c) You must retain, in the Source form of any Derivative Works 102 | that You distribute, all copyright, patent, trademark, and 103 | attribution notices from the Source form of the Work, 104 | excluding those notices that do not pertain to any part of 105 | the Derivative Works; and 106 | 107 | (d) If the Work includes a "NOTICE" text file as part of its 108 | distribution, then any Derivative Works that You distribute must 109 | include a readable copy of the attribution notices contained 110 | within such NOTICE file, excluding those notices that do not 111 | pertain to any part of the Derivative Works, in at least one 112 | of the following places: within a NOTICE text file distributed 113 | as part of the Derivative Works; within the Source form or 114 | documentation, if provided along with the Derivative Works; or, 115 | within a display generated by the Derivative Works, if and 116 | wherever such third-party notices normally appear. The contents 117 | of the NOTICE file are for informational purposes only and 118 | do not modify the License. You may add Your own attribution 119 | notices within Derivative Works that You distribute, alongside 120 | or as an addendum to the NOTICE text from the Work, provided 121 | that such additional attribution notices cannot be construed 122 | as modifying the License. 
123 | 124 | You may add Your own copyright statement to Your modifications and 125 | may provide additional or different license terms and conditions 126 | for use, reproduction, or distribution of Your modifications, or 127 | for any such Derivative Works as a whole, provided Your use, 128 | reproduction, and distribution of the Work otherwise complies with 129 | the conditions stated in this License. 130 | 131 | 5. Submission of Contributions. Unless You explicitly state otherwise, 132 | any Contribution intentionally submitted for inclusion in the Work 133 | by You to the Licensor shall be under the terms and conditions of 134 | this License, without any additional terms or conditions. 135 | Notwithstanding the above, nothing herein shall supersede or modify 136 | the terms of any separate license agreement you may have executed 137 | with Licensor regarding such Contributions. 138 | 139 | 6. Trademarks. This License does not grant permission to use the trade 140 | names, trademarks, service marks, or product names of the Licensor, 141 | except as required for reasonable and customary use in describing the 142 | origin of the Work and reproducing the content of the NOTICE file. 143 | 144 | 7. Disclaimer of Warranty. Unless required by applicable law or 145 | agreed to in writing, Licensor provides the Work (and each 146 | Contributor provides its Contributions) on an "AS IS" BASIS, 147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 148 | implied, including, without limitation, any warranties or conditions 149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 150 | PARTICULAR PURPOSE. You are solely responsible for determining the 151 | appropriateness of using or redistributing the Work and assume any 152 | risks associated with Your exercise of permissions under this License. 153 | 154 | 8. Limitation of Liability. 
In no event and under no legal theory, 155 | whether in tort (including negligence), contract, or otherwise, 156 | unless required by applicable law (such as deliberate and grossly 157 | negligent acts) or agreed to in writing, shall any Contributor be 158 | liable to You for damages, including any direct, indirect, special, 159 | incidental, or consequential damages of any character arising as a 160 | result of this License or out of the use or inability to use the 161 | Work (including but not limited to damages for loss of goodwill, 162 | work stoppage, computer failure or malfunction, or any and all 163 | other commercial damages or losses), even if such Contributor 164 | has been advised of the possibility of such damages. 165 | 166 | 9. Accepting Warranty or Additional Liability. While redistributing 167 | the Work or Derivative Works thereof, You may choose to offer, 168 | and charge a fee for, acceptance of support, warranty, indemnity, 169 | or other liability obligations and/or rights consistent with this 170 | License. However, in accepting such obligations, You may act only 171 | on Your own behalf and on Your sole responsibility, not on behalf 172 | of any other Contributor, and only if You agree to indemnify, 173 | defend, and hold each Contributor harmless for any liability 174 | incurred by, or claims asserted against, such Contributor by reason 175 | of your accepting any such warranty or additional liability. 176 | 177 | END OF TERMS AND CONDITIONS 178 | 179 | APPENDIX: How to apply the Apache License to your work. 180 | 181 | To apply the Apache License to your work, attach the following 182 | boilerplate notice, with the fields enclosed by brackets "[]" 183 | replaced with your own identifying information. (Don't include 184 | the brackets!) The text should be enclosed in the appropriate 185 | comment syntax for the file format. 
We also recommend that a 186 | file or class name and description of purpose be included on the 187 | same "printed page" as the copyright notice for easier 188 | identification within third-party archives. 189 | 190 | Copyright [yyyy] [name of copyright owner] 191 | 192 | Licensed under the Apache License, Version 2.0 (the "License"); 193 | you may not use this file except in compliance with the License. 194 | You may obtain a copy of the License at 195 | 196 | http://www.apache.org/licenses/LICENSE-2.0 197 | 198 | Unless required by applicable law or agreed to in writing, software 199 | distributed under the License is distributed on an "AS IS" BASIS, 200 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 201 | See the License for the specific language governing permissions and 202 | limitations under the License. -------------------------------------------------------------------------------- /html-extractor/src/main/resources/voa/EnglishInAMinute.txt: -------------------------------------------------------------------------------- 1 | http://learningenglish.voanews.com/a/3620615.html=Boils Down To 2 | http://learningenglish.voanews.com/a/3699956.html=Put All Your Eggs in One Basket 3 | http://learningenglish.voanews.com/a/3699945.html=Breathing Room 4 | http://learningenglish.voanews.com/a/3661254.html=Pack Rat 5 | http://learningenglish.voanews.com/a/3699954.html=Jump on the Bandwagon 6 | http://learningenglish.voanews.com/a/3620611.html=See Eye to Eye 7 | http://learningenglish.voanews.com/a/3699952.html=Keep You on Your Toes 8 | http://learningenglish.voanews.com/a/3699948.html=Bigger Fish to Fry 9 | http://learningenglish.voanews.com/a/3435558.html=On a Roll 10 | http://learningenglish.voanews.com/a/3605386.html=Happy Medium 11 | http://learningenglish.voanews.com/a/3699953.html=One Tough Cookie 12 | http://learningenglish.voanews.com/a/3620612.html=Slippery Slope 13 | http://learningenglish.voanews.com/a/3605389.html=Freebie 14 | 
http://learningenglish.voanews.com/a/3661248.html=Two Left Feet 15 | http://learningenglish.voanews.com/a/3435564.html=Sugarcoat 16 | http://learningenglish.voanews.com/a/3605385.html=Get off on the Wrong (or Right!) Foot 17 | http://learningenglish.voanews.com/a/3620609.html=Golden Opportunity 18 | http://learningenglish.voanews.com/a/3563804.html=Slip One's Mind 19 | http://learningenglish.voanews.com/a/3240171.html=Dog Eat Dog 20 | http://learningenglish.voanews.com/a/3605387.html=Sink Your Teeth Into 21 | http://learningenglish.voanews.com/a/3545534.html=Switch Gears 22 | http://learningenglish.voanews.com/a/3406302.html=Break the Ice 23 | http://learningenglish.voanews.com/a/3563798.html=The Ball Is In Your Court 24 | http://learningenglish.voanews.com/a/3545538.html=Go the Extra Mile 25 | http://learningenglish.voanews.com/a/3390611.html=Music to My Ears 26 | http://learningenglish.voanews.com/a/3435557.html=Up One's Alley 27 | http://learningenglish.voanews.com/a/3390591.html=Off the Deep End 28 | http://learningenglish.voanews.com/a/3390586.html=Red Flag 29 | http://learningenglish.voanews.com/a/3406310.html=Hit Your Stride 30 | http://learningenglish.voanews.com/a/3435555.html=To Have a Lot on Your Plate 31 | http://learningenglish.voanews.com/a/3406300.html=Comfort Zone 32 | http://learningenglish.voanews.com/a/3390603.html=Turn the Tables 33 | http://learningenglish.voanews.com/a/3390599.html=Bent out of Shape 34 | http://learningenglish.voanews.com/a/3406303.html=Miss the Boat 35 | http://learningenglish.voanews.com/a/3390596.html=Sing a Different Tune 36 | http://learningenglish.voanews.com/a/3445382.html=Suck the Air out of the Room 37 | http://learningenglish.voanews.com/a/3390583.html=No Pain No Gain 38 | http://learningenglish.voanews.com/a/3240231.html=Where There's Smoke There's Fire 39 | http://learningenglish.voanews.com/a/3390584.html=A Fish Out of Water 40 | http://learningenglish.voanews.com/a/3282529.html=To Fall Down the Rabbit Hole 41 | 
http://learningenglish.voanews.com/a/3282533.html=Put Someone on the Spot 42 | http://learningenglish.voanews.com/a/3282555.html=Cool as a Cucumber 43 | http://learningenglish.voanews.com/a/3240167.html=At the Drop of a Hat 44 | http://learningenglish.voanews.com/a/3282557.html=Fly on the Wall 45 | http://learningenglish.voanews.com/a/3282572.html=Baggage 46 | http://learningenglish.voanews.com/a/3240181.html=The Last Straw 47 | http://learningenglish.voanews.com/a/3209591.html=Push the Envelope 48 | http://learningenglish.voanews.com/a/3209587.html=Break Even 49 | http://learningenglish.voanews.com/a/3240179.html=Moving Target 50 | http://learningenglish.voanews.com/a/3203430.html=Keep Your Eye on the Ball 51 | http://learningenglish.voanews.com/a/3209586.html=Bucket List 52 | http://learningenglish.voanews.com/a/3203427.html=Deer in the Headlights 53 | http://learningenglish.voanews.com/a/3209593.html=That Ship Has Sailed 54 | http://learningenglish.voanews.com/a/3203438.html=A Wolf in Sheep's Clothing 55 | http://learningenglish.voanews.com/a/3203422.html=Best of Both Worlds 56 | http://learningenglish.voanews.com/a/3080432.html=Get Roped Into 57 | http://learningenglish.voanews.com/a/3203426.html=Throw the Baby Out with the Bathwater 58 | http://learningenglish.voanews.com/a/3209594.html=Living Under a Rock 59 | http://learningenglish.voanews.com/a/3081382.html=Wiggle Room 60 | http://learningenglish.voanews.com/a/3080429.html=Fly Under the Radar 61 | http://learningenglish.voanews.com/a/3026464.html=Fall Through the Cracks 62 | http://learningenglish.voanews.com/a/3081369.html=Head Over Heels 63 | http://learningenglish.voanews.com/a/3026468.html=In the Spotlight 64 | http://learningenglish.voanews.com/a/3080438.html=Face the Music 65 | http://learningenglish.voanews.com/a/3026460.html=Branch Out 66 | http://learningenglish.voanews.com/a/3026463.html=Low-Hanging Fruit 67 | http://learningenglish.voanews.com/a/3026462.html=In Over My Head 68 | 
http://learningenglish.voanews.com/a/2833784.html=Don't Hold Your Breath 69 | http://learningenglish.voanews.com/a/2921220.html=Burning Bridges 70 | http://learningenglish.voanews.com/a/2921285.html=MIA 71 | http://learningenglish.voanews.com/a/2777234.html=Wake-Up Call 72 | http://learningenglish.voanews.com/a/2777201.html=The Calm Before the Storm 73 | http://learningenglish.voanews.com/a/2833771.html=Jack of All Trades 74 | http://learningenglish.voanews.com/a/2741430.html=To Have Butterflies 75 | http://learningenglish.voanews.com/a/2741442.html=Up In The Air 76 | http://learningenglish.voanews.com/a/2777204.html=Get a Grip 77 | http://learningenglish.voanews.com/a/2833770.html=Water Under the Bridge 78 | http://learningenglish.voanews.com/a/2777203.html=On the Same Page 79 | http://learningenglish.voanews.com/a/2777202.html=The Sky's the Limit 80 | http://learningenglish.voanews.com/a/2741426.html=Elephant in the Room 81 | http://learningenglish.voanews.com/a/2741424.html=Knock Your Socks Off 82 | http://learningenglish.voanews.com/a/2741435.html=Pain in the Neck 83 | http://learningenglish.voanews.com/a/2716955.html=On Pins and Needles 84 | http://learningenglish.voanews.com/a/2716987.html=On the Edge of Your Seat 85 | http://learningenglish.voanews.com/a/2716950.html=Nail-biter 86 | http://learningenglish.voanews.com/a/2833766.html=Clam Up 87 | http://learningenglish.voanews.com/a/2716948.html=A Bitter Pill to Swallow 88 | http://learningenglish.voanews.com/a/2716951.html=Fifth Wheel 89 | http://learningenglish.voanews.com/a/2711203.html=Jazz it Up 90 | http://learningenglish.voanews.com/a/2711200.html=Don't Count Your Chickens Before They Hatch 91 | http://learningenglish.voanews.com/a/2711198.html=Over the Hill 92 | http://learningenglish.voanews.com/a/2833767.html=Out of the Blue 93 | http://learningenglish.voanews.com/a/2711201.html=Out of the Woods 94 | http://learningenglish.voanews.com/a/2660132.html=Fishing for Compliments 95 | 
http://learningenglish.voanews.com/a/2670644.html=Ring a Bell 96 | http://learningenglish.voanews.com/a/2670640.html=Skating on Thin Ice 97 | http://learningenglish.voanews.com/a/2688209.html=Tip of the Iceberg 98 | http://learningenglish.voanews.com/a/2688203.html=Straight from the Horse's Mouth 99 | http://learningenglish.voanews.com/a/2688206.html=Sleep Like a Rock 100 | http://learningenglish.voanews.com/a/2688204.html=Shoot the Breeze 101 | http://learningenglish.voanews.com/a/2749865.html=Bend the Rules 102 | http://learningenglish.voanews.com/a/2670637.html=Put (Something) on Hold 103 | http://learningenglish.voanews.com/a/2670645.html=Off the Cuff 104 | http://learningenglish.voanews.com/a/2596479.html=Not Have a Leg to Stand on 105 | http://learningenglish.voanews.com/a/2596477.html=Out of Your Mind 106 | http://learningenglish.voanews.com/a/2580622.html=Open Book 107 | http://learningenglish.voanews.com/a/2580617.html=Nip it in the Bud 108 | http://learningenglish.voanews.com/a/2670628.html=Down to Earth 109 | http://learningenglish.voanews.com/a/2580619.html=Drama Queen 110 | http://learningenglish.voanews.com/a/2667762.html=Keep One's Eyes Peeled 111 | http://learningenglish.voanews.com/a/2553166.html=Back to Square One 112 | http://learningenglish.voanews.com/a/2553157.html=Know the Ropes 113 | http://learningenglish.voanews.com/a/2553143.html=The Apple Doesn't Fall Far From the Tree 114 | http://learningenglish.voanews.com/a/2553116.html=Another One Bites the Dust 115 | http://learningenglish.voanews.com/a/2553133.html=Last-Ditch Effort 116 | http://learningenglish.voanews.com/a/2510605.html=Poker Face 117 | http://learningenglish.voanews.com/a/2510600.html=Blow Off Steam 118 | http://learningenglish.voanews.com/a/2510597.html=Up the Creek Without a Paddle 119 | http://learningenglish.voanews.com/a/2510754.html=Junk Food 120 | http://learningenglish.voanews.com/a/2510756.html=Fly by the Seat of One's Pants 121 | 
http://learningenglish.voanews.com/a/2494877.html=Food for Thought 122 | http://learningenglish.voanews.com/a/2494873.html=Got up on the Wrong Side of the Bed 123 | http://learningenglish.voanews.com/a/2494876.html=Fair Weather Friend 124 | http://learningenglish.voanews.com/a/2494875.html=Clear the Air 125 | http://learningenglish.voanews.com/a/2494874.html=Calling Someone's Bluff 126 | http://learningenglish.voanews.com/a/2479292.html=Barking Up the Wrong Tree 127 | http://learningenglish.voanews.com/a/2479288.html=Back-Seat Driver 128 | http://learningenglish.voanews.com/a/2479287.html=Apple of One's Eye 129 | http://learningenglish.voanews.com/a/2479286.html=As the Crow Flies 130 | http://learningenglish.voanews.com/a/2479290.html=Burned Out 131 | http://learningenglish.voanews.com/a/1679564.html=It's Up To You 132 | http://learningenglish.voanews.com/a/1666400.html=Miss The Point 133 | http://learningenglish.voanews.com/a/1698437.html=Go With The Flow 134 | http://learningenglish.voanews.com/a/1679557.html=A Piece of Cake 135 | http://learningenglish.voanews.com/a/1698912.html=Get On My Nerves 136 | http://learningenglish.voanews.com/a/1651145.html=What's Up? 
137 | http://learningenglish.voanews.com/a/1664388.html=Grab a Bite 138 | http://learningenglish.voanews.com/a/1665136.html=A Day Late and a Dollar Short 139 | http://learningenglish.voanews.com/a/1665197.html=Dressed to Kill 140 | http://learningenglish.voanews.com/a/1665309.html=My Two Cents 141 | http://learningenglish.voanews.com/a/1665362.html=Burn The Midnight Oil 142 | http://learningenglish.voanews.com/a/1665601.html=Actions Speak Louder Than Words 143 | http://learningenglish.voanews.com/a/1666266.html=Keep Your Chin Up 144 | http://learningenglish.voanews.com/a/1670596.html=Bad Taste in My Mouth 145 | http://learningenglish.voanews.com/a/1670827.html=If The Shoe Fits 146 | http://learningenglish.voanews.com/a/1672261.html=A Game Plan 147 | http://learningenglish.voanews.com/a/1675045.html=In Your Dreams 148 | http://learningenglish.voanews.com/a/1675057.html=Stay in Touch 149 | http://learningenglish.voanews.com/a/1675069.html=Draw a Blank 150 | http://learningenglish.voanews.com/a/1675076.html=Get out of Here 151 | http://learningenglish.voanews.com/a/1675089.html=Costs a Pretty Penny 152 | http://learningenglish.voanews.com/a/1678615.html=Once in a Blue Moon 153 | http://learningenglish.voanews.com/a/1679513.html=A Breath of Fresh Air 154 | http://learningenglish.voanews.com/a/1684038.html=Call It A Day 155 | http://learningenglish.voanews.com/a/1698742.html=Cut to the Chase 156 | http://learningenglish.voanews.com/a/1698759.html=Cash Cow 157 | http://learningenglish.voanews.com/a/1698806.html=No Pain, No Gain 158 | http://learningenglish.voanews.com/a/1698818.html=Heart of Gold 159 | http://learningenglish.voanews.com/a/1698839.html=Under The Weather 160 | http://learningenglish.voanews.com/a/1698852.html=Ears Are Burning 161 | http://learningenglish.voanews.com/a/1698857.html=Off The Top Of My Head 162 | http://learningenglish.voanews.com/a/1698863.html=Hear a Pin Drop 163 | http://learningenglish.voanews.com/a/1698950.html=Play It By Ear 164 | 
http://learningenglish.voanews.com/a/1664509.html=Cats and Dogs 165 | http://learningenglish.voanews.com/a/1664835.html=It's Been Ages 166 | http://learningenglish.voanews.com/a/1670728.html=A Basket Case 167 | http://learningenglish.voanews.com/a/1679544.html=Break a Leg 168 | http://learningenglish.voanews.com/a/1684020.html=Make Up Your Mind 169 | http://learningenglish.voanews.com/a/1698727.html=Break the Bank 170 | http://learningenglish.voanews.com/a/1698775.html=Dime a Dozen 171 | http://learningenglish.voanews.com/a/1698787.html=24 - 7 172 | http://learningenglish.voanews.com/a/1698827.html=An Arm and a Leg 173 | http://learningenglish.voanews.com/a/1684043.html=Get Cold Feet 174 | http://learningenglish.voanews.com/a/1684030.html=Hit The Sack 175 | http://learningenglish.voanews.com/a/1684094.html=Treat With Kid Gloves 176 | http://learningenglish.voanews.com/a/1679529.html=Bite My Tongue --------------------------------------------------------------------------------