├── html-extractor
├── .gitignore
├── src
│ ├── main
│ │ ├── resources
│ │ │ ├── voa
│ │ │ │ ├── PeopleInAmerica.txt
│ │ │ │ ├── HowAmericaElects.txt
│ │ │ │ ├── EverydayGrammarTV.txt
│ │ │ │ ├── Let'sLearnEnglish.txt
│ │ │ │ ├── America'sNationalParks.txt
│ │ │ │ ├── PersonalTechnology.txt
│ │ │ │ ├── HealthLifestyle.txt
│ │ │ │ ├── ScienceintheNews.txt
│ │ │ │ ├── ThisIsAmerica.txt
│ │ │ │ ├── NewsWords.txt
│ │ │ │ ├── EverydayGrammar.txt
│ │ │ │ ├── EnglishAtTheMovies.txt
│ │ │ │ └── EnglishInAMinute.txt
│ │ │ └── logback.xml
│ │ └── java
│ │ │ └── org
│ │ │ └── apdplat
│ │ │ └── extractor
│ │ │ └── html
│ │ │ ├── HtmlFetcher.java
│ │ │ ├── HtmlExtractor.java
│ │ │ ├── model
│ │ │ ├── ExtractResultItem.java
│ │ │ ├── ExtractFunction.java
│ │ │ ├── UrlPattern.java
│ │ │ ├── HtmlTemplate.java
│ │ │ ├── ExtractFailLog.java
│ │ │ ├── CssPath.java
│ │ │ └── ExtractResult.java
│ │ │ ├── impl
│ │ │ ├── SeleniumHtmlFetcher.java
│ │ │ ├── HtmlUnitHtmlFetcher.java
│ │ │ ├── JSoupHtmlFetcher.java
│ │ │ └── ExtractFunctionExecutor.java
│ │ │ └── demo
│ │ │ └── Toutiao.java
│ └── test
│ │ └── java
│ │ └── org
│ │ └── apdplat
│ │ └── extractor
│ │ └── html
│ │ └── impl
│ │ ├── JSoupHtmlFetcherTest.java
│ │ ├── HtmlUnitHtmlFetcherTest.java
│ │ └── SeleniumHtmlFetcherTest.java
└── pom.xml
├── html-extractor-web
├── .gitignore
├── src
│ └── main
│ │ ├── webapp
│ │ ├── META-INF
│ │ │ └── context.xml
│ │ ├── WEB-INF
│ │ │ └── web.xml
│ │ └── api
│ │ │ └── all_extract_regular.jsp
│ │ ├── java
│ │ └── org
│ │ │ └── apdplat
│ │ │ └── extractor
│ │ │ └── html
│ │ │ └── server
│ │ │ ├── redis
│ │ │ ├── RedisClient.java
│ │ │ └── RedisListener.java
│ │ │ ├── model
│ │ │ ├── ExtractResultItem.java
│ │ │ ├── ExtractFunction.java
│ │ │ ├── UrlPattern.java
│ │ │ ├── HtmlTemplate.java
│ │ │ ├── ExtractResult.java
│ │ │ ├── ExtractFailLog.java
│ │ │ └── CssPath.java
│ │ │ └── service
│ │ │ └── JsonGenerator.java
│ │ └── resources
│ │ └── logback.xml
└── pom.xml
├── .travis.yml
├── .gitignore
├── pom.xml
├── mvnw.cmd
├── README.md
├── mvnw
└── LICENSE.txt
/html-extractor/.gitignore:
--------------------------------------------------------------------------------
1 | .settings/
2 | .classpath
3 | .project
4 | target/
5 | logs/
6 |
--------------------------------------------------------------------------------
/html-extractor-web/.gitignore:
--------------------------------------------------------------------------------
1 | .settings/
2 | .classpath
3 | .project
4 | target/
5 | logs/
6 |
--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
1 | language: java
2 |
3 | jdk:
4 | - oraclejdk8
5 |
6 | install:
7 | - mvn -N io.takari:maven:wrapper
--------------------------------------------------------------------------------
/html-extractor-web/src/main/webapp/META-INF/context.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
--------------------------------------------------------------------------------
/html-extractor/src/main/resources/voa/PeopleInAmerica.txt:
--------------------------------------------------------------------------------
1 | http://learningenglish.voanews.com/a/makers-row-made-in-america/3386962.html=Maker's Row Made in America
2 | http://learningenglish.voanews.com/a/painting-canvases-that-lives-and-breathes/3315141.html=Athena Zhe Painting a Canvas That Lives and Breathes
3 | http://learningenglish.voanews.com/a/i-didnt-have-to-come-here-i-chose-to-come-here/3308781.html=Andy Shallal 'I Didn't Have to Come Here, I Chose to Come Here'
4 | http://learningenglish.voanews.com/a/introducing-people-in-america/3337834.html=Introducing 'People in America'
--------------------------------------------------------------------------------
/html-extractor/src/main/resources/voa/HowAmericaElects.txt:
--------------------------------------------------------------------------------
1 | http://learningenglish.voanews.com/a/3395534.html=How America Elects Convention Rules
2 | http://learningenglish.voanews.com/a/3369131.html=How America Elects Becoming a Delegate
3 | http://learningenglish.voanews.com/a/3298650.html=How America Elects General Election Day
4 | http://learningenglish.voanews.com/a/3264801.html=How America Elects Conventions
5 | http://learningenglish.voanews.com/a/3254747.html=How America Elects US Political Parties
6 | http://learningenglish.voanews.com/a/3163339.html=How America Elects Caucuses Primaries
7 | http://learningenglish.voanews.com/a/3161858.html=How America Elects Polls Debates
8 | http://learningenglish.voanews.com/a/3158592.html=How America Elects How To Raise Money
--------------------------------------------------------------------------------
/html-extractor-web/src/main/webapp/WEB-INF/web.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 | org.apdplat.extractor.html.server.redis.RedisListener
6 |
7 |
8 |
9 | redis.host
10 | localhost
11 |
12 |
13 | redis.port
14 | 6379
15 |
16 |
17 |
18 | 30
19 |
20 |
21 |
22 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | .settings/
2 | .classpath
3 | .project
4 | target/
5 | logs/
6 | data/
7 | .idea/
8 | .gradle/
9 | build/
10 | HtmlExtractor.iml
11 | HtmlExtractor.ipr
12 | HtmlExtractor.iws
13 | html-extractor/.settings/
14 | html-extractor/.classpath
15 | html-extractor/.project
16 | html-extractor/target/
17 | html-extractor/logs/
18 | html-extractor/data/
19 | html-extractor/.idea/
20 | html-extractor/.gradle/
21 | html-extractor/build/
22 | html-extractor/html-extractor.iml
23 | html-extractor/html-extractor.ipr
24 | html-extractor/html-extractor.iws
25 | html-extractor-web/.settings/
26 | html-extractor-web/.classpath
27 | html-extractor-web/.project
28 | html-extractor-web/target/
29 | html-extractor-web/logs/
30 | html-extractor-web/data/
31 | html-extractor-web/.idea/
32 | html-extractor-web/.gradle/
33 | html-extractor-web/build/
34 | html-extractor-web/html-extractor-web.iml
35 | html-extractor-web/html-extractor-web.ipr
36 | html-extractor-web/html-extractor-web.iws
37 | html-extractor-web/nb-configuration.xml
--------------------------------------------------------------------------------
/html-extractor/src/main/java/org/apdplat/extractor/html/HtmlFetcher.java:
--------------------------------------------------------------------------------
1 | /**
2 | *
3 | * APDPlat - Application Product Development Platform
4 | * Copyright (c) 2013, 杨尚川, yang-shangchuan@qq.com
5 | *
6 | * This program is free software: you can redistribute it and/or modify
7 | * it under the terms of the GNU General Public License as published by
8 | * the Free Software Foundation, either version 3 of the License, or
9 | * (at your option) any later version.
10 | *
11 | * This program is distributed in the hope that it will be useful,
12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 | * GNU General Public License for more details.
15 | *
16 | * You should have received a copy of the GNU General Public License
17 | * along with this program. If not, see <http://www.gnu.org/licenses/>.
18 | *
19 | */
20 |
21 | package org.apdplat.extractor.html;
22 |
23 | /**
24 |  * Contract for tools that retrieve the content of a web page.
25 |  *
26 |  * @author 杨尚川
27 |  */
28 | public interface HtmlFetcher {
29 |     /**
30 |      * Fetches the page at the given URL.
31 |      *
32 |      * @param url address of the page to fetch
33 |      * @return the fetched page content as a string
34 |      */
35 |     String fetch(String url);
36 | }
37 |
--------------------------------------------------------------------------------
/html-extractor/src/main/resources/logback.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 | %m%n
7 |
8 |
9 |
10 | logs/logback.log
11 |
12 | logs/logback_%i.log
13 | 1
14 | 10000
15 |
16 |
17 | 5MB
18 |
19 |
20 | %m%n
21 |
22 |
23 |
24 |
25 |
26 |
27 |
28 |
--------------------------------------------------------------------------------
/html-extractor/src/main/java/org/apdplat/extractor/html/HtmlExtractor.java:
--------------------------------------------------------------------------------
1 | /**
2 | *
3 | * APDPlat - Application Product Development Platform
4 | * Copyright (c) 2013, 杨尚川, yang-shangchuan@qq.com
5 | *
6 | * This program is free software: you can redistribute it and/or modify
7 | * it under the terms of the GNU General Public License as published by
8 | * the Free Software Foundation, either version 3 of the License, or
9 | * (at your option) any later version.
10 | *
11 | * This program is distributed in the hope that it will be useful,
12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 | * GNU General Public License for more details.
15 | *
16 | * You should have received a copy of the GNU General Public License
17 | * along with this program. If not, see <http://www.gnu.org/licenses/>.
18 | *
19 | */
20 |
21 | package org.apdplat.extractor.html;
22 |
23 | import org.apdplat.extractor.html.model.ExtractResult;
24 | import java.util.List;
25 |
26 | /**
27 |  * Web page extraction tool.
28 |  * Extracts information from an HTML page based on URL patterns,
29 |  * page templates, CSS paths and extract functions.
30 |  *
31 |  * @author 杨尚川
32 |  */
33 | public interface HtmlExtractor {
34 |     /**
35 |      * Extracts information from a page.
36 |      *
37 |      * @param url  URL of the page
38 |      * @param html HTML content of the page
39 |      * @return the extraction results
40 |      */
41 |     public List<ExtractResult> extract(String url, String html);
42 | }
43 |
--------------------------------------------------------------------------------
/html-extractor-web/src/main/java/org/apdplat/extractor/html/server/redis/RedisClient.java:
--------------------------------------------------------------------------------
1 | /**
2 | *
3 | * APDPlat - Application Product Development Platform
4 | * Copyright (c) 2013, 杨尚川, yang-shangchuan@qq.com
5 | *
6 | * This program is free software: you can redistribute it and/or modify
7 | * it under the terms of the GNU General Public License as published by
8 | * the Free Software Foundation, either version 3 of the License, or
9 | * (at your option) any later version.
10 | *
11 | * This program is distributed in the hope that it will be useful,
12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 | * GNU General Public License for more details.
15 | *
16 | * You should have received a copy of the GNU General Public License
17 | * along with this program. If not, see <http://www.gnu.org/licenses/>.
18 | *
19 | */
20 |
21 | package org.apdplat.extractor.html.server.redis;
22 |
23 | import static org.apdplat.extractor.html.server.redis.RedisListener.jedisPool;
24 |
25 | import redis.clients.jedis.Jedis;
26 |
27 | /**
28 |  * Notifies slave nodes that the extraction rules have changed.
29 |  *
30 |  * @author 杨尚川
31 |  */
32 | public class RedisClient {
33 |     /**
34 |      * Publishes the message CHANGE to the Redis channel "pr".
35 |      * According to the original author's note, slave nodes re-initialize
36 |      * their extraction rules when they receive it.
37 |      */
38 |     public void extractRegularChange() {
39 |         String message = "CHANGE";
40 |         Jedis jedis = jedisPool.getResource();
41 |         try {
42 |             jedis.publish("pr", message);
43 |         } finally {
44 |             // Always hand the connection back, even when publish() throws;
45 |             // otherwise every failure leaks one connection from the pool.
46 |             jedisPool.returnResource(jedis);
47 |         }
48 |     }
49 | }
50 |
--------------------------------------------------------------------------------
/html-extractor/src/main/java/org/apdplat/extractor/html/model/ExtractResultItem.java:
--------------------------------------------------------------------------------
1 | /**
2 | *
3 | * APDPlat - Application Product Development Platform
4 | * Copyright (c) 2013, 杨尚川, yang-shangchuan@qq.com
5 | *
6 | * This program is free software: you can redistribute it and/or modify
7 | * it under the terms of the GNU General Public License as published by
8 | * the Free Software Foundation, either version 3 of the License, or
9 | * (at your option) any later version.
10 | *
11 | * This program is distributed in the hope that it will be useful,
12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 | * GNU General Public License for more details.
15 | *
16 | * You should have received a copy of the GNU General Public License
17 | * along with this program. If not, see <http://www.gnu.org/licenses/>.
18 | *
19 | */
20 |
21 | package org.apdplat.extractor.html.model;
22 |
23 | /**
24 | * 网页结构化信息抽取结果项
25 | *
26 | * @author 杨尚川
27 | *
28 | */
29 | public class ExtractResultItem {
30 | /**
31 | * 抽取结果项保存到那个字段
32 | */
33 | private String field;
34 | /**
35 | * 抽取结果项的值
36 | */
37 | private String value;
38 |
39 | public String getField() {
40 | return field;
41 | }
42 |
43 | public void setField(String field) {
44 | this.field = field;
45 | }
46 |
47 | public String getValue() {
48 | return value;
49 | }
50 |
51 | public void setValue(String value) {
52 | this.value = value;
53 | }
54 |
55 | @Override
56 | public String toString() {
57 | return "ExtractResultItem [\nfield=" + field + ", \nvalue=" + value + "]";
58 | }
59 | }
60 |
--------------------------------------------------------------------------------
/html-extractor-web/src/main/java/org/apdplat/extractor/html/server/model/ExtractResultItem.java:
--------------------------------------------------------------------------------
1 | /**
2 | *
3 | * APDPlat - Application Product Development Platform
4 | * Copyright (c) 2013, 杨尚川, yang-shangchuan@qq.com
5 | *
6 | * This program is free software: you can redistribute it and/or modify
7 | * it under the terms of the GNU General Public License as published by
8 | * the Free Software Foundation, either version 3 of the License, or
9 | * (at your option) any later version.
10 | *
11 | * This program is distributed in the hope that it will be useful,
12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 | * GNU General Public License for more details.
15 | *
16 | * You should have received a copy of the GNU General Public License
17 | * along with this program. If not, see <http://www.gnu.org/licenses/>.
18 | *
19 | */
20 |
21 | package org.apdplat.extractor.html.server.model;
22 |
23 | /**
24 | * 网页结构化信息抽取结果项
25 | *
26 | * @author 杨尚川
27 | *
28 | */
29 | public class ExtractResultItem {
30 | /**
31 | * 抽取结果项保存到那个字段
32 | */
33 | private String field;
34 | /**
35 | * 抽取结果项的值
36 | */
37 | private String value;
38 |
39 | public String getField() {
40 | return field;
41 | }
42 |
43 | public void setField(String field) {
44 | this.field = field;
45 | }
46 |
47 | public String getValue() {
48 | return value;
49 | }
50 |
51 | public void setValue(String value) {
52 | this.value = value;
53 | }
54 |
55 | @Override
56 | public String toString() {
57 | return "ExtractResultItem [\nfield=" + field + ", \nvalue=" + value + "]";
58 | }
59 | }
60 |
--------------------------------------------------------------------------------
/html-extractor/src/main/resources/voa/EverydayGrammarTV.txt:
--------------------------------------------------------------------------------
1 | http://learningenglish.voanews.com/a/3360403.html=Little vs A Little Few vs A Few
2 | http://learningenglish.voanews.com/a/3360402.html=Noncount Nouns
3 | http://learningenglish.voanews.com/a/3360400.html=Adverbs
4 | http://learningenglish.voanews.com/a/3360401.html=American English vs British English
5 | http://learningenglish.voanews.com/a/3255188.html=Should and Shall
6 | http://learningenglish.voanews.com/a/3255195.html=Present Progressive Tense
7 | http://learningenglish.voanews.com/a/3255190.html=Passive and Active Voice
8 | http://learningenglish.voanews.com/a/3255184.html=Causatives
9 | http://learningenglish.voanews.com/a/3255157.html=Present Unreal Conditionals
10 | http://learningenglish.voanews.com/a/3255156.html=Present And Future Real Conditionals
11 | http://learningenglish.voanews.com/a/3255168.html=Double Negatives
12 | http://learningenglish.voanews.com/a/3255174.html=Tag Questions
13 | http://learningenglish.voanews.com/a/3255152.html=Words That Are Coming And Going
14 | http://learningenglish.voanews.com/a/3255171.html=For and Since
15 | http://learningenglish.voanews.com/a/3137103.html=Pronouns and Gender
16 | http://learningenglish.voanews.com/a/3137100.html=Gerunds vs Infinitives
17 | http://learningenglish.voanews.com/a/3137104.html=Introducing Articles
18 | http://learningenglish.voanews.com/a/3137098.html=Understanding Fast Talkers
19 | http://learningenglish.voanews.com/a/3137093.html=Simple Past Present Perfect
20 | http://learningenglish.voanews.com/a/3137097.html=Modals for Asking Permission
21 | http://learningenglish.voanews.com/a/3137087.html=Irregular Plurals
22 | http://learningenglish.voanews.com/a/3137090.html=Onomatopoeia
23 | http://learningenglish.voanews.com/a/3137088.html=Pronouns I and Me
24 | http://learningenglish.voanews.com/a/3137091.html=Will vs Be Going to
--------------------------------------------------------------------------------
/pom.xml:
--------------------------------------------------------------------------------
1 |
2 | 4.0.0
3 |
4 | org.apdplat
5 | HtmlExtractor
6 | 1.1
7 | HtmlExtractor
8 | pom
9 |
10 | https://github.com/ysc/HtmlExtractor
11 | HtmlExtractor是一个Java实现的基于模板的网页结构化信息精准抽取组件。
12 |
13 |
14 | GNU GENERAL PUBLIC LICENSE, Version 3
15 | http://www.gnu.org/licenses/gpl.txt
16 |
17 |
18 |
19 | https://github.com/ysc/HtmlExtractor
20 | scm:git:git://github.com/ysc/HtmlExtractor.git
21 | scm:git:git://github.com/ysc/HtmlExtractor.git
22 | GITHUB HtmlExtractor
23 |
24 |
25 | https://github.com/ysc/HtmlExtractor/issues
26 | github.com
27 |
28 |
29 |
30 | 杨尚川
31 | ysc@apdplat.org
32 |
33 |
34 |
35 |
36 | 杨尚川
37 | ysc@apdplat.org
38 | http://yangshangchuan.iteye.com
39 |
40 |
41 |
42 | UTF-8
43 |
44 |
45 | html-extractor
46 | html-extractor-web
47 |
48 |
49 |
--------------------------------------------------------------------------------
/html-extractor/src/main/java/org/apdplat/extractor/html/impl/SeleniumHtmlFetcher.java:
--------------------------------------------------------------------------------
1 | /*
2 | * APDPlat - Application Product Development Platform
3 | * Copyright (c) 2013, 杨尚川, yang-shangchuan@qq.com
4 | *
5 | * This program is free software: you can redistribute it and/or modify
6 | * it under the terms of the GNU General Public License as published by
7 | * the Free Software Foundation, either version 3 of the License, or
8 | * (at your option) any later version.
9 | *
10 | * This program is distributed in the hope that it will be useful,
11 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 | * GNU General Public License for more details.
14 | *
15 | * You should have received a copy of the GNU General Public License
16 | * along with this program. If not, see <http://www.gnu.org/licenses/>.
17 | */
18 |
19 | package org.apdplat.extractor.html.impl;
20 |
21 | import org.apdplat.extractor.html.HtmlFetcher;
22 | import org.openqa.selenium.WebDriver;
23 | import org.openqa.selenium.firefox.FirefoxDriver;
24 | import org.slf4j.Logger;
25 | import org.slf4j.LoggerFactory;
26 |
27 | /**
28 |  * Fetches page content with Selenium, letting a browser execute the page's
29 |  * JavaScript and render it before the source is read.
30 |  *
31 |  * Requires geckodriver, which can be installed with:
32 |  * brew install geckodriver
33 |  *
34 |  * @author 杨尚川
35 |  */
36 | public class SeleniumHtmlFetcher implements HtmlFetcher {
37 |     private static final Logger LOGGER = LoggerFactory.getLogger(SeleniumHtmlFetcher.class);
38 |
39 |     // Firefox browser, created once at class load and shared by all instances
40 |     private static final WebDriver WEB_DRIVER = new FirefoxDriver();
41 |
42 |     /**
43 |      * Loads the URL in the shared Firefox driver and returns the page source.
44 |      *
45 |      * @param url address of the HTML page
46 |      * @return the page source, or an empty string when fetching fails (the error is logged)
47 |      */
48 |     @Override
49 |     public String fetch(String url) {
50 |         try {
51 |             LOGGER.debug("url:"+url);
52 |             WEB_DRIVER.get(url);
53 |             String html = WEB_DRIVER.getPageSource();
54 |             LOGGER.debug("html:"+html);
55 |             return html;
56 |         } catch (Exception e) {
57 |             LOGGER.error("获取URL:"+url+"页面出错", e);
58 |         }
59 |         return "";
60 |     }
61 |
62 |     public static void main(String[] args) {
63 |         try {
64 |             HtmlFetcher htmlFetcher = new SeleniumHtmlFetcher();
65 |             String html = htmlFetcher.fetch("http://apdplat.org");
66 |             System.out.println(html);
67 |         } finally {
68 |             // Quit the driver so the demo does not leave a Firefox process running after it ends.
69 |             WEB_DRIVER.quit();
70 |         }
71 |     }
72 | }
73 |
--------------------------------------------------------------------------------
/html-extractor/src/main/java/org/apdplat/extractor/html/impl/HtmlUnitHtmlFetcher.java:
--------------------------------------------------------------------------------
1 | /**
2 | *
3 | * APDPlat - Application Product Development Platform
4 | * Copyright (c) 2013, 杨尚川, yang-shangchuan@qq.com
5 | *
6 | * This program is free software: you can redistribute it and/or modify
7 | * it under the terms of the GNU General Public License as published by
8 | * the Free Software Foundation, either version 3 of the License, or
9 | * (at your option) any later version.
10 | *
11 | * This program is distributed in the hope that it will be useful,
12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 | * GNU General Public License for more details.
15 | *
16 | * You should have received a copy of the GNU General Public License
17 | * along with this program. If not, see <http://www.gnu.org/licenses/>.
18 | *
19 | */
20 |
21 | package org.apdplat.extractor.html.impl;
22 |
23 | import com.gargoylesoftware.htmlunit.BrowserVersion;
24 | import com.gargoylesoftware.htmlunit.WebClient;
25 | import com.gargoylesoftware.htmlunit.html.HtmlPage;
26 | import org.apdplat.extractor.html.HtmlFetcher;
27 | import org.slf4j.Logger;
28 | import org.slf4j.LoggerFactory;
29 |
30 | /**
31 |  * Fetches page content with HtmlUnit. HtmlUnit can execute JavaScript and
32 |  * render pages dynamically, but not every script can be rendered, so
33 |  * results need to be tested.
34 |  *
35 |  * @author 杨尚川
36 |  */
37 | public class HtmlUnitHtmlFetcher implements HtmlFetcher {
38 |     private static final Logger LOGGER = LoggerFactory.getLogger(HtmlUnitHtmlFetcher.class);
39 |
40 |     private static final WebClient WEB_CLIENT = new WebClient(BrowserVersion.INTERNET_EXPLORER_11);
41 |
42 |     /**
43 |      * Loads the page with the shared web client and returns its body as XML.
44 |      *
45 |      * @param url address of the HTML page
46 |      * @return XML of the page body, or an empty string when fetching fails (the error is logged)
47 |      */
48 |     @Override
49 |     public String fetch(String url) {
50 |         String result = "";
51 |         try {
52 |             LOGGER.debug("url:"+url);
53 |             HtmlPage page = WEB_CLIENT.getPage(url);
54 |             String bodyXml = page.getBody().asXml();
55 |             LOGGER.debug("html:"+bodyXml);
56 |             // Only publish the value once every step above has succeeded.
57 |             result = bodyXml;
58 |         } catch (Exception e) {
59 |             LOGGER.error("获取URL:"+url+"页面出错", e);
60 |         }
61 |         return result;
62 |     }
63 |
64 |     public static void main(String[] args) {
65 |         HtmlFetcher fetcher = new HtmlUnitHtmlFetcher();
66 |         System.out.println(fetcher.fetch("http://apdplat.org"));
67 |     }
68 | }
69 |
--------------------------------------------------------------------------------
/html-extractor-web/src/main/java/org/apdplat/extractor/html/server/redis/RedisListener.java:
--------------------------------------------------------------------------------
1 | /**
2 | *
3 | * APDPlat - Application Product Development Platform
4 | * Copyright (c) 2013, 杨尚川, yang-shangchuan@qq.com
5 | *
6 | * This program is free software: you can redistribute it and/or modify
7 | * it under the terms of the GNU General Public License as published by
8 | * the Free Software Foundation, either version 3 of the License, or
9 | * (at your option) any later version.
10 | *
11 | * This program is distributed in the hope that it will be useful,
12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 | * GNU General Public License for more details.
15 | *
16 | * You should have received a copy of the GNU General Public License
17 | * along with this program. If not, see <http://www.gnu.org/licenses/>.
18 | *
19 | */
20 |
21 | package org.apdplat.extractor.html.server.redis;
22 |
23 | import javax.servlet.ServletContext;
24 | import javax.servlet.ServletContextEvent;
25 | import javax.servlet.ServletContextListener;
26 | import org.slf4j.Logger;
27 | import org.slf4j.LoggerFactory;
28 | import redis.clients.jedis.JedisPool;
29 | import redis.clients.jedis.JedisPoolConfig;
30 |
31 | /**
32 |  * Redis listener: creates the shared JedisPool when the servlet context
33 |  * starts and destroys it when the context shuts down.
34 |  *
35 |  * @author 杨尚川
36 |  */
37 | public class RedisListener implements ServletContextListener {
38 |     private static final Logger LOGGER = LoggerFactory.getLogger(RedisListener.class);
39 |     /** Shared connection pool; not assigned when initialization fails. */
40 |     public static JedisPool jedisPool;
41 |
42 |     /**
43 |      * Builds the pool from the redis.host and redis.port context parameters.
44 |      * Any failure is logged and swallowed, leaving the pool unassigned.
45 |      */
46 |     @Override
47 |     public void contextInitialized(ServletContextEvent sce) {
48 |         ServletContext sc = sce.getServletContext();
49 |         String redisHost = sc.getInitParameter("redis.host");
50 |         String redisPort = sc.getInitParameter("redis.port");
51 |         LOGGER.info("redis.host: " + redisHost);
52 |         LOGGER.info("redis.port: " + redisPort);
53 |         LOGGER.info("开始初始化JedisPool");
54 |         try {
55 |             JedisPoolConfig poolConfig = new JedisPoolConfig();
56 |             jedisPool = new JedisPool(poolConfig, redisHost, Integer.parseInt(redisPort));
57 |             LOGGER.info("初始化JedisPool成功");
58 |         } catch (Exception e) {
59 |             LOGGER.error("初始化JedisPool失败", e);
60 |         }
61 |     }
62 |
63 |     /**
64 |      * Destroys the pool, if one was created.
65 |      */
66 |     @Override
67 |     public void contextDestroyed(ServletContextEvent sce) {
68 |         // Initialization failures are swallowed above, so the pool can still be null here.
69 |         if (jedisPool != null) {
70 |             jedisPool.destroy();
71 |             LOGGER.info("关闭JedisPool");
72 |         }
73 |     }
74 | }
75 |
--------------------------------------------------------------------------------
/html-extractor-web/src/main/resources/logback.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 | %m%n
7 |
8 |
9 |
10 | logs/logback.log
11 |
12 | logs/logback_%i.log
13 | 1
14 | 10000
15 |
16 |
17 | 5MB
18 |
19 |
20 | %m%n
21 |
22 |
23 |
24 |
25 |
26 |
27 |
28 |
29 |
30 |
31 |
32 |
33 |
34 |
35 |
36 |
37 |
38 |
39 |
40 |
41 |
42 |
43 |
44 |
45 |
46 |
--------------------------------------------------------------------------------
/html-extractor/src/main/resources/voa/Let'sLearnEnglish.txt:
--------------------------------------------------------------------------------
1 | http://learningenglish.voanews.com/a/lets-learn-english-lesson-19-when-do-you-start/3357760.html=Lesson 19 When Do I Start
2 | http://learningenglish.voanews.com/a/lets-learn-english-lesson-18-she-always-does-that/3357748.html=Lesson 18 She Always Does That
3 | http://learningenglish.voanews.com/a/are-you-free-on-friday-lets-learn-english/3355785.html=Lesson 17 Are You Free on Friday
4 | http://learningenglish.voanews.com/a/lets-learn-english-lesson-16-where-are-you-from/3355849.html=Lesson 16 Where Are You From
5 | http://learningenglish.voanews.com/a/lets-learn-english-lesson-15-i-love-people-watching/3343720.html=Lesson 15 I Love People-Watching
6 | http://learningenglish.voanews.com/a/lets-learn-english-review-lessons-10-14/3329289.html=Let's Learn English A Review of Lessons 10 -14
7 | http://learningenglish.voanews.com/a/lets-learn-english-lesson-14-how-about-this/3323771.html=Lesson 14 How About This
8 | http://learningenglish.voanews.com/a/lets-learn-english-lesson-13-happy-birthday-william-shakespeare/3312239.html=Lesson 13 Happy Birthday William Shakespeare
9 | http://learningenglish.voanews.com/a/lets-learn-english-lesson-12-meet-my-family/3301733.html=Lesson 12 Meet My Family
10 | http://learningenglish.voanews.com/a/lets-learn-english-lesson-11-this-is-my-neighborhood/3293986.html=Lesson 11 This Is My Neighborhood
11 | http://learningenglish.voanews.com/a/lets-learn-english-lesson-10/3285228.html=Lesson 10 Come Over to My Place
12 | http://learningenglish.voanews.com/a/lets-learn-english-review-1-9/3276044.html=Let's Learn English A Review of Lessons 1 - 9
13 | http://learningenglish.voanews.com/a/lets-learn-english-lesson-9-is-it-cold/3261789.html=Lesson 9 Is It Cold
14 | http://learningenglish.voanews.com/a/lets-learn-english-lesson-8-are-you-busy/3253185.html=Lesson 8 Are You Busy
15 | http://learningenglish.voanews.com/a/lets-learn-english-lesson-7-what-are-you-doing/3240468.html=Lesson 7 What Are You Doing
16 | http://learningenglish.voanews.com/a/lets-learn-english-lesson-6-where-is-the-gym/3225958.html=Lesson 6 Where Is the Gym
17 | http://learningenglish.voanews.com/a/lets-learn-english-lesson-5-where-are-you/3168971.html=Lesson 5 Where Are You
18 | http://learningenglish.voanews.com/a/lets-learn-english-lesson-4/3168920.html=Lesson 4 What Is It
19 | http://learningenglish.voanews.com/a/lets-learn-english-lesson-3-i-am-here/3126527.html=Lesson 3 I'm Here
20 | http://learningenglish.voanews.com/a/lets-learn-english-lesson-2-hello/3113733.html=Lesson 2 Hello I'm Anna
21 | http://learningenglish.voanews.com/a/lets-learn-english-lesson-one/3111026.html=Lesson 1 Welcome
--------------------------------------------------------------------------------
/html-extractor/src/main/java/org/apdplat/extractor/html/model/ExtractFunction.java:
--------------------------------------------------------------------------------
1 | /**
2 | *
3 | * APDPlat - Application Product Development Platform
4 | * Copyright (c) 2013, 杨尚川, yang-shangchuan@qq.com
5 | *
6 | * This program is free software: you can redistribute it and/or modify
7 | * it under the terms of the GNU General Public License as published by
8 | * the Free Software Foundation, either version 3 of the License, or
9 | * (at your option) any later version.
10 | *
11 | * This program is distributed in the hope that it will be useful,
12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 | * GNU General Public License for more details.
15 | *
16 | * You should have received a copy of the GNU General Public License
17 | * along with this program. If not, see <http://www.gnu.org/licenses/>.
18 | *
19 | */
20 |
21 | package org.apdplat.extractor.html.model;
22 |
23 | /**
24 |  * Extract function.
25 |  * An extract function is a second-level element of a page template and
26 |  * gives precise control over which content is extracted.
27 |  *
28 |  * @author 杨尚川
29 |  */
30 | public class ExtractFunction {
31 |     /** CSS path this extract function belongs to. */
32 |     private CssPath cssPath;
33 |     /** Extract expression (only functions built into the system may be used). */
34 |     private String extractExpression;
35 |     /** Field in which the text produced by this function is stored. */
36 |     private String fieldName;
37 |     /** Chinese-language meaning of the field; it only serves as a comment to aid understanding. */
38 |     private String fieldDescription;
39 |
40 |     public CssPath getCssPath() {
41 |         return this.cssPath;
42 |     }
43 |
44 |     public void setCssPath(CssPath cssPath) {
45 |         this.cssPath = cssPath;
46 |     }
47 |
48 |     public String getExtractExpression() {
49 |         return this.extractExpression;
50 |     }
51 |
52 |     public void setExtractExpression(String extractExpression) {
53 |         this.extractExpression = extractExpression;
54 |     }
55 |
56 |     public String getFieldName() {
57 |         return this.fieldName;
58 |     }
59 |
60 |     public void setFieldName(String fieldName) {
61 |         this.fieldName = fieldName;
62 |     }
63 |
64 |     public String getFieldDescription() {
65 |         return this.fieldDescription;
66 |     }
67 |
68 |     public void setFieldDescription(String fieldDescription) {
69 |         this.fieldDescription = fieldDescription;
70 |     }
71 |
72 |     /**
73 |      * Lists the expression, field name and field description, one per line.
74 |      * The CSS path is not included.
75 |      */
76 |     @Override
77 |     public String toString() {
78 |         return this.extractExpression + "\n"
79 |                 + this.fieldName + "\n"
80 |                 + this.fieldDescription + "\n";
81 |     }
82 | }
83 |
--------------------------------------------------------------------------------
/html-extractor-web/src/main/java/org/apdplat/extractor/html/server/model/ExtractFunction.java:
--------------------------------------------------------------------------------
1 | /**
2 | *
3 | * APDPlat - Application Product Development Platform
4 | * Copyright (c) 2013, 杨尚川, yang-shangchuan@qq.com
5 | *
6 | * This program is free software: you can redistribute it and/or modify
7 | * it under the terms of the GNU General Public License as published by
8 | * the Free Software Foundation, either version 3 of the License, or
9 | * (at your option) any later version.
10 | *
11 | * This program is distributed in the hope that it will be useful,
12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 | * GNU General Public License for more details.
15 | *
16 | * You should have received a copy of the GNU General Public License
17 | * along with this program. If not, see <http://www.gnu.org/licenses/>.
18 | *
19 | */
20 |
21 | package org.apdplat.extractor.html.server.model;
22 |
23 | import org.codehaus.jackson.annotate.JsonIgnore;
24 |
25 | /**
26 | * 抽取函数
27 | * 抽取函数是页面模板的二级元素
28 | * 可以精准地控制抽取的内容
29 | *
30 | * @author 杨尚川
31 | *
32 | */
33 | public class ExtractFunction {
34 | /**
35 | * 抽取函数对应的CSS路径
36 | */
37 | @JsonIgnore
38 | private CssPath cssPath;
39 | /**
40 | * 抽取函数(只能使用系统内置支持的函数)
41 | */
42 | private String extractExpression;
43 | /**
44 | * 抽取函数提取出的文本存储到哪个字段
45 | */
46 | private String fieldName;
47 | /**
48 | * 抽取函数提取出的字段的中文含义,仅仅起注释作用,利于理解
49 | */
50 | private String fieldDescription;
51 |
52 | public CssPath getCssPath() {
53 | return cssPath;
54 | }
55 |
56 | public void setCssPath(CssPath cssPath) {
57 | this.cssPath = cssPath;
58 | }
59 |
60 | public String getExtractExpression() {
61 | return extractExpression;
62 | }
63 |
64 | public void setExtractExpression(String extractExpression) {
65 | this.extractExpression = extractExpression;
66 | }
67 |
68 | public String getFieldName() {
69 | return fieldName;
70 | }
71 |
72 | public void setFieldName(String fieldName) {
73 | this.fieldName = fieldName;
74 | }
75 |
76 | public String getFieldDescription() {
77 | return fieldDescription;
78 | }
79 |
80 | public void setFieldDescription(String fieldDescription) {
81 | this.fieldDescription = fieldDescription;
82 | }
83 |
84 | @Override
85 | public String toString() {
86 | StringBuilder str = new StringBuilder();
87 | str.append(this.extractExpression).append("\n");
88 | str.append(this.fieldName).append("\n");
89 | str.append(this.fieldDescription).append("\n");
90 | return str.toString();
91 | }
92 | }
93 |
--------------------------------------------------------------------------------
/html-extractor/src/main/resources/voa/America'sNationalParks.txt:
--------------------------------------------------------------------------------
1 | http://learningenglish.voanews.com/a/acadia-national-park-americas-national-parks-100/3391329.html=Acadia An East Coast Treasure
2 | http://learningenglish.voanews.com/a/whats-trending-today-obama-names-stonewall-inn-national-monument/3391152.html=Stonewall Inn Named LGBT National Monument
3 | http://learningenglish.voanews.com/a/young-traveler-hopes-to-visit-every-national-park-in-america-/3378000.html=Young Traveler Hopes to Visit Every National Park
4 | http://learningenglish.voanews.com/a/americas-national-parks-wrangell-st-elias-alaska/3381159.html=The Untouched Beauty of Wrangell-St. Elias National Park
5 | http://learningenglish.voanews.com/a/americas-national-parks-nps-100-yosemite-national-park/3370560.html=Yosemite A Park of Extremes
6 | http://learningenglish.voanews.com/a/everglades-national-park-liquid-heart-of-florida/3360425.html=Everglades National Park The Liquid Heart of Florida
7 | http://learningenglish.voanews.com/a/americas-national-parks-centennial-gettysburg-national-military-park/3348968.html=A Visit with History Gettysburg National Military Park
8 | http://learningenglish.voanews.com/a/americas-national-parks-carlsbad-caverns-national-park-new-mexico/3338983.html=An Underground World Carlsbad Caverns National Park
9 | http://learningenglish.voanews.com/a/americas-national-parks-great-smoky-mountains/3329159.html=Great Smoky Mountains Americas Most Popular National Park
10 | http://learningenglish.voanews.com/a/americans-national-parks-100-death-valley-national-park/3318946.html=Life in Death Valley National Park
11 | http://learningenglish.voanews.com/a/americas-national-parks-new-orleans-jazz-national-historical-park/3308628.html=National Park in New Orleans Celebrates Jazz
12 | http://learningenglish.voanews.com/a/americas-national-parks-mount-rainier-national-park-washington/3297148.html=The Glacial World of Mount Rainier
13 | http://learningenglish.voanews.com/a/americas-national-parks-mesa-verde-colorado/3287589.html=Mesa Verde National Park Protecting an Ancient Culture
14 | http://learningenglish.voanews.com/a/national-parks-week-free-entry/3285897.html=National Park Week Features Free Park Entry
15 | http://learningenglish.voanews.com/a/history-and-nature-at-dry-tortugas-national-park/3275242.html=History and Nature at Dry Tortugas National Park
16 | http://learningenglish.voanews.com/a/3261802.html=Hawaii Volcanoes National Park A Fiery World
17 | http://learningenglish.voanews.com/a/cherry-blossoms-signal-the-start-of-spring/3251616.html=Washingtons Cherry Blossoms Signal the Start of Spring
18 | http://learningenglish.voanews.com/a/national-parks-100-series-grand-canyon-national-park/3239133.html=The Grand Canyon Beyond Words
19 | http://learningenglish.voanews.com/a/3223506.html=US Park Honors Womens Rights
--------------------------------------------------------------------------------
/html-extractor/src/main/java/org/apdplat/extractor/html/model/UrlPattern.java:
--------------------------------------------------------------------------------
1 | /**
2 | *
3 | * APDPlat - Application Product Development Platform
4 | * Copyright (c) 2013, 杨尚川, yang-shangchuan@qq.com
5 | *
6 | * This program is free software: you can redistribute it and/or modify
7 | * it under the terms of the GNU General Public License as published by
8 | * the Free Software Foundation, either version 3 of the License, or
9 | * (at your option) any later version.
10 | *
11 | * This program is distributed in the hope that it will be useful,
12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 | * GNU General Public License for more details.
15 | *
16 | * You should have received a copy of the GNU General Public License
17 | * along with this program. If not, see <http://www.gnu.org/licenses/>.
18 | *
19 | */
20 |
21 | package org.apdplat.extractor.html.model;
22 |
23 | import java.util.ArrayList;
24 | import java.util.List;
25 | import java.util.regex.Pattern;
26 |
27 | import org.slf4j.Logger;
28 | import org.slf4j.LoggerFactory;
29 |
30 | /**
31 | * URL模式(使用正则表达式实现)
32 | * 用正则表达式的方式来指定一组有共同页面布局的网页
33 | * 这样就可以对这组页面指定一套模板来抽取信息
34 | *
35 | * @author 杨尚川
36 | *
37 | */
38 | public class UrlPattern {
39 | private static final Logger LOGGER = LoggerFactory.getLogger(UrlPattern.class);
40 | /**
41 | * URL模式(使用正则表达式实现)
42 | */
43 | private String urlPattern;
44 | /**
45 | * URL模式(编译好的正则表达式)
46 | */
47 | private Pattern regexPattern;
48 | /**
49 | * 多个网页模板
50 | */
51 | private List htmlTemplates = new ArrayList<>();
52 |
53 | public String getUrlPattern() {
54 | return urlPattern;
55 | }
56 |
57 | public void setUrlPattern(String urlPattern) {
58 | this.urlPattern = urlPattern;
59 | try {
60 | regexPattern = Pattern.compile(urlPattern, Pattern.CASE_INSENSITIVE);
61 | } catch (Exception e) {
62 | LOGGER.error("编译正则表达式["+urlPattern+"]失败:", e);
63 | }
64 | }
65 |
66 | public Pattern getRegexPattern() {
67 | return regexPattern;
68 | }
69 |
70 | public List getHtmlTemplates() {
71 | return htmlTemplates;
72 | }
73 |
74 | public void setHtmlTemplates(List htmlTemplates) {
75 | this.htmlTemplates = htmlTemplates;
76 | for (HtmlTemplate htmlTemplate : this.htmlTemplates) {
77 | htmlTemplate.setUrlPattern(this);
78 | }
79 | }
80 |
81 | public boolean hasHtmlTemplate() {
82 | return !htmlTemplates.isEmpty();
83 | }
84 |
85 | public void addHtmlTemplate(HtmlTemplate htmlTemplate) {
86 | htmlTemplates.add(htmlTemplate);
87 | htmlTemplate.setUrlPattern(this);
88 | }
89 | }
90 |
--------------------------------------------------------------------------------
/html-extractor-web/src/main/java/org/apdplat/extractor/html/server/model/UrlPattern.java:
--------------------------------------------------------------------------------
1 | /**
2 | *
3 | * APDPlat - Application Product Development Platform
4 | * Copyright (c) 2013, 杨尚川, yang-shangchuan@qq.com
5 | *
6 | * This program is free software: you can redistribute it and/or modify
7 | * it under the terms of the GNU General Public License as published by
8 | * the Free Software Foundation, either version 3 of the License, or
9 | * (at your option) any later version.
10 | *
11 | * This program is distributed in the hope that it will be useful,
12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 | * GNU General Public License for more details.
15 | *
16 | * You should have received a copy of the GNU General Public License
17 | * along with this program. If not, see <http://www.gnu.org/licenses/>.
18 | *
19 | */
20 |
21 | package org.apdplat.extractor.html.server.model;
22 |
23 | import java.util.ArrayList;
24 | import java.util.List;
25 | import java.util.regex.Pattern;
26 |
27 | import org.slf4j.Logger;
28 | import org.slf4j.LoggerFactory;
29 |
30 | /**
31 | * URL模式(使用正则表达式实现)
32 | * 用正则表达式的方式来指定一组有共同页面布局的网页
33 | * 这样就可以对这组页面指定一套模板来抽取信息
34 | *
35 | * @author 杨尚川
36 | *
37 | */
38 | public class UrlPattern {
39 | private static final Logger LOGGER = LoggerFactory.getLogger(UrlPattern.class);
40 | /**
41 | * URL模式(使用正则表达式实现)
42 | */
43 | private String urlPattern;
44 | /**
45 | * URL模式(编译好的正则表达式)
46 | */
47 | private Pattern regexPattern;
48 | /**
49 | * 多个网页模板
50 | */
51 | private List htmlTemplates = new ArrayList<>();
52 |
53 | public String getUrlPattern() {
54 | return urlPattern;
55 | }
56 |
57 | public void setUrlPattern(String urlPattern) {
58 | this.urlPattern = urlPattern;
59 | try {
60 | regexPattern = Pattern.compile(urlPattern, Pattern.CASE_INSENSITIVE);
61 | } catch (Exception e) {
62 | LOGGER.error("编译正则表达式["+urlPattern+"]失败:", e);
63 | }
64 | }
65 |
66 | public Pattern getRegexPattern() {
67 | return regexPattern;
68 | }
69 |
70 | public List getHtmlTemplates() {
71 | return htmlTemplates;
72 | }
73 |
74 | public void setHtmlTemplates(List htmlTemplates) {
75 | this.htmlTemplates = htmlTemplates;
76 | for (HtmlTemplate htmlTemplate : this.htmlTemplates) {
77 | htmlTemplate.setUrlPattern(this);
78 | }
79 | }
80 |
81 | public boolean hasHtmlTemplate() {
82 | return !htmlTemplates.isEmpty();
83 | }
84 |
85 | public void addHtmlTemplate(HtmlTemplate htmlTemplate) {
86 | htmlTemplates.add(htmlTemplate);
87 | htmlTemplate.setUrlPattern(this);
88 | }
89 | }
90 |
--------------------------------------------------------------------------------
/html-extractor/src/main/java/org/apdplat/extractor/html/demo/Toutiao.java:
--------------------------------------------------------------------------------
1 | /*
2 | * APDPlat - Application Product Development Platform
3 | * Copyright (c) 2013, 杨尚川, yang-shangchuan@qq.com
4 | *
5 | * This program is free software: you can redistribute it and/or modify
6 | * it under the terms of the GNU General Public License as published by
7 | * the Free Software Foundation, either version 3 of the License, or
8 | * (at your option) any later version.
9 | *
10 | * This program is distributed in the hope that it will be useful,
11 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 | * GNU General Public License for more details.
14 | *
15 | * You should have received a copy of the GNU General Public License
16 | * along with this program. If not, see <http://www.gnu.org/licenses/>.
17 | */
18 |
19 | package org.apdplat.extractor.html.demo;
20 |
21 | import org.openqa.selenium.By;
22 | import org.openqa.selenium.WebDriver;
23 | import org.openqa.selenium.WebElement;
24 | import org.openqa.selenium.firefox.FirefoxDriver;
25 |
26 | import java.util.List;
27 | import java.util.Random;
28 |
29 | /**
30 | * 如何抓取Js动态生成数据且以滚动页面方式分页的网页
31 | * 以抓取今日头条为例说明:http://toutiao.com/
32 | * Created by ysc on 10/13/15.
33 | */
34 | public class Toutiao {
35 | public static void main(String[] args) throws Exception{
36 |
37 | //等待数据加载的时间
38 | //为了防止服务器封锁,这里的时间要模拟人的行为,随机且不能太短
39 | long waitLoadBaseTime = 3000;
40 | int waitLoadRandomTime = 3000;
41 | Random random = new Random(System.currentTimeMillis());
42 |
43 | //火狐浏览器
44 | WebDriver driver = new FirefoxDriver();
45 | //要抓取的网页
46 | driver.get("http://toutiao.com/");
47 |
48 | //等待页面动态加载完毕
49 | Thread.sleep(waitLoadBaseTime+random.nextInt(waitLoadRandomTime));
50 |
51 | //要加载多少页数据
52 | int pages=5;
53 | for(int i=0; i elements = driver.findElements(By.className("title"));
63 | int j=1;
64 | for(int i=0;i.
18 | *
19 | */
20 |
21 | package org.apdplat.extractor.html.impl;
22 |
23 | import org.apdplat.extractor.html.HtmlFetcher;
24 | import org.jsoup.Connection;
25 | import org.jsoup.Jsoup;
26 | import org.slf4j.Logger;
27 | import org.slf4j.LoggerFactory;
28 |
29 | import java.net.URL;
30 | /**
31 | *
32 | * 使用JSoup获取网页内容
33 | * @author 杨尚川
34 | */
35 | public class JSoupHtmlFetcher implements HtmlFetcher {
36 | private static final Logger LOGGER = LoggerFactory.getLogger(JSoupHtmlFetcher.class);
37 |
38 | private static final String ACCEPT = "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8";
39 | private static final String ENCODING = "gzip, deflate";
40 | private static final String LANGUAGE = "zh-cn,zh;q=0.8,en-us;q=0.5,en;q=0.3";
41 | private static final String CONNECTION = "keep-alive";
42 | private static final String USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.10; rv:36.0) Gecko/20100101 Firefox/36.0";
43 |
44 | @Override
45 | public String fetch(String url) {
46 | try {
47 | LOGGER.debug("url:"+url);
48 | String host = new URL(url).getHost();
49 | Connection conn = Jsoup.connect(url)
50 | .timeout(60000)
51 | .header("Accept", ACCEPT)
52 | .header("Accept-Encoding", ENCODING)
53 | .header("Accept-Language", LANGUAGE)
54 | .header("Connection", CONNECTION)
55 | .header("Referer", "http://"+host)
56 | .header("Host", host)
57 | .header("User-Agent", USER_AGENT)
58 | .ignoreContentType(true);
59 | String html = conn.get().html();
60 | LOGGER.debug("html:"+html);
61 | return html;
62 | }catch (Exception e){
63 | LOGGER.error("获取URL:"+url+"页面出错", e);
64 | }
65 | return "";
66 | }
67 |
68 | public static void main(String[] args) {
69 | HtmlFetcher htmlFetcher = new JSoupHtmlFetcher();
70 | String html = htmlFetcher.fetch("http://apdplat.org");
71 | System.out.println(html);
72 | }
73 | }
74 |
--------------------------------------------------------------------------------
/html-extractor/src/main/java/org/apdplat/extractor/html/model/HtmlTemplate.java:
--------------------------------------------------------------------------------
1 | /**
2 | *
3 | * APDPlat - Application Product Development Platform
4 | * Copyright (c) 2013, 杨尚川, yang-shangchuan@qq.com
5 | *
6 | * This program is free software: you can redistribute it and/or modify
7 | * it under the terms of the GNU General Public License as published by
8 | * the Free Software Foundation, either version 3 of the License, or
9 | * (at your option) any later version.
10 | *
11 | * This program is distributed in the hope that it will be useful,
12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 | * GNU General Public License for more details.
15 | *
16 | * You should have received a copy of the GNU General Public License
17 | * along with this program. If not, see <http://www.gnu.org/licenses/>.
18 | *
19 | */
20 |
21 | package org.apdplat.extractor.html.model;
22 |
23 | import java.util.ArrayList;
24 | import java.util.List;
25 |
26 | /**
27 | * 网页模板
28 | * 一个URL模式会有一到多个网页模板
29 | * 一套网页模板指定了如何精准地抽取网页信息
30 | *
31 | * @author 杨尚川
32 | *
33 | */
34 | public class HtmlTemplate {
35 | /**
36 | * 网页模板名称,仅仅注释作用
37 | */
38 | private String templateName;
39 | /**
40 | * 网页提取出的文本存储到哪个表
41 | */
42 | private String tableName;
43 | /**
44 | * URL模式
45 | */
46 | private UrlPattern urlPattern;
47 | /**
48 | * 多个CSS路径
49 | */
50 | private List cssPaths = new ArrayList<>();
51 |
52 | public String getTemplateName() {
53 | return templateName;
54 | }
55 |
56 | public void setTemplateName(String templateName) {
57 | this.templateName = templateName;
58 | }
59 |
60 | public String getTableName() {
61 | return tableName;
62 | }
63 |
64 | public void setTableName(String tableName) {
65 | this.tableName = tableName;
66 | }
67 |
68 | public UrlPattern getUrlPattern() {
69 | return urlPattern;
70 | }
71 |
72 | public void setUrlPattern(UrlPattern urlPattern) {
73 | this.urlPattern = urlPattern;
74 | }
75 |
76 | public List getCssPaths() {
77 | return cssPaths;
78 | }
79 |
80 | public void setCssPaths(List cssPaths) {
81 | this.cssPaths = cssPaths;
82 | for (CssPath cssPath : this.cssPaths) {
83 | cssPath.setPageTemplate(this);
84 | }
85 | }
86 |
87 | public boolean hasCssPath() {
88 | return !cssPaths.isEmpty();
89 | }
90 |
91 | public void addCssPath(CssPath cssPath) {
92 | cssPaths.add(cssPath);
93 | cssPath.setPageTemplate(this);
94 | }
95 |
96 | @Override
97 | public String toString() {
98 | StringBuilder str = new StringBuilder();
99 | str.append("网页模板:").append(this.templateName).append(",存储表:").append(this.tableName).append("\n\n");
100 | int i = 1;
101 | for (CssPath cssPath : cssPaths) {
102 | str.append(i++).append("、").append(cssPath.toString()).append("\n");
103 | }
104 | return str.toString();
105 | }
106 | }
107 |
--------------------------------------------------------------------------------
/html-extractor-web/src/main/java/org/apdplat/extractor/html/server/model/HtmlTemplate.java:
--------------------------------------------------------------------------------
1 | /**
2 | *
3 | * APDPlat - Application Product Development Platform
4 | * Copyright (c) 2013, 杨尚川, yang-shangchuan@qq.com
5 | *
6 | * This program is free software: you can redistribute it and/or modify
7 | * it under the terms of the GNU General Public License as published by
8 | * the Free Software Foundation, either version 3 of the License, or
9 | * (at your option) any later version.
10 | *
11 | * This program is distributed in the hope that it will be useful,
12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 | * GNU General Public License for more details.
15 | *
16 | * You should have received a copy of the GNU General Public License
17 | * along with this program. If not, see <http://www.gnu.org/licenses/>.
18 | *
19 | */
20 |
21 | package org.apdplat.extractor.html.server.model;
22 |
23 | import java.util.ArrayList;
24 | import java.util.List;
25 | import org.codehaus.jackson.annotate.JsonIgnore;
26 |
27 | /**
28 | * 网页模板
29 | * 一个URL模式会有一到多个网页模板
30 | * 一套网页模板指定了如何精准地抽取网页信息
31 | *
32 | * @author 杨尚川
33 | *
34 | */
35 | public class HtmlTemplate {
36 | /**
37 | * 网页模板名称,仅仅注释作用
38 | */
39 | private String templateName;
40 | /**
41 | * 网页提取出的文本存储到哪个表
42 | */
43 | private String tableName;
44 | /**
45 | * URL模式
46 | */
47 | @JsonIgnore
48 | private UrlPattern urlPattern;
49 | /**
50 | * 多个CSS路径
51 | */
52 | private List cssPaths = new ArrayList<>();
53 |
54 | public String getTemplateName() {
55 | return templateName;
56 | }
57 |
58 | public void setTemplateName(String templateName) {
59 | this.templateName = templateName;
60 | }
61 |
62 | public String getTableName() {
63 | return tableName;
64 | }
65 |
66 | public void setTableName(String tableName) {
67 | this.tableName = tableName;
68 | }
69 |
70 | public UrlPattern getUrlPattern() {
71 | return urlPattern;
72 | }
73 |
74 | public void setUrlPattern(UrlPattern urlPattern) {
75 | this.urlPattern = urlPattern;
76 | }
77 |
78 | public List getCssPaths() {
79 | return cssPaths;
80 | }
81 |
82 | public void setCssPaths(List cssPaths) {
83 | this.cssPaths = cssPaths;
84 | for (CssPath cssPath : this.cssPaths) {
85 | cssPath.setPageTemplate(this);
86 | }
87 | }
88 |
89 | public boolean hasCssPath() {
90 | return !cssPaths.isEmpty();
91 | }
92 |
93 | public void addCssPath(CssPath cssPath) {
94 | cssPaths.add(cssPath);
95 | cssPath.setPageTemplate(this);
96 | }
97 |
98 | @Override
99 | public String toString() {
100 | StringBuilder str = new StringBuilder();
101 | str.append("网页模板:").append(this.templateName).append(",存储表:").append(this.tableName).append("\n\n");
102 | int i = 1;
103 | for (CssPath cssPath : cssPaths) {
104 | str.append(i++).append("、").append(cssPath.toString()).append("\n");
105 | }
106 | return str.toString();
107 | }
108 | }
109 |
--------------------------------------------------------------------------------
/html-extractor-web/src/main/java/org/apdplat/extractor/html/server/model/ExtractResult.java:
--------------------------------------------------------------------------------
1 | /**
2 | *
3 | * APDPlat - Application Product Development Platform
4 | * Copyright (c) 2013, 杨尚川, yang-shangchuan@qq.com
5 | *
6 | * This program is free software: you can redistribute it and/or modify
7 | * it under the terms of the GNU General Public License as published by
8 | * the Free Software Foundation, either version 3 of the License, or
9 | * (at your option) any later version.
10 | *
11 | * This program is distributed in the hope that it will be useful,
12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 | * GNU General Public License for more details.
15 | *
16 | * You should have received a copy of the GNU General Public License
17 | * along with this program. If not, see <http://www.gnu.org/licenses/>.
18 | *
19 | */
20 |
21 | package org.apdplat.extractor.html.server.model;
22 |
23 | import java.util.ArrayList;
24 | import java.util.List;
25 |
26 | /**
27 | * 网页结构化信息抽取结果
28 | * 一个网页模板对应一个抽取结果
29 | * 如果一个网页有多个网页模板
30 | * 每个模板都抽取成功
31 | * 只要这些模板保存在不同的表中
32 | * URL作为主键就不会冲突
33 | *
34 | * @author 杨尚川
35 | *
36 | */
37 | public class ExtractResult {
38 | /**
39 | * 网页对应的URL
40 | */
41 | private String url;
42 | /**
43 | * 网页原始内容
44 | */
45 | private byte[] content;
46 | /**
47 | * 网页编码
48 | */
49 | private String encoding;
50 | /**
51 | * 网页关键词元数据
52 | */
53 | private String keywords;
54 | /**
55 | * 网页描述元数据
56 | */
57 | private String description;
58 | /**
59 | * 网页提取出的文本存储到哪个表
60 | */
61 | private String tableName;
62 | /**
63 | * 一个网页可能有多个抽取结果项,至少要一个
64 | */
65 | private final List extractResultItems = new ArrayList<>();
66 | /**
67 | * 抽取失败日志
68 | */
69 | private final List extractFailLogs = new ArrayList<>();
70 |
71 | public boolean isSuccess() {
72 | return extractFailLogs.isEmpty() && !extractResultItems.isEmpty();
73 | }
74 |
75 | public String getUrl() {
76 | return url;
77 | }
78 |
79 | public void setUrl(String url) {
80 | this.url = url;
81 | }
82 |
83 | public byte[] getContent() {
84 | return content;
85 | }
86 |
87 | public void setContent(byte[] content) {
88 | this.content = content;
89 | }
90 |
91 | public String getEncoding() {
92 | return encoding;
93 | }
94 |
95 | public void setEncoding(String encoding) {
96 | this.encoding = encoding;
97 | }
98 |
99 | public String getKeywords() {
100 | return keywords;
101 | }
102 |
103 | public void setKeywords(String keywords) {
104 | this.keywords = keywords;
105 | }
106 |
107 | public String getDescription() {
108 | return description;
109 | }
110 |
111 | public void setDescription(String description) {
112 | this.description = description;
113 | }
114 |
115 | public String getTableName() {
116 | return tableName;
117 | }
118 |
119 | public void setTableName(String tableName) {
120 | this.tableName = tableName;
121 | }
122 |
123 | public List getExtractResultItems() {
124 | return extractResultItems;
125 | }
126 |
127 | public void addExtractResultItem(ExtractResultItem extractResultItem) {
128 | this.extractResultItems.add(extractResultItem);
129 | }
130 |
131 | public List getExtractFailLogs() {
132 | return extractFailLogs;
133 | }
134 |
135 | public void addExtractFailLog(ExtractFailLog extractFailLog) {
136 | this.extractFailLogs.add(extractFailLog);
137 | extractFailLog.setExtractResult(this);
138 | }
139 |
140 | @Override
141 | public String toString() {
142 | return "ExtractResult [\nurl=" + url + ", \ntableName=" + tableName
143 | + ", \nextractResultItems=" + extractResultItems + ", \nextractFailLogs=" + extractFailLogs + "]";
144 | }
145 | }
146 |
--------------------------------------------------------------------------------
/html-extractor/src/main/java/org/apdplat/extractor/html/model/ExtractFailLog.java:
--------------------------------------------------------------------------------
1 | /**
2 | *
3 | * APDPlat - Application Product Development Platform
4 | * Copyright (c) 2013, 杨尚川, yang-shangchuan@qq.com
5 | *
6 | * This program is free software: you can redistribute it and/or modify
7 | * it under the terms of the GNU General Public License as published by
8 | * the Free Software Foundation, either version 3 of the License, or
9 | * (at your option) any later version.
10 | *
11 | * This program is distributed in the hope that it will be useful,
12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 | * GNU General Public License for more details.
15 | *
16 | * You should have received a copy of the GNU General Public License
17 | * along with this program. If not, see <http://www.gnu.org/licenses/>.
18 | *
19 | */
20 |
21 | package org.apdplat.extractor.html.model;
22 |
23 | /**
24 | * 网页结构化信息抽取失败日志
25 | *
26 | * @author 杨尚川
27 | *
28 | */
29 | public class ExtractFailLog {
30 | /**
31 | * 网页结构化信息抽取结果
32 | */
33 | private ExtractResult extractResult;
34 | /**
35 | * 网页的URL
36 | */
37 | private String url;
38 | /**
39 | * 网页的URL模式
40 | */
41 | private String urlPattern;
42 | /**
43 | * 网页模板
44 | */
45 | private String templateName;
46 | /**
47 | * CSS路径
48 | */
49 | private String cssPath;
50 | /**
51 | * CSS路径下的抽取函数
52 | */
53 | private String extractExpression;
54 | /**
55 | * 抽取出的内容保存到的表的名称
56 | */
57 | private String tableName;
58 | /**
59 | * 抽取出的内容保存到的字段名称
60 | */
61 | private String fieldName;
62 | /**
63 | * 抽取出的内容保存到的字段描述,仅作注释使用
64 | */
65 | private String fieldDescription;
66 |
67 | public ExtractResult getExtractResult() {
68 | return extractResult;
69 | }
70 |
71 | public void setExtractResult(ExtractResult extractResult) {
72 | this.extractResult = extractResult;
73 | }
74 |
75 | public String getUrl() {
76 | return url;
77 | }
78 |
79 | public void setUrl(String url) {
80 | this.url = url;
81 | }
82 |
83 | public String getUrlPattern() {
84 | return urlPattern;
85 | }
86 |
87 | public void setUrlPattern(String urlPattern) {
88 | this.urlPattern = urlPattern;
89 | }
90 |
91 | public String getTemplateName() {
92 | return templateName;
93 | }
94 |
95 | public void setTemplateName(String templateName) {
96 | this.templateName = templateName;
97 | }
98 |
99 | public String getCssPath() {
100 | return cssPath;
101 | }
102 |
103 | public void setCssPath(String cssPath) {
104 | this.cssPath = cssPath;
105 | }
106 |
107 | public String getExtractExpression() {
108 | return extractExpression;
109 | }
110 |
111 | public void setExtractExpression(String extractExpression) {
112 | this.extractExpression = extractExpression;
113 | }
114 |
115 | public String getTableName() {
116 | return tableName;
117 | }
118 |
119 | public void setTableName(String tableName) {
120 | this.tableName = tableName;
121 | }
122 |
123 | public String getFieldName() {
124 | return fieldName;
125 | }
126 |
127 | public void setFieldName(String fieldName) {
128 | this.fieldName = fieldName;
129 | }
130 |
131 | public String getFieldDescription() {
132 | return fieldDescription;
133 | }
134 |
135 | public void setFieldDescription(String fieldDescription) {
136 | this.fieldDescription = fieldDescription;
137 | }
138 |
139 | @Override
140 | public String toString() {
141 | return "ExtractFailLog [\nurl=" + url + ", \nurlPattern=" + urlPattern
142 | + ", \ntemplateName=" + templateName + ", \ncssPath=" + cssPath
143 | + ", \nextractExpression=" + extractExpression + ", \ntableName="
144 | + tableName + ", \nfieldName=" + fieldName
145 | + ", \nfieldDescription=" + fieldDescription + "]";
146 | }
147 | }
148 |
--------------------------------------------------------------------------------
/html-extractor-web/src/main/java/org/apdplat/extractor/html/server/model/ExtractFailLog.java:
--------------------------------------------------------------------------------
1 | /**
2 | *
3 | * APDPlat - Application Product Development Platform
4 | * Copyright (c) 2013, 杨尚川, yang-shangchuan@qq.com
5 | *
6 | * This program is free software: you can redistribute it and/or modify
7 | * it under the terms of the GNU General Public License as published by
8 | * the Free Software Foundation, either version 3 of the License, or
9 | * (at your option) any later version.
10 | *
11 | * This program is distributed in the hope that it will be useful,
12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 | * GNU General Public License for more details.
15 | *
16 | * You should have received a copy of the GNU General Public License
17 | * along with this program. If not, see <http://www.gnu.org/licenses/>.
18 | *
19 | */
20 |
21 | package org.apdplat.extractor.html.server.model;
22 |
23 | /**
24 | * 网页结构化信息抽取失败日志
25 | *
26 | * @author 杨尚川
27 | *
28 | */
29 | public class ExtractFailLog {
30 | /**
31 | * 网页结构化信息抽取结果
32 | */
33 | private ExtractResult extractResult;
34 | /**
35 | * 网页的URL
36 | */
37 | private String url;
38 | /**
39 | * 网页的URL模式
40 | */
41 | private String urlPattern;
42 | /**
43 | * 网页模板
44 | */
45 | private String templateName;
46 | /**
47 | * CSS路径
48 | */
49 | private String cssPath;
50 | /**
51 | * CSS路径下的抽取函数
52 | */
53 | private String extractExpression;
54 | /**
55 | * 抽取出的内容保存到的表的名称
56 | */
57 | private String tableName;
58 | /**
59 | * 抽取出的内容保存到的字段名称
60 | */
61 | private String fieldName;
62 | /**
63 | * 抽取出的内容保存到的字段描述,仅作注释使用
64 | */
65 | private String fieldDescription;
66 |
67 | public ExtractResult getExtractResult() {
68 | return extractResult;
69 | }
70 |
71 | public void setExtractResult(ExtractResult extractResult) {
72 | this.extractResult = extractResult;
73 | }
74 |
75 | public String getUrl() {
76 | return url;
77 | }
78 |
79 | public void setUrl(String url) {
80 | this.url = url;
81 | }
82 |
83 | public String getUrlPattern() {
84 | return urlPattern;
85 | }
86 |
87 | public void setUrlPattern(String urlPattern) {
88 | this.urlPattern = urlPattern;
89 | }
90 |
91 | public String getTemplateName() {
92 | return templateName;
93 | }
94 |
95 | public void setTemplateName(String templateName) {
96 | this.templateName = templateName;
97 | }
98 |
99 | public String getCssPath() {
100 | return cssPath;
101 | }
102 |
103 | public void setCssPath(String cssPath) {
104 | this.cssPath = cssPath;
105 | }
106 |
107 | public String getExtractExpression() {
108 | return extractExpression;
109 | }
110 |
111 | public void setExtractExpression(String extractExpression) {
112 | this.extractExpression = extractExpression;
113 | }
114 |
115 | public String getTableName() {
116 | return tableName;
117 | }
118 |
119 | public void setTableName(String tableName) {
120 | this.tableName = tableName;
121 | }
122 |
123 | public String getFieldName() {
124 | return fieldName;
125 | }
126 |
127 | public void setFieldName(String fieldName) {
128 | this.fieldName = fieldName;
129 | }
130 |
131 | public String getFieldDescription() {
132 | return fieldDescription;
133 | }
134 |
135 | public void setFieldDescription(String fieldDescription) {
136 | this.fieldDescription = fieldDescription;
137 | }
138 |
139 | @Override
140 | public String toString() {
141 | return "ExtractFailLog [\nurl=" + url + ", \nurlPattern=" + urlPattern
142 | + ", \ntemplateName=" + templateName + ", \ncssPath=" + cssPath
143 | + ", \nextractExpression=" + extractExpression + ", \ntableName="
144 | + tableName + ", \nfieldName=" + fieldName
145 | + ", \nfieldDescription=" + fieldDescription + "]";
146 | }
147 | }
148 |
--------------------------------------------------------------------------------
/html-extractor/src/main/java/org/apdplat/extractor/html/model/CssPath.java:
--------------------------------------------------------------------------------
1 | /**
2 | *
3 | * APDPlat - Application Product Development Platform
4 | * Copyright (c) 2013, 杨尚川, yang-shangchuan@qq.com
5 | *
6 | * This program is free software: you can redistribute it and/or modify
7 | * it under the terms of the GNU General Public License as published by
8 | * the Free Software Foundation, either version 3 of the License, or
9 | * (at your option) any later version.
10 | *
11 | * This program is distributed in the hope that it will be useful,
12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 | * GNU General Public License for more details.
15 | *
16 | * You should have received a copy of the GNU General Public License
17 | * along with this program. If not, see <http://www.gnu.org/licenses/>.
18 | *
19 | */
20 |
21 | package org.apdplat.extractor.html.model;
22 |
23 | import java.util.ArrayList;
24 | import java.util.List;
25 |
26 | /**
27 | * CSS路径
28 | * CSS路径是页面模板的一级元素
29 | * CSS路径抽取到的内容如果还不满足要求
30 | * 需要使用二级元素即抽取函数来做进一步控制
31 | *
32 | * @author 杨尚川
33 | *
34 | */
35 | public class CssPath {
36 | /**
37 | * CSS路径对应的网页模板
38 | */
39 | private HtmlTemplate pageTemplate;
40 | /**
41 | * CSS路径
42 | */
43 | private String cssPath;
44 | /**
45 | * 提取属性,如果不指定属性,则提取文本
46 | */
47 | private String attr;
48 | /**
49 | * CSS路径对应的抽取函数列表
50 | */
51 | private List extractFunctions = new ArrayList<>();
52 | /**
53 | * CSS路径提取出的文本存储到哪个字段
54 | */
55 | private String fieldName;
56 | /**
57 | * CSS路径提取出的字段的中文含义,仅仅起注释作用,利于理解
58 | */
59 | private String fieldDescription;
60 |
61 | public HtmlTemplate getPageTemplate() {
62 | return pageTemplate;
63 | }
64 |
65 | public void setPageTemplate(HtmlTemplate pageTemplate) {
66 | this.pageTemplate = pageTemplate;
67 | }
68 |
69 | public String getCssPath() {
70 | return cssPath;
71 | }
72 |
73 | public void setCssPath(String cssPath) {
74 | this.cssPath = cssPath;
75 | }
76 |
77 | public String getAttr() {
78 | return attr;
79 | }
80 |
81 | public void setAttr(String attr) {
82 | this.attr = attr;
83 | }
84 |
85 | public List getExtractFunctions() {
86 | return extractFunctions;
87 | }
88 |
89 | public void setExtractFunctions(List extractFunctions) {
90 | this.extractFunctions = extractFunctions;
91 | for (ExtractFunction extractFunction : this.extractFunctions) {
92 | extractFunction.setCssPath(this);
93 | }
94 | }
95 |
96 | public boolean hasExtractFunction() {
97 | return !extractFunctions.isEmpty();
98 | }
99 |
100 | public void addExtractFunction(ExtractFunction extractFunction) {
101 | extractFunctions.add(extractFunction);
102 | extractFunction.setCssPath(this);
103 | }
104 |
105 | public String getFieldName() {
106 | return fieldName;
107 | }
108 |
109 | public void setFieldName(String fieldName) {
110 | this.fieldName = fieldName;
111 | }
112 |
113 | public String getFieldDescription() {
114 | return fieldDescription;
115 | }
116 |
117 | public void setFieldDescription(String fieldDescription) {
118 | this.fieldDescription = fieldDescription;
119 | }
120 |
121 | @Override
122 | public String toString() {
123 | StringBuilder str = new StringBuilder();
124 | str.append("CSS路径:").append(this.cssPath).append("\n");
125 | str.append("字段名:").append(this.fieldName).append("\n");
126 | str.append("字段含义:").append(this.fieldDescription).append("\n");
127 | for (ExtractFunction ef : this.extractFunctions) {
128 | str.append("\t").append("抽取函数:").append(ef.getExtractExpression()).append("\n");
129 | str.append("\t").append("字段名:").append(ef.getFieldName()).append("\n");
130 | str.append("\t").append("字段含义:").append(ef.getFieldDescription()).append("\n");
131 | }
132 | return str.toString();
133 | }
134 | }
135 |
--------------------------------------------------------------------------------
/html-extractor-web/src/main/java/org/apdplat/extractor/html/server/model/CssPath.java:
--------------------------------------------------------------------------------
1 | /**
2 | *
3 | * APDPlat - Application Product Development Platform
4 | * Copyright (c) 2013, 杨尚川, yang-shangchuan@qq.com
5 | *
6 | * This program is free software: you can redistribute it and/or modify
7 | * it under the terms of the GNU General Public License as published by
8 | * the Free Software Foundation, either version 3 of the License, or
9 | * (at your option) any later version.
10 | *
11 | * This program is distributed in the hope that it will be useful,
12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 | * GNU General Public License for more details.
15 | *
16 | * You should have received a copy of the GNU General Public License
17 | * along with this program. If not, see <http://www.gnu.org/licenses/>.
18 | *
19 | */
20 |
21 | package org.apdplat.extractor.html.server.model;
22 |
23 | import java.util.ArrayList;
24 | import java.util.List;
25 | import org.codehaus.jackson.annotate.JsonIgnore;
26 |
27 | /**
28 | * CSS路径
29 | * CSS路径是页面模板的一级元素
30 | * CSS路径抽取到的内容如果还不满足要求
31 | * 需要使用二级元素即抽取函数来做进一步控制
32 | *
33 | * @author 杨尚川
34 | *
35 | */
36 | public class CssPath {
37 | /**
38 | * CSS路径对应的网页模板
39 | */
40 | @JsonIgnore
41 | private HtmlTemplate pageTemplate;
42 | /**
43 | * CSS路径
44 | */
45 | private String cssPath;
46 | /**
47 | * 提取属性,如果不指定属性,则提取文本
48 | */
49 | private String attr;
50 | /**
51 | * CSS路径对应的抽取函数列表
52 | */
53 | private List extractFunctions = new ArrayList<>();
54 | /**
55 | * CSS路径提取出的文本存储到哪个字段
56 | */
57 | private String fieldName;
58 | /**
59 | * CSS路径提取出的字段的中文含义,仅仅起注释作用,利于理解
60 | */
61 | private String fieldDescription;
62 |
63 | public HtmlTemplate getPageTemplate() {
64 | return pageTemplate;
65 | }
66 |
67 | public void setPageTemplate(HtmlTemplate pageTemplate) {
68 | this.pageTemplate = pageTemplate;
69 | }
70 |
71 | public String getCssPath() {
72 | return cssPath;
73 | }
74 |
75 | public void setCssPath(String cssPath) {
76 | this.cssPath = cssPath;
77 | }
78 |
79 | public String getAttr() {
80 | return attr;
81 | }
82 |
83 | public void setAttr(String attr) {
84 | this.attr = attr;
85 | }
86 |
87 | public List getExtractFunctions() {
88 | return extractFunctions;
89 | }
90 |
91 | public void setExtractFunctions(List extractFunctions) {
92 | this.extractFunctions = extractFunctions;
93 | for (ExtractFunction extractFunction : this.extractFunctions) {
94 | extractFunction.setCssPath(this);
95 | }
96 | }
97 |
98 | public boolean hasExtractFunction() {
99 | return !extractFunctions.isEmpty();
100 | }
101 |
102 | public void addExtractFunction(ExtractFunction extractFunction) {
103 | extractFunctions.add(extractFunction);
104 | extractFunction.setCssPath(this);
105 | }
106 |
107 | public String getFieldName() {
108 | return fieldName;
109 | }
110 |
111 | public void setFieldName(String fieldName) {
112 | this.fieldName = fieldName;
113 | }
114 |
115 | public String getFieldDescription() {
116 | return fieldDescription;
117 | }
118 |
119 | public void setFieldDescription(String fieldDescription) {
120 | this.fieldDescription = fieldDescription;
121 | }
122 |
123 | @Override
124 | public String toString() {
125 | StringBuilder str = new StringBuilder();
126 | str.append("CSS路径:").append(this.cssPath).append("\n");
127 | str.append("字段名:").append(this.fieldName).append("\n");
128 | str.append("字段含义:").append(this.fieldDescription).append("\n");
129 | for (ExtractFunction ef : this.extractFunctions) {
130 | str.append("\t").append("抽取函数:").append(ef.getExtractExpression()).append("\n");
131 | str.append("\t").append("字段名:").append(ef.getFieldName()).append("\n");
132 | str.append("\t").append("字段含义:").append(ef.getFieldDescription()).append("\n");
133 | }
134 | return str.toString();
135 | }
136 | }
137 |
--------------------------------------------------------------------------------
/html-extractor-web/src/main/java/org/apdplat/extractor/html/server/service/JsonGenerator.java:
--------------------------------------------------------------------------------
1 | /**
2 | *
3 | * APDPlat - Application Product Development Platform
4 | * Copyright (c) 2013, 杨尚川, yang-shangchuan@qq.com
5 | *
6 | * This program is free software: you can redistribute it and/or modify
7 | * it under the terms of the GNU General Public License as published by
8 | * the Free Software Foundation, either version 3 of the License, or
9 | * (at your option) any later version.
10 | *
11 | * This program is distributed in the hope that it will be useful,
12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 | * GNU General Public License for more details.
15 | *
16 | * You should have received a copy of the GNU General Public License
17 | * along with this program. If not, see <http://www.gnu.org/licenses/>.
18 | *
19 | */
20 |
21 | package org.apdplat.extractor.html.server.service;
22 |
23 | import java.io.IOException;
24 | import java.util.ArrayList;
25 | import java.util.List;
26 | import org.apdplat.extractor.html.server.model.CssPath;
27 | import org.apdplat.extractor.html.server.model.ExtractFunction;
28 | import org.apdplat.extractor.html.server.model.HtmlTemplate;
29 | import org.apdplat.extractor.html.server.model.UrlPattern;
30 | import org.codehaus.jackson.map.ObjectMapper;
31 | import org.slf4j.LoggerFactory;
32 |
33 | /**
34 | * JSON生成器
35 | * @author 杨尚川
36 | */
37 | public class JsonGenerator {
38 | private static final org.slf4j.Logger LOGGER = LoggerFactory.getLogger(JsonGenerator.class);
39 | private static final ObjectMapper MAPPER = new ObjectMapper();
40 | private JsonGenerator(){}
41 |
42 | public static String generateExtractRegular(List urlPatterns){
43 | try {
44 | String value = MAPPER.writeValueAsString(urlPatterns);
45 | return value;
46 | } catch (IOException ex) {
47 | LOGGER.error("将抽取规则转换为JSON出错", ex);
48 | }
49 | return "[]";
50 | }
51 | public static void main(String[] args) {
52 | List urlPatterns = new ArrayList<>();
53 |
54 | UrlPattern urlPattern = new UrlPattern();
55 | urlPattern.setUrlPattern("http://money.163.com/\\d{2}/\\d{4}/\\d{2}/[0-9A-Z]{16}.html");
56 |
57 | urlPatterns.add(urlPattern);
58 |
59 | HtmlTemplate htmlTemplate = new HtmlTemplate();
60 | htmlTemplate.setTemplateName("网易财经频道");
61 | htmlTemplate.setTableName("finance");
62 |
63 | urlPattern.addHtmlTemplate(htmlTemplate);
64 |
65 | CssPath cssPath = new CssPath();
66 | cssPath.setCssPath("h1#h1title");
67 | cssPath.setFieldName("title");
68 | cssPath.setFieldDescription("标题");
69 |
70 | htmlTemplate.addCssPath(cssPath);
71 |
72 | cssPath = new CssPath();
73 | cssPath.setCssPath("div#endText");
74 | cssPath.setFieldName("content");
75 | cssPath.setFieldDescription("正文");
76 |
77 | htmlTemplate.addCssPath(cssPath);
78 |
79 | urlPattern = new UrlPattern();
80 | urlPattern.setUrlPattern("http://finance.qq.com/a/\\d{8}/\\d{6}.htm");
81 |
82 | urlPatterns.add(urlPattern);
83 |
84 | htmlTemplate = new HtmlTemplate();
85 | htmlTemplate.setTemplateName("腾讯财经频道");
86 | htmlTemplate.setTableName("finance");
87 |
88 | urlPattern.addHtmlTemplate(htmlTemplate);
89 |
90 | cssPath = new CssPath();
91 | cssPath.setCssPath("div#C-Main-Article-QQ div.hd h1");
92 | cssPath.setFieldName("title");
93 | cssPath.setFieldDescription("标题");
94 |
95 | htmlTemplate.addCssPath(cssPath);
96 |
97 | cssPath = new CssPath();
98 | cssPath.setCssPath("div#Cnt-Main-Article-QQ");
99 | cssPath.setFieldName("content");
100 | cssPath.setFieldDescription("正文");
101 |
102 | htmlTemplate.addCssPath(cssPath);
103 |
104 | ExtractFunction extractFunction = new ExtractFunction();
105 | extractFunction.setFieldName("content");
106 | extractFunction.setFieldDescription("正文");
107 | extractFunction.setExtractExpression("deleteChild(“div.ep-source”)");
108 |
109 | cssPath.addExtractFunction(extractFunction);
110 |
111 | System.out.println(generateExtractRegular(urlPatterns));
112 |
113 | }
114 | }
115 |
--------------------------------------------------------------------------------
/html-extractor/src/main/java/org/apdplat/extractor/html/model/ExtractResult.java:
--------------------------------------------------------------------------------
1 | /**
2 | *
3 | * APDPlat - Application Product Development Platform
4 | * Copyright (c) 2013, 杨尚川, yang-shangchuan@qq.com
5 | *
6 | * This program is free software: you can redistribute it and/or modify
7 | * it under the terms of the GNU General Public License as published by
8 | * the Free Software Foundation, either version 3 of the License, or
9 | * (at your option) any later version.
10 | *
11 | * This program is distributed in the hope that it will be useful,
12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 | * GNU General Public License for more details.
15 | *
16 | * You should have received a copy of the GNU General Public License
17 | * along with this program. If not, see <http://www.gnu.org/licenses/>.
18 | *
19 | */
20 |
21 | package org.apdplat.extractor.html.model;
22 |
23 | import java.util.ArrayList;
24 | import java.util.HashMap;
25 | import java.util.List;
26 | import java.util.Map;
27 |
28 | /**
29 | * 网页结构化信息抽取结果
30 | * 一个网页模板对应一个抽取结果
31 | * 如果一个网页有多个网页模板
32 | * 每个模板都抽取成功
33 | * 只要这些模板保存在不同的表中
34 | * URL作为主键就不会冲突
35 | *
36 | * @author 杨尚川
37 | *
38 | */
39 | public class ExtractResult {
40 | /**
41 | * 网页对应的URL
42 | */
43 | private String url;
44 | /**
45 | * 网页原始内容
46 | */
47 | private byte[] content;
48 | /**
49 | * 网页编码
50 | */
51 | private String encoding;
52 | /**
53 | * 网页关键词元数据
54 | */
55 | private String keywords;
56 | /**
57 | * 网页描述元数据
58 | */
59 | private String description;
60 | /**
61 | * 网页提取出的文本存储到哪个表
62 | */
63 | private String tableName;
64 | /**
65 | * 一个网页可能有多个抽取结果项,至少要一个
66 | */
67 | private final Map> extractResultItems = new HashMap<>();
68 | /**
69 | * 抽取失败日志
70 | */
71 | private final List extractFailLogs = new ArrayList<>();
72 |
73 | public boolean isSuccess() {
74 | return extractFailLogs.isEmpty() && !extractResultItems.isEmpty();
75 | }
76 |
77 | public String getUrl() {
78 | return url;
79 | }
80 |
81 | public void setUrl(String url) {
82 | this.url = url;
83 | }
84 |
85 | public byte[] getContent() {
86 | return content;
87 | }
88 |
89 | public void setContent(byte[] content) {
90 | this.content = content;
91 | }
92 |
93 | public String getEncoding() {
94 | return encoding;
95 | }
96 |
97 | public void setEncoding(String encoding) {
98 | this.encoding = encoding;
99 | }
100 |
101 | public String getKeywords() {
102 | return keywords;
103 | }
104 |
105 | public void setKeywords(String keywords) {
106 | this.keywords = keywords;
107 | }
108 |
109 | public String getDescription() {
110 | return description;
111 | }
112 |
113 | public void setDescription(String description) {
114 | this.description = description;
115 | }
116 |
117 | public String getTableName() {
118 | return tableName;
119 | }
120 |
121 | public void setTableName(String tableName) {
122 | this.tableName = tableName;
123 | }
124 |
125 | public Map> getExtractResultItems() {
126 | return extractResultItems;
127 | }
128 |
129 | public void addExtractResultItem(ExtractResultItem extractResultItem) {
130 | List list = extractResultItems.get(extractResultItem.getField());
131 | if(list == null){
132 | list = new ArrayList<>();
133 | extractResultItems.put(extractResultItem.getField(), list);
134 | }
135 | list.add(extractResultItem);
136 | }
137 |
138 | public List getExtractFailLogs() {
139 | return extractFailLogs;
140 | }
141 |
142 | public void addExtractFailLog(ExtractFailLog extractFailLog) {
143 | this.extractFailLogs.add(extractFailLog);
144 | extractFailLog.setExtractResult(this);
145 | }
146 |
147 | @Override
148 | public String toString() {
149 | return "ExtractResult [\nurl=" + url + ", \ntableName=" + tableName
150 | + ", \nextractResultItems=" + extractResultItems + ", \nextractFailLogs=" + extractFailLogs + "]";
151 | }
152 | }
153 |
--------------------------------------------------------------------------------
/html-extractor/src/main/resources/voa/PersonalTechnology.txt:
--------------------------------------------------------------------------------
1 | http://learningenglish.voanews.com/a/wwdc-2016-ios-10/3386601.html=New Updates to iPhone Software
2 | http://learningenglish.voanews.com/a/phone-listening/3377513.html=Your Phone Might Be Listening to You
3 | http://learningenglish.voanews.com/a/twitter-update-2016/3364301.html=Changes to Twitter Let You Say More in a Tweet
4 | http://learningenglish.voanews.com/a/travel-apps/3356673.html=Travel Apps Offer Direction Fun
5 | http://learningenglish.voanews.com/a/google-io-2016/3344946.html=Google Announces New Services and Products
6 | http://learningenglish.voanews.com/a/tech-tools-relax/3332930.html=Tech Tools Offer Relaxation Techniques
7 | http://learningenglish.voanews.com/a/blue-light-filters/3325898.html=Filter Blue Light for Better Sleep
8 | http://learningenglish.voanews.com/a/nanowires-battery/3313100.html=Batteries That Last Forever Could Be Near
9 | http://learningenglish.voanews.com/a/education-apps-infographic/3304443.html=Apps to Help Students Do Their Best Work
10 | http://learningenglish.voanews.com/a/facebook-hidden-messages/3293621.html=Where to Find Your Hidden Messages on Facebook
11 | http://learningenglish.voanews.com/a/apps-battery-use/3283650.html=Which Apps Drain Your Phone's Battery
12 | http://learningenglish.voanews.com/a/best-place-to-live-teleport/3269894.html=Find Your Best Place to Live
13 | http://learningenglish.voanews.com/a/reword-just-not-sorry/3261377.html=Want to Choose Better Words
14 | http://learningenglish.voanews.com/a/apple-annouces-small-iphone-return/3251714.html=Apple Announces the Return of a Smaller iPhone
15 | http://learningenglish.voanews.com/a/how-good-are-you-at-geography-games/3239633.html=How Good Are You at Geography Games
16 | http://learningenglish.voanews.com/a/facebook-reactions/3227759.html=Facebook Reactions Much More Than a Like
17 | http://learningenglish.voanews.com/a/apple-fbi-iphone/3216399.html=Apple vs FBI And Your Privacy
18 | http://learningenglish.voanews.com/a/fotor-photo-editing/3203863.html=Photo Editing App Improves Your Shots
19 | http://learningenglish.voanews.com/a/3194948.html=App Feels Earthquakes Through Mobile Phones
20 | http://learningenglish.voanews.com/a/facebook-new-features/3183507.html=Facebook Updates with New Features
21 | http://learningenglish.voanews.com/a/gmail-tools/3173145.html=Tools to Use with Gmail
22 | http://learningenglish.voanews.com/a/ces-2016/3163620.html=Electronics Show Displays Newest Gadgets
23 | http://learningenglish.voanews.com/a/american-concerts-available-worldwide/3152651.html=American Concerts Available Worldwide
24 | http://learningenglish.voanews.com/a/netflix-expands-to-190-countries/3144530.html=Netflix Expands to 190 Countries
25 | http://learningenglish.voanews.com/a/food-apps-infographic/3133599.html=Apps for Healthy Food Choices
26 | http://learningenglish.voanews.com/a/personal-tech-development-of-the-year/3107202.html=Personal Technology Trends of 2015
27 | http://learningenglish.voanews.com/a/tracking-santa-with-technology/3112564.html=Tracking Santa with Technology
28 | http://learningenglish.voanews.com/a/high-tech-devices-to-enjoy-holiday-music/3102745.html='High Tech' Devices to Enjoy Holiday Music
29 | http://learningenglish.voanews.com/a/facebook-quiz-privacy/3092280.html=Protect Your Privacy While Having Fun with Facebook Quizzes
30 | http://learningenglish.voanews.com/a/apple-tv-fire-tv-or-roku/3082835.html=Apple TV Fire TV or Roku
31 | http://learningenglish.voanews.com/a/tech-gifts-2015/3073944.html=15 Tech Gift Ideas for 2015
32 | http://learningenglish.voanews.com/a/self-driving-cars/3062082.html=Will Your Next Car Drive Itself
33 | http://learningenglish.voanews.com/a/apple-tv-review/3032591.html=Apple TV Review
34 | http://learningenglish.voanews.com/a/computer-ssd/3023674.html=Solid State Drive Gives New Life to Old Computer
35 | http://learningenglish.voanews.com/a/ten-tech-tools-to-teach-you-new-words/3015244.html=10 Tech Tools to Teach You New Words
36 | http://learningenglish.voanews.com/a/emoji-say-volumes-without-a-word-/3004375.html=Emojis Say Volumes Without a Word
37 | http://learningenglish.voanews.com/a/top-5-crowdfunding-projects-ever/2993888.html=Top 5 Crowdfunding Projects Ever
38 | http://learningenglish.voanews.com/a/cell-phone-use-among-friends-how-rude/2983035.html=Cell Phone Use Among Friends How Rude
39 | http://learningenglish.voanews.com/a/amazons-tool-for-reading-one-word-at-a-time/2974264.html=Amazon's Tool for Reading One Word at a Time
40 | http://learningenglish.voanews.com/a/longer-battery-life-among-apple-ios9-updates/2967811.html=Longer Battery Life Among Apple iOS9 Updates
41 | http://learningenglish.voanews.com/a/apple-iphone-ipad-appletv-watch/2954969.html=New iPhones Watches TVs Big iPads Offer Better Selfies
--------------------------------------------------------------------------------
/html-extractor/src/main/java/org/apdplat/extractor/html/impl/ExtractFunctionExecutor.java:
--------------------------------------------------------------------------------
1 | /**
2 | *
3 | * APDPlat - Application Product Development Platform
4 | * Copyright (c) 2013, 杨尚川, yang-shangchuan@qq.com
5 | *
6 | * This program is free software: you can redistribute it and/or modify
7 | * it under the terms of the GNU General Public License as published by
8 | * the Free Software Foundation, either version 3 of the License, or
9 | * (at your option) any later version.
10 | *
11 | * This program is distributed in the hope that it will be useful,
12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 | * GNU General Public License for more details.
15 | *
16 | * You should have received a copy of the GNU General Public License
17 | * along with this program. If not, see <http://www.gnu.org/licenses/>.
18 | *
19 | */
20 |
21 | package org.apdplat.extractor.html.impl;
22 |
23 | import org.apache.commons.lang.StringUtils;
24 | import org.apdplat.extractor.html.model.CssPath;
25 | import org.jsoup.nodes.Document;
26 | import org.jsoup.nodes.Element;
27 | import org.jsoup.select.Elements;
28 | import org.slf4j.Logger;
29 | import org.slf4j.LoggerFactory;
30 |
31 | /**
32 | * 抽取函数执行引擎,提供的抽取函数有:
33 | * 1、deleteChild(div.ep-source)
34 | * 2、removeText(作者:)
35 | * 3、substring(0,19) 抽取函数的格式为:函数名称+(+逗号分割的参数+)
36 | *
37 | * @author 杨尚川
38 | *
39 | */
40 | public class ExtractFunctionExecutor {
41 | public static final Logger LOGGER = LoggerFactory.getLogger(ExtractFunctionExecutor.class);
42 |
43 | /**
44 | * 执行抽取函数
45 | *
46 | * @param text CSS路径抽取出来的文本
47 | * @param doc 根文档
48 | * @param cssPath CSS路径对象
49 | * @param parseExpression 抽取函数
50 | * @return 抽取函数处理之后的文本
51 | */
52 | public static String execute(String text, Document doc, CssPath cssPath, String parseExpression) {
53 | if (parseExpression.startsWith("deleteChild")) {
54 | return executeDeleteChild(text, doc, cssPath, parseExpression);
55 | }
56 | if (parseExpression.startsWith("removeText")) {
57 | return executeRemoveText(text, parseExpression);
58 | }
59 | if (parseExpression.startsWith("substring")) {
60 | return executeSubstring(text, parseExpression);
61 | }
62 |
63 | return null;
64 | }
65 |
66 | /**
67 | * 截取指定范围的文本 使用方法:substring(0,19)
68 | * 括号内的参数为2个,分别是字符索引下标,截取从0开始到19的字符串,索引包括0,不包括19,即[0 - 19)
69 | *
70 | * @param text CSS路径抽取出来的文本
71 | * @param parseExpression 抽取函数
72 | * @return 抽取函数处理之后的文本
73 | */
74 | public static String executeSubstring(String text, String parseExpression) {
75 | LOGGER.debug("substring抽取函数之前:" + text);
76 | String parameter = parseExpression.replace("substring(", "");
77 | parameter = parameter.substring(0, parameter.length() - 1);
78 | String[] attr = parameter.split(",");
79 | if (attr != null && attr.length == 2) {
80 | int beginIndex = Integer.parseInt(attr[0]);
81 | int endIndex = Integer.parseInt(attr[1]);
82 | text = text.substring(beginIndex, endIndex);
83 | }
84 | LOGGER.debug("substring抽取函数之后:" + text);
85 | return text;
86 | }
87 |
88 | /**
89 | * 删除指定的文本 使用方法:removeText(作者:) 括号内的参数为文本字符,从CSS路径匹配的文本中删除参数文本
90 | *
91 | * @param text CSS路径抽取出来的文本
92 | * @param parseExpression 抽取函数
93 | * @return 抽取函数处理之后的文本
94 | */
95 | public static String executeRemoveText(String text, String parseExpression) {
96 | LOGGER.debug("removeText抽取函数之前:" + text);
97 | String parameter = parseExpression.replace("removeText(", "");
98 | parameter = parameter.substring(0, parameter.length() - 1);
99 | text = text.replace(parameter, "");
100 | LOGGER.debug("removeText抽取函数之后:" + text);
101 | return text;
102 | }
103 |
104 | /**
105 | * 删除子CSS路径的内容 使用方法:deleteChild(div.ep-source)
106 | * 括号内的参数为相对CSS路径的子路径,从CSS路径匹配的文本中删除子路径匹配的文本
107 | *
108 | * @param text CSS路径抽取出来的文本
109 | * @param doc 根文档
110 | * @param cssPath CSS路径对象
111 | * @param parseExpression 抽取函数
112 | * @return 抽取函数处理之后的文本
113 | */
114 | public static String executeDeleteChild(String text, Document doc, CssPath cssPath, String parseExpression) {
115 | LOGGER.debug("deleteChild抽取函数之前:" + text);
116 | String parameter = parseExpression.replace("deleteChild(", "");
117 | parameter = parameter.substring(0, parameter.length() - 1);
118 | Elements elements = doc.select(cssPath.getCssPath() + " " + parameter);
119 | for (Element element : elements) {
120 | String t = element.text();
121 | if (StringUtils.isNotBlank(t)) {
122 | LOGGER.debug("deleteChild抽取函数删除:" + t);
123 | text = text.replace(t, "");
124 | }
125 | }
126 | LOGGER.debug("deleteChild抽取函数之后:" + text);
127 | return text;
128 | }
129 | }
130 |
--------------------------------------------------------------------------------
/mvnw.cmd:
--------------------------------------------------------------------------------
1 | @REM ----------------------------------------------------------------------------
2 | @REM Licensed to the Apache Software Foundation (ASF) under one
3 | @REM or more contributor license agreements. See the NOTICE file
4 | @REM distributed with this work for additional information
5 | @REM regarding copyright ownership. The ASF licenses this file
6 | @REM to you under the Apache License, Version 2.0 (the
7 | @REM "License"); you may not use this file except in compliance
8 | @REM with the License. You may obtain a copy of the License at
9 | @REM
10 | @REM http://www.apache.org/licenses/LICENSE-2.0
11 | @REM
12 | @REM Unless required by applicable law or agreed to in writing,
13 | @REM software distributed under the License is distributed on an
14 | @REM "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15 | @REM KIND, either express or implied. See the License for the
16 | @REM specific language governing permissions and limitations
17 | @REM under the License.
18 | @REM ----------------------------------------------------------------------------
19 |
20 | @REM ----------------------------------------------------------------------------
21 | @REM Maven2 Start Up Batch script
22 | @REM
23 | @REM Required ENV vars:
24 | @REM JAVA_HOME - location of a JDK home dir
25 | @REM
26 | @REM Optional ENV vars
27 | @REM M2_HOME - location of maven2's installed home dir
28 | @REM MAVEN_BATCH_ECHO - set to 'on' to enable the echoing of the batch commands
29 | @REM MAVEN_BATCH_PAUSE - set to 'on' to wait for a key stroke before ending
30 | @REM MAVEN_OPTS - parameters passed to the Java VM when running Maven
31 | @REM e.g. to debug Maven itself, use
32 | @REM set MAVEN_OPTS=-Xdebug -Xrunjdwp:transport=dt_socket,server=y,suspend=y,address=8000
33 | @REM MAVEN_SKIP_RC - flag to disable loading of mavenrc files
34 | @REM ----------------------------------------------------------------------------
35 |
@REM Begin all REM lines with '@' in case MAVEN_BATCH_ECHO is 'on'
@echo off
@REM enable echoing by setting MAVEN_BATCH_ECHO to 'on'
@if "%MAVEN_BATCH_ECHO%" == "on" echo %MAVEN_BATCH_ECHO%

@REM set %HOME% to equivalent of $HOME
if "%HOME%" == "" (set "HOME=%HOMEDRIVE%%HOMEPATH%")

@REM Execute a user defined script before this one
if not "%MAVEN_SKIP_RC%" == "" goto skipRcPre
@REM check for pre script, once with legacy .bat ending and once with .cmd ending
if exist "%HOME%\mavenrc_pre.bat" call "%HOME%\mavenrc_pre.bat"
if exist "%HOME%\mavenrc_pre.cmd" call "%HOME%\mavenrc_pre.cmd"
:skipRcPre

@setlocal

set ERROR_CODE=0

@REM To isolate internal variables from possible post scripts, we use another setlocal
@setlocal

@REM ==== START VALIDATION ====
if not "%JAVA_HOME%" == "" goto OkJHome

echo.
echo Error: JAVA_HOME not found in your environment. >&2
echo Please set the JAVA_HOME variable in your environment to match the >&2
echo location of your Java installation. >&2
echo.
goto error

:OkJHome
if exist "%JAVA_HOME%\bin\java.exe" goto init

echo.
echo Error: JAVA_HOME is set to an invalid directory. >&2
echo JAVA_HOME = "%JAVA_HOME%" >&2
echo Please set the JAVA_HOME variable in your environment to match the >&2
echo location of your Java installation. >&2
echo.
goto error

@REM ==== END VALIDATION ====

:init

set MAVEN_CMD_LINE_ARGS=%*

@REM Find the project base dir, i.e. the directory that contains the folder ".mvn".
@REM Fallback to current working directory if not found.

set MAVEN_PROJECTBASEDIR=%MAVEN_BASEDIR%
IF NOT "%MAVEN_PROJECTBASEDIR%"=="" goto endDetectBaseDir

set EXEC_DIR=%CD%
set WDIR=%EXEC_DIR%
:findBaseDir
IF EXIST "%WDIR%"\.mvn goto baseDirFound
cd ..
IF "%WDIR%"=="%CD%" goto baseDirNotFound
set WDIR=%CD%
goto findBaseDir

:baseDirFound
set MAVEN_PROJECTBASEDIR=%WDIR%
cd "%EXEC_DIR%"
goto endDetectBaseDir

:baseDirNotFound
set MAVEN_PROJECTBASEDIR=%EXEC_DIR%
cd "%EXEC_DIR%"

:endDetectBaseDir

IF NOT EXIST "%MAVEN_PROJECTBASEDIR%\.mvn\jvm.config" goto endReadAdditionalConfig

@setlocal EnableExtensions EnableDelayedExpansion
for /F "usebackq delims=" %%a in ("%MAVEN_PROJECTBASEDIR%\.mvn\jvm.config") do set JVM_CONFIG_MAVEN_PROPS=!JVM_CONFIG_MAVEN_PROPS! %%a
@endlocal & set JVM_CONFIG_MAVEN_PROPS=%JVM_CONFIG_MAVEN_PROPS%

:endReadAdditionalConfig

SET MAVEN_JAVA_EXE="%JAVA_HOME%\bin\java.exe"

@REM Resolve the wrapper jar against the detected project base dir (not the
@REM current directory) so that mvnw also works when started from a
@REM subdirectory; the path is quoted so base dirs containing spaces work.
set WRAPPER_JAR="%MAVEN_PROJECTBASEDIR%\.mvn\wrapper\maven-wrapper.jar"
set WRAPPER_LAUNCHER=org.apache.maven.wrapper.MavenWrapperMain

%MAVEN_JAVA_EXE% %JVM_CONFIG_MAVEN_PROPS% %MAVEN_OPTS% %MAVEN_DEBUG_OPTS% -classpath %WRAPPER_JAR% "-Dmaven.multiModuleProjectDirectory=%MAVEN_PROJECTBASEDIR%" %WRAPPER_LAUNCHER% %MAVEN_CMD_LINE_ARGS%
if ERRORLEVEL 1 goto error
goto end

:error
set ERROR_CODE=1

:end
@endlocal & set ERROR_CODE=%ERROR_CODE%

if not "%MAVEN_SKIP_RC%" == "" goto skipRcPost
@REM check for post script, once with legacy .bat ending and once with .cmd ending
if exist "%HOME%\mavenrc_post.bat" call "%HOME%\mavenrc_post.bat"
if exist "%HOME%\mavenrc_post.cmd" call "%HOME%\mavenrc_post.cmd"
:skipRcPost

@REM pause the script if MAVEN_BATCH_PAUSE is set to 'on'
if "%MAVEN_BATCH_PAUSE%" == "on" pause

if "%MAVEN_TERMINATE_CMD%" == "on" exit %ERROR_CODE%

exit /B %ERROR_CODE%
--------------------------------------------------------------------------------
/html-extractor/src/main/resources/voa/HealthLifestyle.txt:
--------------------------------------------------------------------------------
1 | http://learningenglish.voanews.com/a/health-and-lifestyle-exercise-protects-against-two-major-diseases/3391399.html=Exercise Can Protect Against Two Major Diseases
2 | http://learningenglish.voanews.com/a/aspirin-may-cut-cancer-deaths/3355757.html=Aspirin May Cut Cancer Deaths
3 | http://learningenglish.voanews.com/a/scientists-discover-gene-multiple-sclerosis/3369386.html=Scientists Discover Gene Responsible for Multiple Sclerosis
4 | http://learningenglish.voanews.com/a/3359431.html=How Do You Save Unwanted Babies
5 | http://learningenglish.voanews.com/a/3357042.html=This Superbug Is Too Strong for Antibiotics
6 | http://learningenglish.voanews.com/a/urban-residents-breathe-unsafe-air/3347253.html=WHO 80 of Urban Residents Breathe Unsafe Air
7 | http://learningenglish.voanews.com/a/advice-for-health-workers-caring-for-victims-of-female-genital-mutilation/3342483.html=Health Workers Advised on Care of FGM Victims
8 | http://learningenglish.voanews.com/a/3332366.html=Researchers Develop Better Cheaper Faster Zika Test
9 | http://learningenglish.voanews.com/a/white-women-are-dying-too-soon-in-america/3339042.html=White Women in US Are Dying Sooner
10 | http://learningenglish.voanews.com/a/rich-people-haver-better-stress-than-poor/3329623.html=Rich People Even Have Better Stress Than Poor
11 | http://learningenglish.voanews.com/a/medical-mistake-cause-third-of-deaths/3326271.html=Medical Mistakes Third-Leading Cause of Death in US
12 | http://learningenglish.voanews.com/a/emergency-medical-care-in-a-backpack/3318666.html=Emergency Medical Care in a Backpack
13 | http://learningenglish.voanews.com/a/blood-test-shows-when-a-person-has-a-concussion/3308652.html=Blood Test Shows Concussion Injury
14 | http://learningenglish.voanews.com/a/plants-help-women-live-longer/3311928.html=Study Plants May Help Women Live Longer
15 | http://learningenglish.voanews.com/a/four-healthy-tasty-spices/3305474.html=Can You Name Four of the Healthiest Tastiest Spices
16 | http://learningenglish.voanews.com/a/suicide-rates-increase-in-us/3304646.html=Suicide Rates Increase in US
17 | http://learningenglish.voanews.com/a/one-minute-excercise/3306957.html=One Minute of Exercise May Be All You Need
18 | http://learningenglish.voanews.com/a/contagious-virues-sickens-million/3306881.html=Highly Contagious Virus Sickens Millions Costs Billions
19 | http://learningenglish.voanews.com/a/why-its-harder-to-sleep-new-place/3301774.html=Why It's Harder to Sleep in a New Place
20 | http://learningenglish.voanews.com/a/who-drowning-among-top-causes-of-deaths-in-children/3294288.html=WHO Drowning Among Top Causes of Death in Children
21 | http://learningenglish.voanews.com/a/3258485.html=Mentally Ill Indonesians 'Living in Hell'
22 | http://learningenglish.voanews.com/a/3283010.html=US Health Officials Zika 'Scarier Than We Thought'
23 | http://learningenglish.voanews.com/a/study-says-there-are-more-obese-people-in-the-world-than-underweight-people/3271436.html=Study More Obese People Than Underweight
24 | http://learningenglish.voanews.com/a/want-to-lose-weight-listen-to-yourself-eat/3244250.html=Want to Lose Weight Listen to Yourself Eat
25 | http://learningenglish.voanews.com/a/asian-kids-face-rising-hunger-and-obesity-rates/3258511.html=Asian Children Face Hunger and Obesity
26 | http://learningenglish.voanews.com/a/3242332.html=Mummies Contain Clues to Colon Cancer
27 | http://learningenglish.voanews.com/a/is-email-stressing-you-out/3209644.html=Is Email Stressing You Out
28 | http://learningenglish.voanews.com/a/3236261.html=Social Media Is Keeping Young Adults Awake
29 | http://learningenglish.voanews.com/a/3236302.html=Happy National Napping Day
30 | http://learningenglish.voanews.com/a/3223698.html=Ever Feel Like You'll Die of A Broken Heart
31 | http://learningenglish.voanews.com/a/blood-cancer-therapy-is-last-chance-for-some/3208279.html=Blood Cancer Therapy Is Last Chance for Some
32 | http://learningenglish.voanews.com/a/helicopter-parenting/3194736.html=Helping Children Too Much Is Hurting Them
33 | http://learningenglish.voanews.com/a/us-14-new-reports-of-sexual-transmission-of-zika-virus/3206754.html=US 14 New Reports of Sexual Transmission of Zika Virus
34 | http://learningenglish.voanews.com/a/spotlight-on-zika-helps-other-neglected-tropical-diseases/3196431.html=Spotlight on Zika Helps Neglected Tropical Diseases
35 | http://learningenglish.voanews.com/a/ebola-zika-clues/3193225.html=Ebola Crisis Might Hold Clues for Fighting Zika Virus
36 | http://learningenglish.voanews.com/a/willpower-do-you-think-you-can/3159018.html=Do You Think You Can
37 | http://learningenglish.voanews.com/a/power-of-touch/3179086.html=The Power of Touch
38 | http://learningenglish.voanews.com/a/zika-virus-olympic-concerns/3187238.html=Olympic Athletes Concerned About Zika Virus
39 | http://learningenglish.voanews.com/a/researchers-find-link-between-zika-virus-and-birth-defects/3187097.html=Researchers Link Between Zika Virus and Birth Defects
40 | http://learningenglish.voanews.com/a/white-house-seeks-1point-8-billion-to-combat-zika/3182160.html=Obama Seeks $1.8 Billion to Combat Zika
41 | http://learningenglish.voanews.com/a/zika-virus-detected-in-body-fluids/3178780.html=Zika Virus Detected in Body Fluids
42 | http://learningenglish.voanews.com/a/person-infected-with-zika-virus-through-sexual-relations/3175064.html=Person Infected With Zika Virus Through Sexual Contact
43 | http://learningenglish.voanews.com/a/who-declares-zika-virus-a-global-health-emergency/3172096.html=WHO Declares Zika A Global Health Emergency
44 | http://learningenglish.voanews.com/a/copper-kills-viruses-on-contact/3147962.html=Copper Metal Kills Viruses on Contact
45 | http://learningenglish.voanews.com/a/urgent-call-to-stop-the-spread-of-zika-virus/3165458.html=Urgent Call to Stop Zika Virus
--------------------------------------------------------------------------------
/html-extractor/src/main/resources/voa/ScienceintheNews.txt:
--------------------------------------------------------------------------------
1 | http://learningenglish.voanews.com/a/bones-of-hobbit-ancester-found-in-indonesia/3377930.html='Hobbit' Ancestor Bones Found
2 | http://learningenglish.voanews.com/a/australian-spider-worlds-fastest/3387065.html=Australian Spider May Be World's Fastest
3 | http://learningenglish.voanews.com/a/scientists-say-gene-editing-should-stay-in-laboratories/3390454.html=Scientists Gene Editing Should Stay in Laboratories
4 | http://learningenglish.voanews.com/a/how-3d-printing-can-create-tissue-from-stem-cells/3374074.html=How 3-D Printing Can Create Tissue from Stem Cells
5 | http://learningenglish.voanews.com/a/was-mars-less-red-long-ago/3355973.html=Was Mars -- The Red Planet -- Once White
6 | http://learningenglish.voanews.com/a/what-it-takes-to-be-an-astronaut/3367765.html=What It Takes to Be an Astronaut
7 | http://learningenglish.voanews.com/a/largest-sponge-found-off-hawaii-coast/3360778.html=Minivan-sized Sponge Found off Hawaii Coast
8 | http://learningenglish.voanews.com/a/knife-cuts-on-mastodon-bone-reveal-earlier-humans/3347233.html=Knife Cuts on Mastodon Bone Reveal Earlier Humans
9 | http://learningenglish.voanews.com/a/artificial-intelligence-helpful-and-dangerous/3334422.html=Artificial Intelligence Helpful and Dangerous
10 | http://learningenglish.voanews.com/a/solar-cooker-helps-reduce-smoke-pollution-deaths/3313364.html=Solar Cooker Reduces Smoke Pollution Deaths
11 | http://learningenglish.voanews.com/a/nasa-training-for-commercial-flights-to-space/3318170.html=NASA Astronauts Train for Commercial Space Flights
12 | http://learningenglish.voanews.com/a/nasa-discovers-nearly-1300-planets/3326050.html=NASA Discovers Nearly 1300 New Planets
13 | http://learningenglish.voanews.com/a/amazing-amazon-hides-coral-reef/3314107.html=Amazing Amazon Hides Atlantic's Coral Reef
14 | http://learningenglish.voanews.com/a/dinosaurs-already-decreasing-before-asteroid-hit/3298965.html=Dinosaurs Already Decreasing Before Asteroid Hit
15 | http://learningenglish.voanews.com/a/some-birds-as-smarts-as-apes/3298714.html=Some Birds as Smart as Apes
16 | http://learningenglish.voanews.com/a/monkeys-raft-to-panama/3298794.html=Scientists Monkeys Used Raft to Cross Ocean
17 | http://learningenglish.voanews.com/a/polio-virus-used-to-kill-brain-tumors/3196863.html=Polio Virus Used to Kill Brain Tumors
18 | http://learningenglish.voanews.com/a/researchers-look-for-ways-to-heal-brains/3109209.html=Researchers Repair Brain Injuries With New Cells
19 | http://learningenglish.voanews.com/a/hawking-zuckerberg-help-launch-search-for-life-in-the-universe/3287925.html=Hawking Zuckerberg to Search for Life in the Universe
20 | http://learningenglish.voanews.com/a/marine-biologist-fight-for-coral-reefs-makes-science-fun/3232146.html=Scientist Fights for Coral Reefs Makes Science Fun
21 | http://learningenglish.voanews.com/a/scientists-could-be-closer-to-aids-cure/3265201.html=Scientists Could be Closer to AIDS Cure
22 | http://learningenglish.voanews.com/a/will-the-plain-of-jars-mystery-be-solved/3250131.html=Scientists Closer to Solving Plain of Jars Mystery
23 | http://learningenglish.voanews.com/a/making-world-better-tech/3247753.html=Four Young People Who Make the World Better
24 | http://learningenglish.voanews.com/a/climate-change-sea-levels-and-arctic-temperatures-rise/3203794.html=Sea Levels and Arctic Temperatures Rise
25 | http://learningenglish.voanews.com/a/astronomers-eclipse-window-seat/3229536.html=Astronomers Get Window Seat for Eclipse
26 | http://learningenglish.voanews.com/a/smart-bandages-could-heal-wounds-quickly/3188911.html=Smart Bandages to Heal Wounds More Quickly
27 | http://learningenglish.voanews.com/a/oceans-could-hold-more-plastics-than-fish-2050/3166848.html=Oceans To Hold More Plastic Than Fish by 2050
28 | http://learningenglish.voanews.com/a/scott-kelly-about-a-year-in-space/3208316.html=Astronaut Scott Kelly Talks about His Year in Space
29 | http://learningenglish.voanews.com/a/volcanic-clays-kill-bacteria/3183724.html=Some Volcanic Clays Kill Bacteria
30 | http://learningenglish.voanews.com/a/eistein-is-proven-right/3188629.html=Gravitational Waves Detected Einstein Is Right
31 | http://learningenglish.voanews.com/a/nasa-asteroid/3183344.html=NASA Big Asteroid Could Pass Near Earth Next Month
32 | http://learningenglish.voanews.com/a/rarely-seen-bush-dogs-pictures-panama/3161723.html=Cameras Capture Pictures of Mysterious Bush Dog
33 | http://learningenglish.voanews.com/a/ninth-planet-may-be-in-solar-system/3156472.html=Is There a Ninth Planet in Our Solar System
34 | http://learningenglish.voanews.com/a/device-shows-human-body-as-never-seen-before/3066410.html=New Device Shows Human Body As Never Seen Before
35 | http://learningenglish.voanews.com/a/how-do-dogs-drink-water/3116091.html=Think You Know How Dogs Drink Water
36 | http://learningenglish.voanews.com/a/habitat-loss-endangers-migrating-birds/3109229.html=Migrating Birds Can't Find Their Way Home
37 | http://learningenglish.voanews.com/a/plant-your-mobile-charger-in-the-dirt/3057157.html=Plant Your Mobile Charger in the Dirt
38 | http://learningenglish.voanews.com/a/look-back-at-science-in-2015/3115846.html=A Look Back at 2015 The Year in Science and Technology
39 | http://learningenglish.voanews.com/a/spacex-rocket-launches-and-returns/3115778.html=SpaceX Celebrates Historic Rocket Launch Landing
40 | http://learningenglish.voanews.com/a/apollo-11-spacecraft-soon-to-be-in-3d/3109269.html=Apollo 11 Spacecraft Lands in Your Smartphone
41 | http://learningenglish.voanews.com/a/could-a-jolt-to-the-brain-stop-motion-sickness/3060134.html=Could an Electric Current Stop Motion Sickness
42 | http://learningenglish.voanews.com/a/researchers-use-lasers-to-chill-water/3072071.html=Lasers Used to Chill Instead of Heat
43 | http://learningenglish.voanews.com/a/nasa-paints-pluto-wild-with-color/3065467.html=NASA Paints Pluto Wild With Color
44 | http://learningenglish.voanews.com/a/whisky-tested-as-alternative-fuel-for-vehicles/3065592.html=Scottish Whisky Tested as Alternative to Fossil Fuels
45 | http://learningenglish.voanews.com/a/melting-glaciers-and-climate-talks/3056901.html=Melting Glaciers on Climate Talk List in Paris
--------------------------------------------------------------------------------
/html-extractor/src/main/resources/voa/ThisIsAmerica.txt:
--------------------------------------------------------------------------------
1 | http://learningenglish.voanews.com/a/young-traveler-hopes-to-visit-every-national-park-in-america-/3378000.html=Young Traveler Hopes to Visit Every National Park
2 | http://learningenglish.voanews.com/a/should-insensitive-place-names-be-changed/3370802.html=Should Offensive Place Names Be Changed
3 | http://learningenglish.voanews.com/a/the-new-sanctuary-movement-seeks-to-help-immigrants-to-us/3361063.html=Sanctuary Movement Helps Immigrants at Risk
4 | http://learningenglish.voanews.com/a/hallowed-ground-arlington-national-cemetery/2780835.html=Memorial Day Arlington National Cemetery
5 | http://learningenglish.voanews.com/a/us-golf-courses-try-new-ways-to-get-more-people-to-play/3347618.html=How Can Golf Appeal to More People
6 | http://learningenglish.voanews.com/a/rockford-peaches-baseball-girls-a-league-of-their-own/3344028.html=All-Girls Baseball Team in Seattle Makes History
7 | http://learningenglish.voanews.com/a/death-judge-scalia-slows-top-american-court/3337624.html=Death of Judge Scalia Slows Top American Court
8 | http://learningenglish.voanews.com/a/american-schools-remain-divided-by-race/3336002.html=American Schools Remain Divided by Race
9 | http://learningenglish.voanews.com/a/real-mermaids-not-really-but/3313412.html=Real Mermaids Not Really But
10 | http://learningenglish.voanews.com/a/ex-prisoners-ask-for-2nd-chance-and-job/3318669.html=Ex-Prisoners Ask for Second Chance and a Job
11 | http://learningenglish.voanews.com/a/top-democratic-and-republican-senators-disagree-on-next-steps-for-supreme-court-nomination/3306320.html=Democrats Republicans Disagree on Supreme Court Nomination
12 | http://learningenglish.voanews.com/a/what-bathroom-should-transgender-people-use/3308562.html=What Bathroom Should Transgender People Use
13 | http://learningenglish.voanews.com/a/high-demand-from-foreign-investors-for-special-us-visas/3298194.html=High Demand from Investors for Special US Visas
14 | http://learningenglish.voanews.com/a/us-immigration-case/3292826.html=Will US Let Some Undocumented Immigrants Stay
15 | http://learningenglish.voanews.com/a/pope-message-on-family-in-line-with-many-us-catholics/3287739.html=Pope's Message on Family in Line with US Catholics
16 | http://learningenglish.voanews.com/a/twelve-places-often-mispronounced/3282903.html=Twelve Mispronounced US Place Names
17 | http://learningenglish.voanews.com/a/supreme-court-helps-define-one-person-one-vote/3276158.html=Supreme Court Helps Define One Person One Vote
18 | http://learningenglish.voanews.com/a/large-cars-new-york-auto-show/3265049.html=Large Cars on Top At New York Auto Show
19 | http://learningenglish.voanews.com/a/are-16-17-mature-enough-to-vote/3247882.html=Are Teenagers Mature Enough to Vote
20 | http://learningenglish.voanews.com/a/some-foreign-leaders-taking-sides-against-trump/3255022.html=Foreign Leaders Taking Sides Against Donald Trump
21 | http://learningenglish.voanews.com/a/americans-take-80-world-opioid-supply/3244249.html=Americans Take 80 of World's Opioid Supply
22 | http://learningenglish.voanews.com/a/record-number-americans-giving-up-us-citizenship/3219840.html=Americans Who Gave Up Passports Tell Why
23 | http://learningenglish.voanews.com/a/trappist-monastery-in-virginia-countryside-adapts-to-the-21st-century/3203507.html=Silent Monks Learn to Speak for Revenue
24 | http://learningenglish.voanews.com/a/is-there-more-lying-in-this-election/3208230.html=Is There More Lying In This Election
25 | http://learningenglish.voanews.com/a/a-rages-to-riches-story/3209830.html=From Child of Freed Slaves to Millionaire
26 | http://learningenglish.voanews.com/a/smartest-americans/3186794.html=Where Are the Best Educated Americans
27 | http://learningenglish.voanews.com/a/presidents-day-2016-presidential-firsts-/3186922.html=Presidential Facts and 'Firsts'
28 | http://learningenglish.voanews.com/a/americas-destinations-on-the-rise/3185649.html=America's Destinations 'On the Rise'
29 | http://learningenglish.voanews.com/a/us-gives-less-foreign-aid-than-americans-think/3172204.html=US Gives Less Foreign Aid than Americans Think
30 | http://learningenglish.voanews.com/a/more-cases-of-brain-disease-from-football-blows/3175188.html=More Cases of Brain Disease from Football Blows
31 | http://learningenglish.voanews.com/a/white-house-protester-dies-after-longest-vigil/3166054.html=White House Protester Dies After Longest Vigil
32 | http://learningenglish.voanews.com/a/3158747.html=Water Crisis in US City a 'Disaster Not Just an Emergency'
33 | http://learningenglish.voanews.com/a/bulgarians-biking-from-alaska-to-argentina/3143338.html=Bulgarians Biking From Alaska to Argentina
34 | http://learningenglish.voanews.com/a/sean-penn-guzman-el-chapo-propaganda/3148549.html=Sean Penn Journalist or Propagandist
35 | http://learningenglish.voanews.com/a/voa-study-questions/3133320.html=Americans Jobs More Important Than Terrorism
36 | http://learningenglish.voanews.com/a/new-years-music/1813304.html=Memories and Hopes Meet in New Year's Music
37 | http://learningenglish.voanews.com/a/what-does-that-mean-what-did-she-say-expressions-american-presidential-candidates-use-in-debates/3106823.html=Political Idioms What Did He Say
38 | http://learningenglish.voanews.com/a/floridas-key-west-close-to-perfect-far-from-normal/3102410.html=Florida's Key West Close to Perfect Far From Normal
39 | http://learningenglish.voanews.com/a/hold-the-butter-modern-diets-meet-holiday-traditions/3117341.html=Hold The Butter Modern Diets Meet Holiday Traditions
40 | http://learningenglish.voanews.com/a/route-66-end-of-the-trail-california/3109251.html=Route 66 California The End of the Trail
41 | http://learningenglish.voanews.com/a/whay-are-americans-so-angry/3099245.html=Why Are Americans So Angry
42 | http://learningenglish.voanews.com/a/arizona-spirit-route-66-voa/3099375.html=Arizona The Spirit of Route 66
43 | http://learningenglish.voanews.com/a/shootings-leads-questions-police-tactics/3085017.html=Shootings Lead to Questions About Police Tactics
44 | http://learningenglish.voanews.com/a/route-66-voa-santa-fe-city-different/3088211.html=Santa Fe The City Different
45 | http://learningenglish.voanews.com/a/four-famous-foods-on-route-66/3073335.html=Route 66 Serves Up Pizza Burgers 'Horseshoes' and More
--------------------------------------------------------------------------------
/html-extractor-web/src/main/webapp/api/all_extract_regular.jsp:
--------------------------------------------------------------------------------
1 | <%@ page language="java" contentType="text/html; charset=UTF-8" pageEncoding="UTF-8"%>
2 | [
3 | {
4 | "urlPattern": "http://money.163.com/\\d{2}/\\d{4}/\\d{2}/[0-9A-Z]{16}.html",
5 | "regexPattern": "/\\d{2}/\\d{4}/\\d{2}/[0-9A-Z]{16}.html",
6 | "pageTemplates": [
7 | {
8 | "templateName": "网易财经频道1",
9 | "tableName": "finance",
10 | "cssPaths": [
11 | {
12 | "fieldName": "title",
13 | "cssPath": "h1",
14 | "fieldDescription": "标题",
15 | "extractFunctions": []
16 | },
17 | {
18 | "fieldName": "content",
19 | "cssPath": "div#endText",
20 | "fieldDescription": "正文",
21 | "extractFunctions": []
22 | }
23 | ]
24 | },
25 | {
26 | "templateName": "网易财经频道2",
27 | "tableName": "finance",
28 | "cssPaths": [
29 | {
30 | "fieldName": "title",
31 | "cssPath": "h1",
32 | "fieldDescription": "标题",
33 | "extractFunctions": []
34 | },
35 | {
36 | "fieldName": "publishTime",
37 | "cssPath": "html body div#js-epContent.ep-content div.ep-content-bg div#epContentLeft.ep-content-main div.ep-main-bg div.clearfix div.ep-info div.left",
38 | "fieldDescription": "发表时间",
39 | "extractFunctions": [
40 | {
41 | "fieldName": "publishTime",
42 | "fieldDescription": "发表时间",
43 | "extractExpression": "substring(0,19)"
44 | }
45 | ]
46 | },
47 | {
48 | "fieldName": "content",
49 | "cssPath": "div#endText",
50 | "fieldDescription": "正文",
51 | "extractFunctions": []
52 | }
53 | ]
54 | },
55 | {
56 | "templateName": "网易财经栏目",
57 | "tableName": "finance",
58 | "cssPaths": [
59 | {
60 | "fieldName": "title",
61 | "cssPath": "html body div#money_wrap.money_wrap div.common_wrap div.area div.w_main div.col_l h1",
62 | "fieldDescription": "标题",
63 | "extractFunctions": []
64 | },
65 | {
66 | "fieldName": "content",
67 | "cssPath": "html body div#money_wrap.money_wrap div.common_wrap div.area div.w_main div.col_l div.w_text",
68 | "fieldDescription": "正文",
69 | "extractFunctions": []
70 | },
71 | {
72 | "fieldName": "author",
73 | "cssPath": "html body div#money_wrap.money_wrap div.common_wrap div.area div.w_main div.col_l div.author span.name",
74 | "fieldDescription": "作者",
75 | "extractFunctions": [
76 | {
77 | "fieldName": "author",
78 | "fieldDescription": "作者",
79 | "extractExpression": "removeText(作者:)"
80 | }
81 | ]
82 | },
83 | {
84 | "fieldName": "introduction",
85 | "cssPath": "html body div#money_wrap.money_wrap div.common_wrap div.area div.w_main div.col_l div.introduction p",
86 | "fieldDescription": "导语",
87 | "extractFunctions": []
88 | },
89 | {
90 | "fieldName": "followers",
91 | "cssPath": "html body div#money_wrap.money_wrap div.common_wrap div.area div.w_main div.col_l div.words_bbs div#tieArea.tie-area div#tiePostBox.tie-post div.tie-titlebar span.tie-info a.js-bactCount",
92 | "fieldDescription": "跟贴人数",
93 | "extractFunctions": []
94 | },
95 | {
96 | "fieldName": "tieTotalCount",
97 | "cssPath": "html body div#money_wrap.money_wrap div.common_wrap div.area div.w_main div.col_l div.author a.discuss span.tieTotalCount",
98 | "fieldDescription": "参与讨论人数",
99 | "extractFunctions": []
100 | }
101 | ]
102 | }
103 | ]
104 | },
105 | {
106 | "urlPattern": "http://finance.qq.com/a/\\d{8}/\\d{6}.htm",
107 | "regexPattern": "/a/\\d{8}/\\d{6}.htm",
108 | "pageTemplates": [
109 | {
110 | "templateName": "腾讯财经频道",
111 | "tableName": "finance",
112 | "cssPaths": [
113 | {
114 | "fieldName": "title",
115 | "cssPath": "div#C-Main-Article-QQ div.hd h1",
116 | "fieldDescription": "标题",
117 | "extractFunctions": []
118 | },
119 | {
120 | "fieldName": "content",
121 | "cssPath": "div#Cnt-Main-Article-QQ",
122 | "fieldDescription": "正文",
123 | "extractFunctions": [
124 | {
125 | "fieldName": "content",
126 | "fieldDescription": "正文",
127 | "extractExpression": "deleteChild(div.ep-source)"
128 | }
129 | ]
130 | }
131 | ]
132 | }
133 | ]
134 | }
135 | ]
--------------------------------------------------------------------------------
/html-extractor/src/main/resources/voa/NewsWords.txt:
--------------------------------------------------------------------------------
1 | http://learningenglish.voanews.com/a/3317431.html=Resilient
2 | http://learningenglish.voanews.com/a/3317432.html=Restoration
3 | http://learningenglish.voanews.com/a/3317434.html=Kickoff
4 | http://learningenglish.voanews.com/a/3317428.html=Revolutionary
5 | http://learningenglish.voanews.com/a/3317427.html=Cronyism
6 | http://learningenglish.voanews.com/a/3317430.html=Atrocities
7 | http://learningenglish.voanews.com/a/3317429.html=Rhetoric
8 | http://learningenglish.voanews.com/a/3210134.html=Transplant
9 | http://learningenglish.voanews.com/a/3210131.html=Birthright Citizenship
10 | http://learningenglish.voanews.com/a/3210133.html=Holy Grail
11 | http://learningenglish.voanews.com/a/3210129.html=Doping
12 | http://learningenglish.voanews.com/a/3210128.html=Emissions
13 | http://learningenglish.voanews.com/a/3210130.html=Tears of Joy
14 | http://learningenglish.voanews.com/a/3210123.html=Exonerated
15 | http://learningenglish.voanews.com/a/3210122.html=Drought
16 | http://learningenglish.voanews.com/a/3210127.html=Transgender
17 | http://learningenglish.voanews.com/a/3210124.html=Migrants
18 | http://learningenglish.voanews.com/a/2705670.html=Global
19 | http://learningenglish.voanews.com/a/2705669.html=Crude
20 | http://learningenglish.voanews.com/a/2705671.html=Recycling
21 | http://learningenglish.voanews.com/a/2705668.html=Viable
22 | http://learningenglish.voanews.com/a/2705673.html=Incarcerated
23 | http://learningenglish.voanews.com/a/2667269.html=Confidential
24 | http://learningenglish.voanews.com/a/2667266.html=Chaos
25 | http://learningenglish.voanews.com/a/2667265.html=Scope
26 | http://learningenglish.voanews.com/a/2601328.html=Combat
27 | http://learningenglish.voanews.com/a/2601326.html=Glaucoma
28 | http://learningenglish.voanews.com/a/2670607.html=Infrastructure
29 | http://learningenglish.voanews.com/a/2670605.html=Gridlock
30 | http://learningenglish.voanews.com/a/2601327.html=Museum
31 | http://learningenglish.voanews.com/a/2601324.html=Turmoil
32 | http://learningenglish.voanews.com/a/2601325.html=Secure
33 | http://learningenglish.voanews.com/a/2553596.html=Unleashed
34 | http://learningenglish.voanews.com/a/2553581.html=Crippling
35 | http://learningenglish.voanews.com/a/2553587.html=Strategy
36 | http://learningenglish.voanews.com/a/2553567.html=Obligation
37 | http://learningenglish.voanews.com/a/2553576.html=Agenda
38 | http://learningenglish.voanews.com/a/2533606.html=Delicate
39 | http://learningenglish.voanews.com/a/2533607.html=Hawkish
40 | http://learningenglish.voanews.com/a/2533608.html=Looting
41 | http://learningenglish.voanews.com/a/2461984.html=Referendum
42 | http://learningenglish.voanews.com/a/2461990.html=Apprehended
43 | http://learningenglish.voanews.com/a/2461986.html=Ringleader
44 | http://learningenglish.voanews.com/a/2461985.html=Sovereign
45 | http://learningenglish.voanews.com/a/2533611.html=Objective
46 | http://learningenglish.voanews.com/a/2461987.html=Assets
47 | http://learningenglish.voanews.com/a/2670609.html=Significant
48 | http://learningenglish.voanews.com/a/2670610.html=Facilitate
49 | http://learningenglish.voanews.com/a/2511974.html=National Guard
50 | http://learningenglish.voanews.com/a/2533610.html=Symptoms
51 | http://learningenglish.voanews.com/a/2511969.html=Undocumented Workers
52 | http://learningenglish.voanews.com/a/2511971.html=Defiant
53 | http://learningenglish.voanews.com/a/2511972.html=Expedite
54 | http://learningenglish.voanews.com/a/2494865.html=Civil War
55 | http://learningenglish.voanews.com/a/2494868.html=Reconnaisance
56 | http://learningenglish.voanews.com/a/2494867.html=Confine
57 | http://learningenglish.voanews.com/a/2494866.html=Potential
58 | http://learningenglish.voanews.com/a/2494864.html=Analyst
59 | http://learningenglish.voanews.com/a/2479301.html=Rank
60 | http://learningenglish.voanews.com/a/2479300.html=Emergence
61 | http://learningenglish.voanews.com/a/2479297.html=Runoff
62 | http://learningenglish.voanews.com/a/2479298.html=Sporadic
63 | http://learningenglish.voanews.com/a/2479296.html=Fraud
64 | http://learningenglish.voanews.com/a/2438621.html=Solution
65 | http://learningenglish.voanews.com/a/2438625.html=Recall
66 | http://learningenglish.voanews.com/a/2511968.html=Malaria
67 | http://learningenglish.voanews.com/a/2438620.html=Irreversible
68 | http://learningenglish.voanews.com/a/2438618.html=Consensus
69 | http://learningenglish.voanews.com/a/2438617.html=Revision
70 | http://learningenglish.voanews.com/a/2438614.html=Stabilize
71 | http://learningenglish.voanews.com/a/2438612.html=Convoy
72 | http://learningenglish.voanews.com/a/2438615.html=Maritime
73 | http://learningenglish.voanews.com/a/2438613.html=Critical
74 | http://learningenglish.voanews.com/a/2438657.html=Biofuel
75 | http://learningenglish.voanews.com/a/1903494.html=Sectarian
76 | http://learningenglish.voanews.com/a/1903491.html=Regime
77 | http://learningenglish.voanews.com/a/1903493.html=Startup
78 | http://learningenglish.voanews.com/a/1903490.html=Mainstream
79 | http://learningenglish.voanews.com/a/1903492.html=Legislation
80 | http://learningenglish.voanews.com/a/1899356.html=Chemical Weapons
81 | http://learningenglish.voanews.com/a/1899359.html=Accord
82 | http://learningenglish.voanews.com/a/1899357.html=Obamacare
83 | http://learningenglish.voanews.com/a/1899355.html=Pragmatic
84 | http://learningenglish.voanews.com/a/1899354.html=Turmoil
85 | http://learningenglish.voanews.com/a/1893953.html=Casualty
86 | http://learningenglish.voanews.com/a/1893949.html=Unilateral
87 | http://learningenglish.voanews.com/a/1893955.html=Credibility
88 | http://learningenglish.voanews.com/a/1893947.html=Humanitarian
89 | http://learningenglish.voanews.com/a/1893946.html=Investigation
90 | http://learningenglish.voanews.com/a/1890651.html=Insurgency
91 | http://learningenglish.voanews.com/a/1890630.html=Fiscal
92 | http://learningenglish.voanews.com/a/1890653.html=Verification
93 | http://learningenglish.voanews.com/a/1890628.html=Partisan
94 | http://learningenglish.voanews.com/a/1894815.html=Sanctions
95 | http://learningenglish.voanews.com/a/1884224.html=Paparazzi
96 | http://learningenglish.voanews.com/a/1884223.html=Dissident
97 | http://learningenglish.voanews.com/a/1884221.html=NATO
98 | http://learningenglish.voanews.com/a/1884222.html=Censorship
99 | http://learningenglish.voanews.com/a/1884220.html=Consumer
100 | http://learningenglish.voanews.com/a/1879109.html=Furloughed
101 | http://learningenglish.voanews.com/a/1878855.html=Exchange Rate
102 | http://learningenglish.voanews.com/a/1878853.html=Candidate
103 | http://learningenglish.voanews.com/a/1878949.html=Drone
104 | http://learningenglish.voanews.com/a/1878887.html=Bipartisanship
105 | http://learningenglish.voanews.com/a/1874349.html=Recession
106 | http://learningenglish.voanews.com/a/1874348.html=Mortgage
107 | http://learningenglish.voanews.com/a/1874354.html=Supreme Court
108 | http://learningenglish.voanews.com/a/1874352.html=Bilateral
109 | http://learningenglish.voanews.com/a/1874346.html=Summit
110 | http://learningenglish.voanews.com/a/1868773.html=Espionage
111 | http://learningenglish.voanews.com/a/1868824.html=Investment
112 | http://learningenglish.voanews.com/a/1868826.html=House of Representatives
113 | http://learningenglish.voanews.com/a/1868816.html=Indictment
114 | http://learningenglish.voanews.com/a/1868817.html=Encouraging
115 | http://learningenglish.voanews.com/a/1864638.html=Dialogue
116 | http://learningenglish.voanews.com/a/1864614.html=Bankruptcy
117 | http://learningenglish.voanews.com/a/1864598.html=Senate
118 | http://learningenglish.voanews.com/a/1864166.html=Presidential
119 | http://learningenglish.voanews.com/a/1864123.html=Surveillance
120 | http://learningenglish.voanews.com/a/1864102.html=Stock Market
121 | http://learningenglish.voanews.com/a/1859189.html=Embargo
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | ## HtmlExtractor是一个Java实现的基于模板的网页结构化信息精准抽取组件,本身并不包含爬虫功能,但可被爬虫或其他程序调用以便更精准地对网页结构化信息进行抽取。
2 |
3 | ## HtmlExtractor是为大规模分布式环境设计的,采用主从架构,主节点负责维护抽取规则,从节点向主节点请求抽取规则,当抽取规则发生变化,主节点主动通知从节点,从而能实现抽取规则变化之后的实时动态生效。
4 |
5 | ## [捐赠致谢](https://github.com/ysc/QuestionAnsweringSystem/wiki/donation)
6 |
7 | ## 如何使用?
8 |
9 | 使用SeleniumHtmlFetcher需要安装驱动:
10 | FirefoxDriver: brew install geckodriver
11 | ChromeDriver: brew install chromedriver
12 |
13 | HtmlExtractor由2个子项目构成,html-extractor和html-extractor-web。
14 | html-extractor实现了数据抽取逻辑,是从节点,html-extractor-web提供web界面来维护抽取规则,是主节点。
15 | html-extractor是一个jar包,可通过maven引用:
16 | <dependency>
17 | <groupId>org.apdplat</groupId>
18 | <artifactId>html-extractor</artifactId>
19 | <version>1.1</version>
20 | </dependency>
21 | html-extractor-web是一个war包,需要部署到Servlet/Jsp容器上。
22 | 在html-extractor-web目录下运行mvn jetty:run就可以启动Servlet/Jsp容器jetty,之后打开浏览器访问:
23 | http://localhost:8080/html-extractor-web/api/ 查看自己定义的规则。
24 |
25 | 注意:页面模板中定义的所有CSS路径和抽取表达式全部抽取成功,才算抽取成功,
26 | 只要有一个CSS路径或抽取表达式失败,就是抽取失败。
27 |
28 | [如何使用HtmlExtractor实现基于模板的网页结构化信息精准抽取?](http://my.oschina.net/apdplat/blog/402149)
29 |
30 | ## 单机集中式使用方法:
31 |
32 | //1、构造抽取规则
33 | List<UrlPattern> urlPatterns = new ArrayList<>();
34 | //1.1、构造URL模式
35 | UrlPattern urlPattern = new UrlPattern();
36 | urlPattern.setUrlPattern("http://money.163.com/\\d{2}/\\d{4}/\\d{2}/[0-9A-Z]{16}.html");
37 | //1.2、构造HTML模板
38 | HtmlTemplate htmlTemplate = new HtmlTemplate();
39 | htmlTemplate.setTemplateName("网易财经频道");
40 | htmlTemplate.setTableName("finance");
41 | //1.3、将URL模式和HTML模板建立关联
42 | urlPattern.addHtmlTemplate(htmlTemplate);
43 | //1.4、构造CSS路径
44 | CssPath cssPath = new CssPath();
45 | cssPath.setCssPath("h1");
46 | cssPath.setFieldName("title");
47 | cssPath.setFieldDescription("标题");
48 | //1.5、将CSS路径和模板建立关联
49 | htmlTemplate.addCssPath(cssPath);
50 | //1.6、构造CSS路径
51 | cssPath = new CssPath();
52 | cssPath.setCssPath("div#endText");
53 | cssPath.setFieldName("content");
54 | cssPath.setFieldDescription("正文");
55 | //1.7、将CSS路径和模板建立关联
56 | htmlTemplate.addCssPath(cssPath);
57 | //可像上面那样构造多个URL模式
58 | urlPatterns.add(urlPattern);
59 |
60 | //2、获取抽取规则对象
61 | ExtractRegular extractRegular = ExtractRegular.getInstance(urlPatterns);
62 | //注意:可通过如下3个方法动态地改变抽取规则
63 | //extractRegular.addUrlPatterns(urlPatterns);
64 | //extractRegular.addUrlPattern(urlPattern);
65 | //extractRegular.removeUrlPattern(urlPattern.getUrlPattern());
66 |
67 | //3、获取HTML抽取工具
68 | HtmlExtractor htmlExtractor = new DefaultHtmlExtractor(extractRegular);
69 |
70 | //4、抽取网页
71 | String url = "http://money.163.com/08/1219/16/4THR2TMP002533QK.html";
72 | HtmlFetcher htmlFetcher = new JSoupHtmlFetcher();
73 | String html = htmlFetcher.fetch(url);
74 | List<ExtractResult> extractResults = htmlExtractor.extract(url, html);
75 |
76 | //5、输出结果
77 | int i = 1;
78 | for (ExtractResult extractResult : extractResults) {
79 | System.out.println((i++) + "、网页 " + extractResult.getUrl() + " 的抽取结果");
80 | if(!extractResult.isSuccess()){
81 | System.out.println("抽取失败:");
82 | for(ExtractFailLog extractFailLog : extractResult.getExtractFailLogs()){
83 | System.out.println("\turl:"+extractFailLog.getUrl());
84 | System.out.println("\turlPattern:"+extractFailLog.getUrlPattern());
85 | System.out.println("\ttemplateName:"+extractFailLog.getTemplateName());
86 | System.out.println("\tfieldName:"+extractFailLog.getFieldName());
87 | System.out.println("\tfieldDescription:"+extractFailLog.getFieldDescription());
88 | System.out.println("\tcssPath:"+extractFailLog.getCssPath());
89 | if(extractFailLog.getExtractExpression()!=null) {
90 | System.out.println("\textractExpression:" + extractFailLog.getExtractExpression());
91 | }
92 | }
93 | continue;
94 | }
95 | Map<String, List<ExtractResultItem>> extractResultItems = extractResult.getExtractResultItems();
96 | for(String field : extractResultItems.keySet()){
97 | List<ExtractResultItem> values = extractResultItems.get(field);
98 | if(values.size() > 1){
99 | int j=1;
100 | System.out.println("\t多值字段:"+field);
101 | for(ExtractResultItem item : values){
102 | System.out.println("\t\t"+(j++)+"、"+field+" = "+item.getValue());
103 | }
104 | }else{
105 | System.out.println("\t"+field+" = "+values.get(0).getValue());
106 | }
107 | }
108 | System.out.println("\tdescription = "+extractResult.getDescription());
109 | System.out.println("\tkeywords = "+extractResult.getKeywords());
110 | }
111 |
112 | ## 多机分布式使用方法:
113 |
114 | 1、运行主节点,负责维护抽取规则:
115 | 方法一:在html-extractor-web目录下运行mvn jetty:run 。
116 | 方法二:在html-extractor-web目录下运行mvn install ,
117 | 然后将target/html-extractor-web-1.1.war部署到Tomcat。
118 |
119 | 2、获取一个HtmlExtractor的实例(从节点),示例代码如下:
120 | String allExtractRegularUrl = "http://localhost:8080/html-extractor-web/api/all_extract_regular.jsp";
121 | String redisHost = "localhost";
122 | int redisPort = 6379;
123 | ExtractRegular extractRegular = ExtractRegular.getInstance(allExtractRegularUrl, redisHost, redisPort);
124 | HtmlExtractor htmlExtractor = new DefaultHtmlExtractor(extractRegular);
125 |
126 | 3、抽取信息,示例代码如下:
127 | String url = "http://money.163.com/08/1219/16/4THR2TMP002533QK.html";
128 | HtmlFetcher htmlFetcher = new JSoupHtmlFetcher();
129 | String html = htmlFetcher.fetch(url);
130 | List<ExtractResult> extractResults = htmlExtractor.extract(url, html);
131 |
132 | int i = 1;
133 | for (ExtractResult extractResult : extractResults) {
134 | System.out.println((i++) + "、网页 " + extractResult.getUrl() + " 的抽取结果");
135 | if(!extractResult.isSuccess()){
136 | System.out.println("抽取失败:");
137 | for(ExtractFailLog extractFailLog : extractResult.getExtractFailLogs()){
138 | System.out.println("\turl:"+extractFailLog.getUrl());
139 | System.out.println("\turlPattern:"+extractFailLog.getUrlPattern());
140 | System.out.println("\ttemplateName:"+extractFailLog.getTemplateName());
141 | System.out.println("\tfieldName:"+extractFailLog.getFieldName());
142 | System.out.println("\tfieldDescription:"+extractFailLog.getFieldDescription());
143 | System.out.println("\tcssPath:"+extractFailLog.getCssPath());
144 | if(extractFailLog.getExtractExpression()!=null) {
145 | System.out.println("\textractExpression:" + extractFailLog.getExtractExpression());
146 | }
147 | }
148 | continue;
149 | }
150 | for(ExtractResultItem extractResultItem : extractResult.getExtractResultItems()){
151 | System.out.print("\t"+extractResultItem.getField()+" = "+extractResultItem.getValue());
152 | }
153 | System.out.println("\tdescription = "+extractResult.getDescription());
154 | System.out.println("\tkeywords = "+extractResult.getKeywords());
155 | }
156 |
157 | [https://travis-ci.org/ysc/HtmlExtractor](https://travis-ci.org/ysc/HtmlExtractor)
158 |
--------------------------------------------------------------------------------
/html-extractor/src/main/resources/voa/EverydayGrammar.txt:
--------------------------------------------------------------------------------
1 | http://learningenglish.voanews.com/a/everyday-grammar-relative-pronouns/2793920.html=Relative Pronouns
2 | http://learningenglish.voanews.com/a/everyday-grammar-may-might-must-modals-certainty/2887387.html=May Might Must - Modals of Certainty
3 | http://learningenglish.voanews.com/a/modals-permission-everyday-grammar/3355585.html=Modals for Asking Permission
4 | http://learningenglish.voanews.com/a/you-really-should-learn-modals-everyday-gramamr/3355517.html=You Really Should Learn Modals
5 | http://learningenglish.voanews.com/a/grammar-demonstrative-pronouns-determiners/3347315.html=Demonstrating How to Use Demonstratives
6 | http://learningenglish.voanews.com/a/are-you-how-you-talk/3337552.html=Are You How You Talk
7 | http://learningenglish.voanews.com/a/who-makes-grammar-rules/3325780.html=Who Makes Grammar Rules
8 | http://learningenglish.voanews.com/a/everyday-grammar-commonly-confused-words-part-three-homophones/3317204.html=Commonly Confused Words Part 3 Homophones
9 | http://learningenglish.voanews.com/a/everyday-grammar-commonly-confused-words-part-two/3307049.html=Commonly Confused Words Part Two
10 | http://learningenglish.voanews.com/a/everyday-grammar-commonly-confused-words-week-one/3294436.html=Commonly Confused Words Part One
11 | http://learningenglish.voanews.com/a/they-say-reported-speech-is-easy/3280282.html=They Say That Reported Speech Is Easy
12 | http://learningenglish.voanews.com/a/identify-with-relative-pronouns/3261879.html=Identify With Relative Pronouns
13 | http://learningenglish.voanews.com/a/changing-prepositions-with-provide/3259606.html=Changing Prepositions With Provide
14 | http://learningenglish.voanews.com/a/using-the-passive-voice/3247545.html=Using the Passive Voice
15 | http://learningenglish.voanews.com/a/everyday-grammar-double-negatives/2743416.html=The Story of the Double Negative
16 | http://learningenglish.voanews.com/a/using-right-article-everyday-grammar/2819461.html=Using the Right Article
17 | http://learningenglish.voanews.com/a/everyday-grammar-making-wishes/3218288.html=Do You Wish You Knew Better Grammar
18 | http://learningenglish.voanews.com/a/how-much-do-you-know-about-quantifiers/3206680.html=How Much Do You Know about Quantifiers
19 | http://learningenglish.voanews.com/a/everyday-grammar-understanding-noncount-nouns/3193621.html=Understanding Noncount Nouns
20 | http://learningenglish.voanews.com/a/past-unreal-conditionals/3181755.html=Past Unreal Conditionals
21 | http://learningenglish.voanews.com/a/if-you-learn-conditionals-be-glad/3173342.html=If You Learn Conditionals You ll Be Glad You Did
22 | http://learningenglish.voanews.com/a/improve-writing-contrast-concession/3163659.html=Improve Your Writing with Contrast and Concession
23 | http://learningenglish.voanews.com/a/perfect-progressive-tenses-everyday-grammar/3141901.html=The Perfect Progressive Tenses
24 | http://learningenglish.voanews.com/a/everyday-grammar-have-you-perfected-the-perfect-tenses/3137265.html=Have You Perfected the Perfect Tenses
25 | http://learningenglish.voanews.com/a/everyday-grammar-are-you-progressing-with-the-progressive--tenses/3131962.html=Are You Progressing with Progressive Tenses
26 | http://learningenglish.voanews.com/a/introduction-to-verb-tenses-everyday-grammar/3123576.html=An Introduction to Verb Tenses
27 | http://learningenglish.voanews.com/a/getting-to-know-gerunds-and-infinitives/3111996.html=Getting to Know Gerunds and Infinitives
28 | http://learningenglish.voanews.com/a/everyday-grammar-should-vs-shall/3107315.html=The Should vs Shall Debate
29 | http://learningenglish.voanews.com/a/for-or-since-what-is-the-difference/3097366.html=For or Since What Is the Difference
30 | http://learningenglish.voanews.com/a/phrasal-verbs-to-help-you-with-technology/3085650.html=30 Phrasal Verbs to Help You With Technology
31 | http://learningenglish.voanews.com/a/learn-prepositions-in-on-at/3073690.html=Are You In On or At Prepositions that Tell of Time and Place
32 | http://learningenglish.voanews.com/a/six-difference-between-britsh-and-american-english/3063743.html=Six Differences Between British and American English
33 | http://learningenglish.voanews.com/a/everyday-grammar-three-grammar-rules-that-are-dying/3053579.html=Everyday Grammar 3 Grammar Rules That Are Dying
34 | http://learningenglish.voanews.com/a/top-10-separable-phrasal-verbs/3041841.html=Everyday Grammar Our Top 10 Separable Phrasal Verbs
35 | http://learningenglish.voanews.com/a/everyday-grammar-using-transitions-for-smoother-writing/3029586.html=Everyday Grammar Make Your Writing Smoother with Transitions
36 | http://learningenglish.voanews.com/a/everyday-grammar-pow-whizz-what-are-onomatopoeia/3018658.html=Pow Whizz What Are Onomatopoeia
37 | http://learningenglish.voanews.com/a/everyday-grammar-introducing-phrasal-verbs/3010251.html=Everyday Grammar Introducing Phrasal Verbs
38 | http://learningenglish.voanews.com/a/everyday-grammar-when-nouns-act-like-adjectives/2998821.html=Everyday Grammar When Nouns Act Like Adjectives
39 | http://learningenglish.voanews.com/a/everyday-grammar-comparatives-superlatives/2989386.html=Everyday Grammar Comparatives and Superlatives
40 | http://learningenglish.voanews.com/a/everyday-grammar-do-does-you-understand-subject-verb-agreement/2977592.html=Everyday Grammar Do/Does You Understand Subject-Verb Agreement
41 | http://learningenglish.voanews.com/a/unusual-plurals-everyday-grammar/2968871.html=Everyday Grammar Unusual Plurals
42 | http://learningenglish.voanews.com/a/tag-questions-are-easy-arent-they-everyday-grammar/2956417.html=Everyday Grammar Tag Questions Are Easy Aren't They
43 | http://learningenglish.voanews.com/a/everyday-grammar-relative-adverbs/2944081.html=Everyday Grammar Three Reasons to Learn Relative Adverbs
44 | http://learningenglish.voanews.com/a/everyday-grammar-fun-with-future-tenses/2935173.html=Fun with Future Tenses
45 | http://learningenglish.voanews.com/a/everyday-grammar-we-suggest-that-you-learn-the-subjunctive/2925403.html=Everyday Grammar We Suggest That You Learn the Subjunctive
46 | http://learningenglish.voanews.com/a/everyday-grammar-the-sounds-of-grammar-betty-azar/2916335.html=Everyday Grammar The Sounds of Grammar with Betty Azar
47 | http://learningenglish.voanews.com/a/are-causatives-making-you-crazy-everyday-grammar/2903050.html=Are Causatives Making You Crazy
48 | http://learningenglish.voanews.com/a/everyday-grammar-modals-permission-can-may/2877141.html=Can I Could I May I
49 | http://learningenglish.voanews.com/a/everyday-grammar-you-had-better-learn-modals/2865365.html=Everyday Grammar You Had Better Learn Modals
50 | http://learningenglish.voanews.com/a/everyday-grammar-reported-speech/2856671.html=Everyday Grammar Mastering Reported Speech
51 | http://learningenglish.voanews.com/a/beating-problems-with-adverbs-everyday-grammar/2843494.html=Everyday Grammar Beating Problems with Adverbs
52 | http://learningenglish.voanews.com/a/everyday-grammar-words-come-and-go-in-english/2832644.html=Words Come and Go in English
53 | http://learningenglish.voanews.com/a/everyday-grammar-when-passive-is-better-than-active/2825976.html=Everyday Grammar When Passive Is Better than Active
54 | http://learningenglish.voanews.com/a/everyday-grammar-past-unreal-conditional-mixed-conditional/2809016.html=Everyday Grammar Advanced Conditionals
55 | http://learningenglish.voanews.com/a/everyday-grammar-introducing-conditionals/2778457.html=Everyday Grammar Introducing Conditionals
56 | http://learningenglish.voanews.com/a/problems-with-pronouns-and-gender/2770727.html=Problems with Pronouns and Gender
57 | http://learningenglish.voanews.com/a/everyday-grammar-simple-past-and-present-perfect/2752310.html=Simple Past and Present Perfect
58 | http://learningenglish.voanews.com/a/prepositions-time-place-everyday-grammar-in-on-at/2732061.html=Everyday Grammar In On and At
59 | http://learningenglish.voanews.com/a/everyday-grammar-gerunds-infinitives/2722827.html=Everyday Grammar Gerunds and Infinitives
60 | http://learningenglish.voanews.com/a/everyday-grammar-prepositions-provide/2701412.html=Everyday Grammar Put Prepositions in Their Place
61 | http://learningenglish.voanews.com/a/everyday-grammar-subject-object-pronouns/2674867.html=Can You Correct Her and I
--------------------------------------------------------------------------------
/mvnw:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 | # ----------------------------------------------------------------------------
3 | # Licensed to the Apache Software Foundation (ASF) under one
4 | # or more contributor license agreements. See the NOTICE file
5 | # distributed with this work for additional information
6 | # regarding copyright ownership. The ASF licenses this file
7 | # to you under the Apache License, Version 2.0 (the
8 | # "License"); you may not use this file except in compliance
9 | # with the License. You may obtain a copy of the License at
10 | #
11 | # http://www.apache.org/licenses/LICENSE-2.0
12 | #
13 | # Unless required by applicable law or agreed to in writing,
14 | # software distributed under the License is distributed on an
15 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
16 | # KIND, either express or implied. See the License for the
17 | # specific language governing permissions and limitations
18 | # under the License.
19 | # ----------------------------------------------------------------------------
20 |
21 | # ----------------------------------------------------------------------------
22 | # Maven2 Start Up Batch script
23 | #
24 | # Required ENV vars:
25 | # ------------------
26 | # JAVA_HOME - location of a JDK home dir
27 | #
28 | # Optional ENV vars
29 | # -----------------
30 | # M2_HOME - location of maven2's installed home dir
31 | # MAVEN_OPTS - parameters passed to the Java VM when running Maven
32 | # e.g. to debug Maven itself, use
33 | # set MAVEN_OPTS=-Xdebug -Xrunjdwp:transport=dt_socket,server=y,suspend=y,address=8000
34 | # MAVEN_SKIP_RC - flag to disable loading of mavenrc files
35 | # ----------------------------------------------------------------------------
36 |
37 | if [ -z "$MAVEN_SKIP_RC" ] ; then
38 |
39 | if [ -f /etc/mavenrc ] ; then
40 | . /etc/mavenrc
41 | fi
42 |
43 | if [ -f "$HOME/.mavenrc" ] ; then
44 | . "$HOME/.mavenrc"
45 | fi
46 |
47 | fi
48 |
49 | # OS specific support. $var _must_ be set to either true or false.
50 | cygwin=false;
51 | darwin=false;
52 | mingw=false
53 | case "`uname`" in
54 | CYGWIN*) cygwin=true ;;
55 | MINGW*) mingw=true;;
56 | Darwin*) darwin=true
57 | #
58 | # Look for the Apple JDKs first to preserve the existing behaviour, and then look
59 | # for the new JDKs provided by Oracle.
60 | #
61 | if [ -z "$JAVA_HOME" ] && [ -L /System/Library/Frameworks/JavaVM.framework/Versions/CurrentJDK ] ; then
62 | #
63 | # Apple JDKs
64 | #
65 | export JAVA_HOME=/System/Library/Frameworks/JavaVM.framework/Versions/CurrentJDK/Home
66 | fi
67 |
68 | if [ -z "$JAVA_HOME" ] && [ -L /System/Library/Java/JavaVirtualMachines/CurrentJDK ] ; then
69 | #
70 | # Apple JDKs
71 | #
72 | export JAVA_HOME=/System/Library/Java/JavaVirtualMachines/CurrentJDK/Contents/Home
73 | fi
74 |
75 | if [ -z "$JAVA_HOME" ] && [ -L "/Library/Java/JavaVirtualMachines/CurrentJDK" ] ; then
76 | #
77 | # Oracle JDKs
78 | #
79 | export JAVA_HOME=/Library/Java/JavaVirtualMachines/CurrentJDK/Contents/Home
80 | fi
81 |
82 | if [ -z "$JAVA_HOME" ] && [ -x "/usr/libexec/java_home" ]; then
83 | #
84 | # Apple JDKs
85 | #
86 | export JAVA_HOME=`/usr/libexec/java_home`
87 | fi
88 | ;;
89 | esac
90 |
91 | if [ -z "$JAVA_HOME" ] ; then
92 | if [ -r /etc/gentoo-release ] ; then
93 | JAVA_HOME=`java-config --jre-home`
94 | fi
95 | fi
96 |
97 | if [ -z "$M2_HOME" ] ; then
98 | ## resolve links - $0 may be a link to maven's home
99 | PRG="$0"
100 |
101 | # need this for relative symlinks
102 | while [ -h "$PRG" ] ; do
103 | ls=`ls -ld "$PRG"`
104 | link=`expr "$ls" : '.*-> \(.*\)$'`
105 | if expr "$link" : '/.*' > /dev/null; then
106 | PRG="$link"
107 | else
108 | PRG="`dirname "$PRG"`/$link"
109 | fi
110 | done
111 |
112 | saveddir=`pwd`
113 |
114 | M2_HOME=`dirname "$PRG"`/..
115 |
116 | # make it fully qualified
117 | M2_HOME=`cd "$M2_HOME" && pwd`
118 |
119 | cd "$saveddir"
120 | # echo Using m2 at $M2_HOME
121 | fi
122 |
123 | # For Cygwin, ensure paths are in UNIX format before anything is touched
124 | if $cygwin ; then
125 | [ -n "$M2_HOME" ] &&
126 | M2_HOME=`cygpath --unix "$M2_HOME"`
127 | [ -n "$JAVA_HOME" ] &&
128 | JAVA_HOME=`cygpath --unix "$JAVA_HOME"`
129 | [ -n "$CLASSPATH" ] &&
130 | CLASSPATH=`cygpath --path --unix "$CLASSPATH"`
131 | fi
132 |
133 | # For Mingw, ensure paths are in UNIX format before anything is touched
134 | if $mingw ; then
135 | [ -n "$M2_HOME" ] &&
136 | M2_HOME="`(cd "$M2_HOME"; pwd)`"
137 | [ -n "$JAVA_HOME" ] &&
138 | JAVA_HOME="`(cd "$JAVA_HOME"; pwd)`"
139 | # TODO classpath?
140 | fi
141 |
142 | if [ -z "$JAVA_HOME" ]; then
143 | javaExecutable="`which javac`"
144 | if [ -n "$javaExecutable" ] && ! [ "`expr \"$javaExecutable\" : '\([^ ]*\)'`" = "no" ]; then
145 | # readlink(1) is not available as standard on Solaris 10.
146 | readLink=`which readlink`
147 | if [ ! `expr "$readLink" : '\([^ ]*\)'` = "no" ]; then
148 | if $darwin ; then
149 | javaHome="`dirname \"$javaExecutable\"`"
150 | javaExecutable="`cd \"$javaHome\" && pwd -P`/javac"
151 | else
152 | javaExecutable="`readlink -f \"$javaExecutable\"`"
153 | fi
154 | javaHome="`dirname \"$javaExecutable\"`"
155 | javaHome=`expr "$javaHome" : '\(.*\)/bin'`
156 | JAVA_HOME="$javaHome"
157 | export JAVA_HOME
158 | fi
159 | fi
160 | fi
161 |
162 | if [ -z "$JAVACMD" ] ; then
163 | if [ -n "$JAVA_HOME" ] ; then
164 | if [ -x "$JAVA_HOME/jre/sh/java" ] ; then
165 | # IBM's JDK on AIX uses strange locations for the executables
166 | JAVACMD="$JAVA_HOME/jre/sh/java"
167 | else
168 | JAVACMD="$JAVA_HOME/bin/java"
169 | fi
170 | else
171 | JAVACMD="`which java`"
172 | fi
173 | fi
174 |
175 | if [ ! -x "$JAVACMD" ] ; then
176 | echo "Error: JAVA_HOME is not defined correctly." >&2
177 | echo " We cannot execute $JAVACMD" >&2
178 | exit 1
179 | fi
180 |
181 | if [ -z "$JAVA_HOME" ] ; then
182 | echo "Warning: JAVA_HOME environment variable is not set."
183 | fi
184 |
185 | CLASSWORLDS_LAUNCHER=org.codehaus.plexus.classworlds.launcher.Launcher
186 |
187 | # For Cygwin, switch paths to Windows format before running java
188 | if $cygwin; then
189 | [ -n "$M2_HOME" ] &&
190 | M2_HOME=`cygpath --path --windows "$M2_HOME"`
191 | [ -n "$JAVA_HOME" ] &&
192 | JAVA_HOME=`cygpath --path --windows "$JAVA_HOME"`
193 | [ -n "$CLASSPATH" ] &&
194 | CLASSPATH=`cygpath --path --windows "$CLASSPATH"`
195 | fi
196 |
# Walks upward from the current working directory toward the filesystem root
# and prints the first directory that contains a .mvn subdirectory; that
# directory is treated as the project base directory. When no ancestor has a
# .mvn subdirectory, the current working directory is printed instead.
#
# Takes no arguments. Output: the chosen directory on stdout.
#
# The body runs in a subshell "( ... )" so that basedir/wdir/parent stay
# private to the function without relying on `local`, which POSIX sh does not
# specify and which word-splits unquoted assignments in some shells.
find_maven_basedir() (
  basedir=$(pwd)
  wdir=$basedir
  while [ "$wdir" != '/' ] ; do
    if [ -d "$wdir/.mvn" ] ; then
      basedir=$wdir
      break
    fi
    # Stop when the parent cannot be entered or no progress is made; without
    # this check a failing cd would yield the starting directory again and the
    # loop would never terminate.
    parent=$(cd "$wdir/.." 2>/dev/null && pwd) || break
    [ "$parent" != "$wdir" ] || break
    wdir=$parent
  done
  # printf instead of echo so that backslashes in the path are not interpreted.
  printf '%s\n' "$basedir"
)
211 |
# Prints the content of the file named by $1 as a single line: every run of
# newlines is turned into one space (tr -s also squeezes repeated spaces).
# Prints nothing and succeeds when $1 is not a regular file.
concat_lines() {
  # Guard clause: nothing to do for a missing or non-regular file.
  [ -f "$1" ] || return 0
  echo "$(tr -s '\n' ' ' < "$1")"
}
218 |
219 | export MAVEN_PROJECTBASEDIR=${MAVEN_BASEDIR:-$(find_maven_basedir)}
220 | MAVEN_OPTS="$(concat_lines "$MAVEN_PROJECTBASEDIR/.mvn/jvm.config") $MAVEN_OPTS"
221 |
222 | # Provide a "standardized" way to retrieve the CLI args that will
223 | # work with both Windows and non-Windows executions.
224 | MAVEN_CMD_LINE_ARGS="$MAVEN_CONFIG $@"
225 | export MAVEN_CMD_LINE_ARGS
226 |
227 | WRAPPER_LAUNCHER=org.apache.maven.wrapper.MavenWrapperMain
228 |
229 | exec "$JAVACMD" \
230 | $MAVEN_OPTS \
231 | -classpath "$MAVEN_PROJECTBASEDIR/.mvn/wrapper/maven-wrapper.jar" \
232 | "-Dmaven.home=${M2_HOME}" "-Dmaven.multiModuleProjectDirectory=${MAVEN_PROJECTBASEDIR}" \
233 | ${WRAPPER_LAUNCHER} "$@"
234 |
--------------------------------------------------------------------------------
/html-extractor-web/pom.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 | 4.0.0
4 |
5 | org.apdplat
6 | html-extractor-web
7 | 1.1
8 | war
9 |
10 |
11 | html-extractor-web
12 | https://github.com/ysc/HtmlExtractor
13 |
14 | html-extractor-web是html-extractor的web接口,负责维护抽取规则。
15 |
16 |
17 | APDPlat
18 | http://apdplat.org/
19 |
20 |
21 |
22 | GNU GENERAL PUBLIC LICENSE, Version 3
23 | http://www.gnu.org/licenses/gpl.html
24 |
25 |
26 | 2014
27 |
28 | https://github.com/ysc/HtmlExtractor
29 | scm:git:git://github.com/ysc/HtmlExtractor.git
30 | scm:git:ssh://git@github.com/ysc/HtmlExtractor.git
31 | HEAD
32 |
33 |
34 |
35 | 杨尚川
36 | ysc@apdplat.org
37 | http://yangshangchuan.iteye.com
38 |
39 |
40 |
41 |
42 |
43 |
44 | org.apache.maven.plugins
45 | maven-war-plugin
46 | ${maven-war-plugin.version}
47 |
48 |
49 |
50 | org.apache.maven.plugins
51 | maven-compiler-plugin
52 | ${maven-compiler-plugin.version}
53 |
54 | ${project.build.sourceEncoding}
55 | ${java.version}
56 | ${java.version}
57 | true
58 | true
59 | true
60 |
61 |
62 |
63 |
64 | org.apache.maven.plugins
65 | maven-jar-plugin
66 | ${maven-jar-plugin.version}
67 |
68 |
69 |
70 | org.apache.maven.plugins
71 | maven-surefire-plugin
72 | ${maven-surefire-plugin.version}
73 |
74 | true
75 |
76 |
77 |
78 |
79 | org.apache.maven.plugins
80 | maven-resources-plugin
81 | ${maven-resources-plugin.version}
82 |
83 | ${project.build.sourceEncoding}
84 |
85 |
86 |
87 |
88 | org.apache.maven.plugins
89 | maven-javadoc-plugin
90 | ${maven-javadoc-plugin.version}
91 |
92 |
93 | attach-docs
94 |
95 | jar
96 |
97 |
98 |
99 |
100 |
101 |
102 | maven-source-plugin
103 | ${maven-source-plugin.version}
104 |
105 |
106 | attach-sources
107 |
108 | jar
109 |
110 |
111 |
112 |
113 |
114 | org.mortbay.jetty
115 | maven-jetty-plugin
116 | ${maven-jetty-plugin.version}
117 |
118 |
119 |
120 |
121 | org.codehaus.sonar
122 | sonar-maven3-plugin
123 | ${sonar-maven3-plugin.version}
124 |
125 |
126 |
127 |
128 | 1.8
129 | UTF-8
130 |
131 | 2.1.1
132 | 3.0
133 | 2.4
134 | 2.14
135 | 2.6
136 | 2.9.1
137 | 2.2.1
138 | 3.5
139 | 6.1.26
140 |
141 | 4.11
142 | 7.0
143 | 5.1.18
144 | 1.6.4
145 | 0.9.28
146 | 2.5.1
147 | 2.6
148 | 1.9.13
149 |
150 |
151 |
152 |
153 | junit
154 | junit
155 | ${junit.version}
156 | test
157 |
158 |
159 |
160 | javax
161 | javaee-api
162 | ${javaee-api.version}
163 | provided
164 |
165 |
166 |
167 | mysql
168 | mysql-connector-java
169 | ${mysql.version}
170 |
171 |
172 |
173 | org.slf4j
174 | slf4j-api
175 | ${slf4j-api.version}
176 |
177 |
178 |
179 | ch.qos.logback
180 | logback-classic
181 | ${logback-classic.version}
182 |
183 |
184 | commons-logging
185 | commons-logging
186 |
187 |
188 | runtime
189 |
190 |
191 |
192 | redis.clients
193 | jedis
194 | ${jedis.version}
195 |
196 |
197 | commons-lang
198 | commons-lang
199 | ${commons-lang.version}
200 |
201 |
202 | org.codehaus.jackson
203 | jackson-mapper-asl
204 | ${jackson.version}
205 |
206 |
207 |
208 |
--------------------------------------------------------------------------------
/html-extractor/src/main/resources/voa/EnglishAtTheMovies.txt:
--------------------------------------------------------------------------------
1 | http://learningenglish.voanews.com/a/4515733.html=Put Yourself Out There
2 | http://learningenglish.voanews.com/a/4515732.html=Cast a Very Long Shadow
3 | http://learningenglish.voanews.com/a/4515734.html=Shut This Whole Thing Down
4 | http://learningenglish.voanews.com/a/4403351.html=Holding Your Breath
5 | http://learningenglish.voanews.com/a/4403350.html=Hold You In Contempt
6 | http://learningenglish.voanews.com/a/4403346.html=Scared Out Of Her Mind
7 | http://learningenglish.voanews.com/a/4403348.html=Damaged Goods
8 | http://learningenglish.voanews.com/a/4403347.html=Not My Cup Of Tea
9 | http://learningenglish.voanews.com/a/4403345.html=You In?
10 | http://learningenglish.voanews.com/a/4403340.html=Took Out
11 | http://learningenglish.voanews.com/a/4403339.html=Down The Tubes
12 | http://learningenglish.voanews.com/a/4403341.html=Felt Closer To
13 | http://learningenglish.voanews.com/a/4403342.html=Rust Bucket
14 | http://learningenglish.voanews.com/a/4278055.html=Up A Notch
15 | http://learningenglish.voanews.com/a/4278047.html=Pick Up Where He Left Off
16 | http://learningenglish.voanews.com/a/4278052.html=My Head Is Pounding
17 | http://learningenglish.voanews.com/a/4278048.html=Leaving To Chance
18 | http://learningenglish.voanews.com/a/4278046.html=Hold Them Accountable
19 | http://learningenglish.voanews.com/a/4278039.html=Get Me
20 | http://learningenglish.voanews.com/a/4278043.html=Even The Playing Field
21 | http://learningenglish.voanews.com/a/4278041.html=Cut It Out
22 | http://learningenglish.voanews.com/a/4278040.html=Bounce Back
23 | http://learningenglish.voanews.com/a/4218427.html=Suicide Mission
24 | http://learningenglish.voanews.com/a/4218437.html=See You On The Other Side
25 | http://learningenglish.voanews.com/a/4194269.html=Master Of Disguise
26 | http://learningenglish.voanews.com/a/4194274.html=Let You Down
27 | http://learningenglish.voanews.com/a/4194268.html=I've Lost My Way
28 | http://learningenglish.voanews.com/a/4194280.html=I Dig It
29 | http://learningenglish.voanews.com/a/4194259.html=Give Him A Break
30 | http://learningenglish.voanews.com/a/4194261.html=Dump Me
31 | http://learningenglish.voanews.com/a/4194264.html=Blast From The Past
32 | http://learningenglish.voanews.com/a/4194262.html=Act Out
33 | http://learningenglish.voanews.com/a/3093779.html=Cooked Up
34 | http://learningenglish.voanews.com/a/3207722.html=Gear Up
35 | http://learningenglish.voanews.com/a/3207728.html=Waste Of Time
36 | http://learningenglish.voanews.com/a/3093784.html=Scaredy-cat
37 | http://learningenglish.voanews.com/a/3093783.html=Put Your Foot Down
38 | http://learningenglish.voanews.com/a/3093775.html=A Few Tricks Up Her Sleeve
39 | http://learningenglish.voanews.com/a/3207737.html=Pick A Fight
40 | http://learningenglish.voanews.com/a/4072135.html=Don't Jinx Me
41 | http://learningenglish.voanews.com/a/4072139.html=We're Broke
42 | http://learningenglish.voanews.com/a/4072137.html=A Little Off
43 | http://learningenglish.voanews.com/a/4072134.html=Take Back
44 | http://learningenglish.voanews.com/a/4072136.html=Come Along
45 | http://learningenglish.voanews.com/a/4017445.html=Hand-outs
46 | http://learningenglish.voanews.com/a/4017452.html=Stand A Chance
47 | http://learningenglish.voanews.com/a/4017446.html=A Keeper
48 | http://learningenglish.voanews.com/a/4017448.html=Getting Ahead of Myself
49 | http://learningenglish.voanews.com/a/4017449.html=Crushing It
50 | http://learningenglish.voanews.com/a/3930642.html=Wait For Backup
51 | http://learningenglish.voanews.com/a/3930638.html=Fire Burning In Our Bellies
52 | http://learningenglish.voanews.com/a/3930647.html=Where Do You See Yourself In Five Years?
53 | http://learningenglish.voanews.com/a/3930639.html=Fit In
54 | http://learningenglish.voanews.com/a/3930641.html=Takes Your Breath Away
55 | http://learningenglish.voanews.com/a/3930646.html=You Got This
56 | http://learningenglish.voanews.com/a/3930645.html=You Will Soon Pay
57 | http://learningenglish.voanews.com/a/3930644.html=Go Undercover
58 | http://learningenglish.voanews.com/a/3930643.html=See Fit
59 | http://learningenglish.voanews.com/a/3930640.html=It's Shady
60 | http://learningenglish.voanews.com/a/3818369.html=Time Flies When You're Having fun
61 | http://learningenglish.voanews.com/a/3823000.html=Something Fishy
62 | http://learningenglish.voanews.com/a/3823001.html=Turn Back The Clock
63 | http://learningenglish.voanews.com/a/3822997.html=Jack Up Our Price
64 | http://learningenglish.voanews.com/a/3822994.html=Gifted
65 | http://learningenglish.voanews.com/a/3822998.html=Practice Run
66 | http://learningenglish.voanews.com/a/3822995.html=Let's Roll
67 | http://learningenglish.voanews.com/a/3822999.html=Set Things Right
68 | http://learningenglish.voanews.com/a/3822996.html=Fishing Around
69 | http://learningenglish.voanews.com/a/3818370.html=Got Our Work Cut Out For Us
70 | http://learningenglish.voanews.com/a/3725681.html=Piece Of Cake
71 | http://learningenglish.voanews.com/a/3725689.html=Crooked Cops
72 | http://learningenglish.voanews.com/a/3725680.html=I'm Really Freaking Out
73 | http://learningenglish.voanews.com/a/3725688.html=Ahead Of Your Time
74 | http://learningenglish.voanews.com/a/3725679.html=Power Nap
75 | http://learningenglish.voanews.com/a/3725674.html=I'm From A Different Planet
76 | http://learningenglish.voanews.com/a/3725682.html=To Take Charge
77 | http://learningenglish.voanews.com/a/3725678.html=It's Revolutionary
78 | http://learningenglish.voanews.com/a/3725675.html=Hitting Rock Bottom
79 | http://learningenglish.voanews.com/a/3725683.html=Write Your Own Rules
80 | http://learningenglish.voanews.com/a/3624832.html=You're A Piece Of Work
81 | http://learningenglish.voanews.com/a/3624830.html=My Heart Was Broken
82 | http://learningenglish.voanews.com/a/3624829.html=Lost Their Way
83 | http://learningenglish.voanews.com/a/3624834.html=You Are Having Visions
84 | http://learningenglish.voanews.com/a/3624825.html=Get Yourself Lawyered Up
85 | http://learningenglish.voanews.com/a/3624824.html=A Chain Reaction
86 | http://learningenglish.voanews.com/a/3624823.html=Fresh Start
87 | http://learningenglish.voanews.com/a/3624831.html=There's Something Off About Them
88 | http://learningenglish.voanews.com/a/3624826.html=Brace For Impact
89 | http://learningenglish.voanews.com/a/3624828.html=I am Dead Meat
90 | http://learningenglish.voanews.com/a/3529894.html=It's Crunch Time
91 | http://learningenglish.voanews.com/a/3529901.html=Not Everyone Is Wired
92 | http://learningenglish.voanews.com/a/3529902.html=The Cost of Doing Business
93 | http://learningenglish.voanews.com/a/3529904.html=We've Been Hacked
94 | http://learningenglish.voanews.com/a/3529896.html=Clean Record
95 | http://learningenglish.voanews.com/a/3529898.html=Give Me A Couple Pointers
96 | http://learningenglish.voanews.com/a/3529899.html=Make Things Right
97 | http://learningenglish.voanews.com/a/3529903.html=They Tracked You
98 | http://learningenglish.voanews.com/a/3529900.html=It's Not Cool
99 | http://learningenglish.voanews.com/a/3529897.html=I've Lost Direction
100 | http://learningenglish.voanews.com/a/3428747.html=More To You Than Meets The Eye
101 | http://learningenglish.voanews.com/a/3428759.html=You're A Legend
102 | http://learningenglish.voanews.com/a/3428746.html=I'm Just Ordinary
103 | http://learningenglish.voanews.com/a/3428743.html=What These Newcomers' Intentions Are
104 | http://learningenglish.voanews.com/a/3428758.html=Word Travels
105 | http://learningenglish.voanews.com/a/3428764.html=White Collar Crime
106 | http://learningenglish.voanews.com/a/3428749.html=Somebody Got The Better Of Us
107 | http://learningenglish.voanews.com/a/3428742.html=Hold His Feet To The Fire
108 | http://learningenglish.voanews.com/a/3428752.html=Watch Their Back
109 | http://learningenglish.voanews.com/a/3428741.html=There's A Lot At Stake
110 | http://learningenglish.voanews.com/a/3318586.html=Stepping Up
111 | http://learningenglish.voanews.com/a/3318577.html=Get Her Digits
112 | http://learningenglish.voanews.com/a/3318576.html=Signed Up For
113 | http://learningenglish.voanews.com/a/3318591.html=Swallow Your Pride
114 | http://learningenglish.voanews.com/a/3318585.html=Playing with Fire
115 | http://learningenglish.voanews.com/a/3318588.html=Showing Off
116 | http://learningenglish.voanews.com/a/3318578.html=Make Them Pay
117 | http://learningenglish.voanews.com/a/3318579.html=Justice is About to be Served
118 | http://learningenglish.voanews.com/a/3318584.html=Not Giving Up
119 | http://learningenglish.voanews.com/a/3318592.html=Stop at Nothing
120 | http://learningenglish.voanews.com/a/3207730.html=Make Your Case
121 | http://learningenglish.voanews.com/a/3207721.html=Come To Your Senses
122 | http://learningenglish.voanews.com/a/3207723.html=Make It Count
123 | http://learningenglish.voanews.com/a/3207729.html=Sounds Screwy
124 | http://learningenglish.voanews.com/a/3207731.html=So Long
125 | http://learningenglish.voanews.com/a/3207737.html=Pick A Fight
126 | http://learningenglish.voanews.com/a/3207728.html=Waste Of Time
127 | http://learningenglish.voanews.com/a/3207725.html=Party Foul
128 | http://learningenglish.voanews.com/a/3207724.html=All Hands On Deck
129 | http://learningenglish.voanews.com/a/3207722.html=Gear Up
130 | http://learningenglish.voanews.com/a/3093798.html=Time Bomb
131 | http://learningenglish.voanews.com/a/3093802.html=Shaking Us Down
132 | http://learningenglish.voanews.com/a/3093795.html=Felt Like I Belonged
133 | http://learningenglish.voanews.com/a/3093785.html=Stand On its Own
134 | http://learningenglish.voanews.com/a/3093789.html=Bigger Than They Are
135 | http://learningenglish.voanews.com/a/3093783.html=Put Your Foot Down
136 | http://learningenglish.voanews.com/a/3093779.html=Cooked Up
137 | http://learningenglish.voanews.com/a/3093784.html=Scaredy-cat
138 | http://learningenglish.voanews.com/a/3093775.html=A Few Tricks Up Her Sleeve
139 | http://learningenglish.voanews.com/a/3093780.html=Clean Slate
--------------------------------------------------------------------------------
/html-extractor/pom.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 | 4.0.0
4 |
5 | org.apdplat
6 | html-extractor
7 | 1.1
8 | jar
9 |
10 |
11 | html-extractor
12 | https://github.com/ysc/HtmlExtractor
13 |
14 | html-extractor是一个Java实现的基于模板的通用的网页结构化信息精准抽取组件。
15 |
16 |
17 | APDPlat
18 | http://apdplat.org/
19 |
20 |
21 |
22 | GNU GENERAL PUBLIC LICENSE, Version 3
23 | http://www.gnu.org/licenses/gpl.html
24 |
25 |
26 | 2014
27 |
28 | https://github.com/ysc/HtmlExtractor
29 | scm:git:git://github.com/ysc/HtmlExtractor.git
30 | scm:git:ssh://git@github.com/ysc/HtmlExtractor.git
31 | HEAD
32 |
33 |
34 |
35 | 杨尚川
36 | ysc@apdplat.org
37 | http://yangshangchuan.iteye.com
38 |
39 |
40 |
41 |
42 |
43 |
44 |
45 | org.apache.maven.plugins
46 | maven-compiler-plugin
47 | ${maven-compiler-plugin.version}
48 |
49 | ${project.build.sourceEncoding}
50 | ${java.version}
51 | ${java.version}
52 | true
53 | true
54 | true
55 |
56 |
57 |
58 |
59 | org.apache.maven.plugins
60 | maven-jar-plugin
61 | ${maven-jar-plugin.version}
62 |
63 |
64 | **/logback.xml
65 |
66 |
67 |
68 |
69 |
70 | org.apache.maven.plugins
71 | maven-surefire-plugin
72 | ${maven-surefire-plugin.version}
73 |
74 | true
75 |
76 |
77 |
78 |
79 | org.apache.maven.plugins
80 | maven-resources-plugin
81 | ${maven-resources-plugin.version}
82 |
83 | ${project.build.sourceEncoding}
84 |
85 |
86 |
87 |
88 | org.apache.maven.plugins
89 | maven-javadoc-plugin
90 | ${maven-javadoc-plugin.version}
91 |
92 |
93 | attach-docs
94 |
95 | jar
96 |
97 |
98 |
99 |
100 |
101 |
102 | maven-source-plugin
103 | ${maven-source-plugin.version}
104 |
105 |
106 | attach-sources
107 |
108 | jar
109 |
110 |
111 |
112 |
113 |
114 |
115 |
116 | org.codehaus.sonar
117 | sonar-maven3-plugin
118 | ${sonar-maven3-plugin.version}
119 |
120 |
121 |
122 |
123 | 1.8
124 | UTF-8
125 |
126 | 3.0
127 | 2.4
128 | 2.14
129 | 2.6
130 | 2.9.1
131 | 2.2.1
132 | 3.5
133 |
134 | 4.11
135 | 1.6.4
136 | 0.9.28
137 | 2.5.1
138 | 1.7.2
139 | 2.6
140 | 1.9.13
141 | 3.1
142 | 1.6.4
143 | 2.24
144 | 3.0.1
145 |
146 |
147 |
148 |
149 | junit
150 | junit
151 | 4.11
152 | test
153 |
154 |
155 |
156 | org.slf4j
157 | slf4j-api
158 | ${slf4j-api.version}
159 |
160 |
161 |
162 | ch.qos.logback
163 | logback-classic
164 | ${logback-classic.version}
165 |
166 |
167 | commons-logging
168 | commons-logging
169 |
170 |
171 | runtime
172 |
173 |
174 |
175 | org.slf4j
176 | jcl-over-slf4j
177 | ${jcl-over-slf4j.version}
178 | runtime
179 |
180 |
181 |
182 | redis.clients
183 | jedis
184 | ${jedis.version}
185 |
186 |
187 | org.jsoup
188 | jsoup
189 | ${jsoup.version}
190 |
191 |
192 | commons-lang
193 | commons-lang
194 | ${commons-lang.version}
195 |
196 |
197 | org.codehaus.jackson
198 | jackson-mapper-asl
199 | ${jackson.version}
200 |
201 |
202 | commons-httpclient
203 | commons-httpclient
204 | ${commons-httpclient.version}
205 |
206 |
207 | commons-logging
208 | commons-logging
209 |
210 |
211 |
212 |
213 | net.sourceforge.htmlunit
214 | htmlunit
215 | ${htmlunit.version}
216 |
217 |
218 | commons-logging
219 | commons-logging
220 |
221 |
222 |
223 |
224 | org.seleniumhq.selenium
225 | selenium-java
226 | ${selenium.version}
227 |
228 |
229 |
230 |
--------------------------------------------------------------------------------
/LICENSE.txt:
--------------------------------------------------------------------------------
1 |
2 | Apache License
3 | Version 2.0, January 2004
4 | http://www.apache.org/licenses/
5 |
6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
7 |
8 | 1. Definitions.
9 |
10 | "License" shall mean the terms and conditions for use, reproduction,
11 | and distribution as defined by Sections 1 through 9 of this document.
12 |
13 | "Licensor" shall mean the copyright owner or entity authorized by
14 | the copyright owner that is granting the License.
15 |
16 | "Legal Entity" shall mean the union of the acting entity and all
17 | other entities that control, are controlled by, or are under common
18 | control with that entity. For the purposes of this definition,
19 | "control" means (i) the power, direct or indirect, to cause the
20 | direction or management of such entity, whether by contract or
21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
22 | outstanding shares, or (iii) beneficial ownership of such entity.
23 |
24 | "You" (or "Your") shall mean an individual or Legal Entity
25 | exercising permissions granted by this License.
26 |
27 | "Source" form shall mean the preferred form for making modifications,
28 | including but not limited to software source code, documentation
29 | source, and configuration files.
30 |
31 | "Object" form shall mean any form resulting from mechanical
32 | transformation or translation of a Source form, including but
33 | not limited to compiled object code, generated documentation,
34 | and conversions to other media types.
35 |
36 | "Work" shall mean the work of authorship, whether in Source or
37 | Object form, made available under the License, as indicated by a
38 | copyright notice that is included in or attached to the work
39 | (an example is provided in the Appendix below).
40 |
41 | "Derivative Works" shall mean any work, whether in Source or Object
42 | form, that is based on (or derived from) the Work and for which the
43 | editorial revisions, annotations, elaborations, or other modifications
44 | represent, as a whole, an original work of authorship. For the purposes
45 | of this License, Derivative Works shall not include works that remain
46 | separable from, or merely link (or bind by name) to the interfaces of,
47 | the Work and Derivative Works thereof.
48 |
49 | "Contribution" shall mean any work of authorship, including
50 | the original version of the Work and any modifications or additions
51 | to that Work or Derivative Works thereof, that is intentionally
52 | submitted to Licensor for inclusion in the Work by the copyright owner
53 | or by an individual or Legal Entity authorized to submit on behalf of
54 | the copyright owner. For the purposes of this definition, "submitted"
55 | means any form of electronic, verbal, or written communication sent
56 | to the Licensor or its representatives, including but not limited to
57 | communication on electronic mailing lists, source code control systems,
58 | and issue tracking systems that are managed by, or on behalf of, the
59 | Licensor for the purpose of discussing and improving the Work, but
60 | excluding communication that is conspicuously marked or otherwise
61 | designated in writing by the copyright owner as "Not a Contribution."
62 |
63 | "Contributor" shall mean Licensor and any individual or Legal Entity
64 | on behalf of whom a Contribution has been received by Licensor and
65 | subsequently incorporated within the Work.
66 |
67 | 2. Grant of Copyright License. Subject to the terms and conditions of
68 | this License, each Contributor hereby grants to You a perpetual,
69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
70 | copyright license to reproduce, prepare Derivative Works of,
71 | publicly display, publicly perform, sublicense, and distribute the
72 | Work and such Derivative Works in Source or Object form.
73 |
74 | 3. Grant of Patent License. Subject to the terms and conditions of
75 | this License, each Contributor hereby grants to You a perpetual,
76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
77 | (except as stated in this section) patent license to make, have made,
78 | use, offer to sell, sell, import, and otherwise transfer the Work,
79 | where such license applies only to those patent claims licensable
80 | by such Contributor that are necessarily infringed by their
81 | Contribution(s) alone or by combination of their Contribution(s)
82 | with the Work to which such Contribution(s) was submitted. If You
83 | institute patent litigation against any entity (including a
84 | cross-claim or counterclaim in a lawsuit) alleging that the Work
85 | or a Contribution incorporated within the Work constitutes direct
86 | or contributory patent infringement, then any patent licenses
87 | granted to You under this License for that Work shall terminate
88 | as of the date such litigation is filed.
89 |
90 | 4. Redistribution. You may reproduce and distribute copies of the
91 | Work or Derivative Works thereof in any medium, with or without
92 | modifications, and in Source or Object form, provided that You
93 | meet the following conditions:
94 |
95 | (a) You must give any other recipients of the Work or
96 | Derivative Works a copy of this License; and
97 |
98 | (b) You must cause any modified files to carry prominent notices
99 | stating that You changed the files; and
100 |
101 | (c) You must retain, in the Source form of any Derivative Works
102 | that You distribute, all copyright, patent, trademark, and
103 | attribution notices from the Source form of the Work,
104 | excluding those notices that do not pertain to any part of
105 | the Derivative Works; and
106 |
107 | (d) If the Work includes a "NOTICE" text file as part of its
108 | distribution, then any Derivative Works that You distribute must
109 | include a readable copy of the attribution notices contained
110 | within such NOTICE file, excluding those notices that do not
111 | pertain to any part of the Derivative Works, in at least one
112 | of the following places: within a NOTICE text file distributed
113 | as part of the Derivative Works; within the Source form or
114 | documentation, if provided along with the Derivative Works; or,
115 | within a display generated by the Derivative Works, if and
116 | wherever such third-party notices normally appear. The contents
117 | of the NOTICE file are for informational purposes only and
118 | do not modify the License. You may add Your own attribution
119 | notices within Derivative Works that You distribute, alongside
120 | or as an addendum to the NOTICE text from the Work, provided
121 | that such additional attribution notices cannot be construed
122 | as modifying the License.
123 |
124 | You may add Your own copyright statement to Your modifications and
125 | may provide additional or different license terms and conditions
126 | for use, reproduction, or distribution of Your modifications, or
127 | for any such Derivative Works as a whole, provided Your use,
128 | reproduction, and distribution of the Work otherwise complies with
129 | the conditions stated in this License.
130 |
131 | 5. Submission of Contributions. Unless You explicitly state otherwise,
132 | any Contribution intentionally submitted for inclusion in the Work
133 | by You to the Licensor shall be under the terms and conditions of
134 | this License, without any additional terms or conditions.
135 | Notwithstanding the above, nothing herein shall supersede or modify
136 | the terms of any separate license agreement you may have executed
137 | with Licensor regarding such Contributions.
138 |
139 | 6. Trademarks. This License does not grant permission to use the trade
140 | names, trademarks, service marks, or product names of the Licensor,
141 | except as required for reasonable and customary use in describing the
142 | origin of the Work and reproducing the content of the NOTICE file.
143 |
144 | 7. Disclaimer of Warranty. Unless required by applicable law or
145 | agreed to in writing, Licensor provides the Work (and each
146 | Contributor provides its Contributions) on an "AS IS" BASIS,
147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
148 | implied, including, without limitation, any warranties or conditions
149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
150 | PARTICULAR PURPOSE. You are solely responsible for determining the
151 | appropriateness of using or redistributing the Work and assume any
152 | risks associated with Your exercise of permissions under this License.
153 |
154 | 8. Limitation of Liability. In no event and under no legal theory,
155 | whether in tort (including negligence), contract, or otherwise,
156 | unless required by applicable law (such as deliberate and grossly
157 | negligent acts) or agreed to in writing, shall any Contributor be
158 | liable to You for damages, including any direct, indirect, special,
159 | incidental, or consequential damages of any character arising as a
160 | result of this License or out of the use or inability to use the
161 | Work (including but not limited to damages for loss of goodwill,
162 | work stoppage, computer failure or malfunction, or any and all
163 | other commercial damages or losses), even if such Contributor
164 | has been advised of the possibility of such damages.
165 |
166 | 9. Accepting Warranty or Additional Liability. While redistributing
167 | the Work or Derivative Works thereof, You may choose to offer,
168 | and charge a fee for, acceptance of support, warranty, indemnity,
169 | or other liability obligations and/or rights consistent with this
170 | License. However, in accepting such obligations, You may act only
171 | on Your own behalf and on Your sole responsibility, not on behalf
172 | of any other Contributor, and only if You agree to indemnify,
173 | defend, and hold each Contributor harmless for any liability
174 | incurred by, or claims asserted against, such Contributor by reason
175 | of your accepting any such warranty or additional liability.
176 |
177 | END OF TERMS AND CONDITIONS
178 |
179 | APPENDIX: How to apply the Apache License to your work.
180 |
181 | To apply the Apache License to your work, attach the following
182 | boilerplate notice, with the fields enclosed by brackets "[]"
183 | replaced with your own identifying information. (Don't include
184 | the brackets!) The text should be enclosed in the appropriate
185 | comment syntax for the file format. We also recommend that a
186 | file or class name and description of purpose be included on the
187 | same "printed page" as the copyright notice for easier
188 | identification within third-party archives.
189 |
190 | Copyright [yyyy] [name of copyright owner]
191 |
192 | Licensed under the Apache License, Version 2.0 (the "License");
193 | you may not use this file except in compliance with the License.
194 | You may obtain a copy of the License at
195 |
196 | http://www.apache.org/licenses/LICENSE-2.0
197 |
198 | Unless required by applicable law or agreed to in writing, software
199 | distributed under the License is distributed on an "AS IS" BASIS,
200 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
201 | See the License for the specific language governing permissions and
202 | limitations under the License.
--------------------------------------------------------------------------------
/html-extractor/src/main/resources/voa/EnglishInAMinute.txt:
--------------------------------------------------------------------------------
1 | http://learningenglish.voanews.com/a/3620615.html=Boils Down To
2 | http://learningenglish.voanews.com/a/3699956.html=Put All Your Eggs in One Basket
3 | http://learningenglish.voanews.com/a/3699945.html=Breathing Room
4 | http://learningenglish.voanews.com/a/3661254.html=Pack Rat
5 | http://learningenglish.voanews.com/a/3699954.html=Jump on the Bandwagon
6 | http://learningenglish.voanews.com/a/3620611.html=See Eye to Eye
7 | http://learningenglish.voanews.com/a/3699952.html=Keep You on Your Toes
8 | http://learningenglish.voanews.com/a/3699948.html=Bigger Fish to Fry
9 | http://learningenglish.voanews.com/a/3435558.html=On a Roll
10 | http://learningenglish.voanews.com/a/3605386.html=Happy Medium
11 | http://learningenglish.voanews.com/a/3699953.html=One Tough Cookie
12 | http://learningenglish.voanews.com/a/3620612.html=Slippery Slope
13 | http://learningenglish.voanews.com/a/3605389.html=Freebie
14 | http://learningenglish.voanews.com/a/3661248.html=Two Left Feet
15 | http://learningenglish.voanews.com/a/3435564.html=Sugarcoat
16 | http://learningenglish.voanews.com/a/3605385.html=Get off on the Wrong (or Right!) Foot
17 | http://learningenglish.voanews.com/a/3620609.html=Golden Opportunity
18 | http://learningenglish.voanews.com/a/3563804.html=Slip One's Mind
19 | http://learningenglish.voanews.com/a/3240171.html=Dog Eat Dog
20 | http://learningenglish.voanews.com/a/3605387.html=Sink Your Teeth Into
21 | http://learningenglish.voanews.com/a/3545534.html=Switch Gears
22 | http://learningenglish.voanews.com/a/3406302.html=Break the Ice
23 | http://learningenglish.voanews.com/a/3563798.html=The Ball Is In Your Court
24 | http://learningenglish.voanews.com/a/3545538.html=Go the Extra Mile
25 | http://learningenglish.voanews.com/a/3390611.html=Music to My Ears
26 | http://learningenglish.voanews.com/a/3435557.html=Up One's Alley
27 | http://learningenglish.voanews.com/a/3390591.html=Off the Deep End
28 | http://learningenglish.voanews.com/a/3390586.html=Red Flag
29 | http://learningenglish.voanews.com/a/3406310.html=Hit Your Stride
30 | http://learningenglish.voanews.com/a/3435555.html=To Have a Lot on Your Plate
31 | http://learningenglish.voanews.com/a/3406300.html=Comfort Zone
32 | http://learningenglish.voanews.com/a/3390603.html=Turn the Tables
33 | http://learningenglish.voanews.com/a/3390599.html=Bent out of Shape
34 | http://learningenglish.voanews.com/a/3406303.html=Miss the Boat
35 | http://learningenglish.voanews.com/a/3390596.html=Sing a Different Tune
36 | http://learningenglish.voanews.com/a/3445382.html=Suck the Air out of the Room
37 | http://learningenglish.voanews.com/a/3390583.html=No Pain No Gain
38 | http://learningenglish.voanews.com/a/3240231.html=Where There's Smoke There's Fire
39 | http://learningenglish.voanews.com/a/3390584.html=A Fish Out of Water
40 | http://learningenglish.voanews.com/a/3282529.html=To Fall Down the Rabbit Hole
41 | http://learningenglish.voanews.com/a/3282533.html=Put Someone on the Spot
42 | http://learningenglish.voanews.com/a/3282555.html=Cool as a Cucumber
43 | http://learningenglish.voanews.com/a/3240167.html=At the Drop of a Hat
44 | http://learningenglish.voanews.com/a/3282557.html=Fly on the Wall
45 | http://learningenglish.voanews.com/a/3282572.html=Baggage
46 | http://learningenglish.voanews.com/a/3240181.html=The Last Straw
47 | http://learningenglish.voanews.com/a/3209591.html=Push the Envelope
48 | http://learningenglish.voanews.com/a/3209587.html=Break Even
49 | http://learningenglish.voanews.com/a/3240179.html=Moving Target
50 | http://learningenglish.voanews.com/a/3203430.html=Keep Your Eye on the Ball
51 | http://learningenglish.voanews.com/a/3209586.html=Bucket List
52 | http://learningenglish.voanews.com/a/3203427.html=Deer in the Headlights
53 | http://learningenglish.voanews.com/a/3209593.html=That Ship Has Sailed
54 | http://learningenglish.voanews.com/a/3203438.html=A Wolf in Sheep's Clothing
55 | http://learningenglish.voanews.com/a/3203422.html=Best of Both Worlds
56 | http://learningenglish.voanews.com/a/3080432.html=Get Roped Into
57 | http://learningenglish.voanews.com/a/3203426.html=Throw the Baby Out with the Bathwater
58 | http://learningenglish.voanews.com/a/3209594.html=Living Under a Rock
59 | http://learningenglish.voanews.com/a/3081382.html=Wiggle Room
60 | http://learningenglish.voanews.com/a/3080429.html=Fly Under the Radar
61 | http://learningenglish.voanews.com/a/3026464.html=Fall Through the Cracks
62 | http://learningenglish.voanews.com/a/3081369.html=Head Over Heels
63 | http://learningenglish.voanews.com/a/3026468.html=In the Spotlight
64 | http://learningenglish.voanews.com/a/3080438.html=Face the Music
65 | http://learningenglish.voanews.com/a/3026460.html=Branch Out
66 | http://learningenglish.voanews.com/a/3026463.html=Low-Hanging Fruit
67 | http://learningenglish.voanews.com/a/3026462.html=In Over My Head
68 | http://learningenglish.voanews.com/a/2833784.html=Don't Hold Your Breath
69 | http://learningenglish.voanews.com/a/2921220.html=Burning Bridges
70 | http://learningenglish.voanews.com/a/2921285.html=MIA
71 | http://learningenglish.voanews.com/a/2777234.html=Wake-Up Call
72 | http://learningenglish.voanews.com/a/2777201.html=The Calm Before the Storm
73 | http://learningenglish.voanews.com/a/2833771.html=Jack of All Trades
74 | http://learningenglish.voanews.com/a/2741430.html=To Have Butterflies
75 | http://learningenglish.voanews.com/a/2741442.html=Up In The Air
76 | http://learningenglish.voanews.com/a/2777204.html=Get a Grip
77 | http://learningenglish.voanews.com/a/2833770.html=Water Under the Bridge
78 | http://learningenglish.voanews.com/a/2777203.html=On the Same Page
79 | http://learningenglish.voanews.com/a/2777202.html=The Sky's the Limit
80 | http://learningenglish.voanews.com/a/2741426.html=Elephant in the Room
81 | http://learningenglish.voanews.com/a/2741424.html=Knock Your Socks Off
82 | http://learningenglish.voanews.com/a/2741435.html=Pain in the Neck
83 | http://learningenglish.voanews.com/a/2716955.html=On Pins and Needles
84 | http://learningenglish.voanews.com/a/2716987.html=On the Edge of Your Seat
85 | http://learningenglish.voanews.com/a/2716950.html=Nail-biter
86 | http://learningenglish.voanews.com/a/2833766.html=Clam Up
87 | http://learningenglish.voanews.com/a/2716948.html=A Bitter Pill to Swallow
88 | http://learningenglish.voanews.com/a/2716951.html=Fifth Wheel
89 | http://learningenglish.voanews.com/a/2711203.html=Jazz it Up
90 | http://learningenglish.voanews.com/a/2711200.html=Don't Count Your Chickens Before They Hatch
91 | http://learningenglish.voanews.com/a/2711198.html=Over the Hill
92 | http://learningenglish.voanews.com/a/2833767.html=Out of the Blue
93 | http://learningenglish.voanews.com/a/2711201.html=Out of the Woods
94 | http://learningenglish.voanews.com/a/2660132.html=Fishing for Compliments
95 | http://learningenglish.voanews.com/a/2670644.html=Ring a Bell
96 | http://learningenglish.voanews.com/a/2670640.html=Skating on Thin Ice
97 | http://learningenglish.voanews.com/a/2688209.html=Tip of the Iceberg
98 | http://learningenglish.voanews.com/a/2688203.html=Straight from the Horse's Mouth
99 | http://learningenglish.voanews.com/a/2688206.html=Sleep Like a Rock
100 | http://learningenglish.voanews.com/a/2688204.html=Shoot the Breeze
101 | http://learningenglish.voanews.com/a/2749865.html=Bend the Rules
102 | http://learningenglish.voanews.com/a/2670637.html=Put (Something) on Hold
103 | http://learningenglish.voanews.com/a/2670645.html=Off the Cuff
104 | http://learningenglish.voanews.com/a/2596479.html=Not Have a Leg to Stand on
105 | http://learningenglish.voanews.com/a/2596477.html=Out of Your Mind
106 | http://learningenglish.voanews.com/a/2580622.html=Open Book
107 | http://learningenglish.voanews.com/a/2580617.html=Nip it in the Bud
108 | http://learningenglish.voanews.com/a/2670628.html=Down to Earth
109 | http://learningenglish.voanews.com/a/2580619.html=Drama Queen
110 | http://learningenglish.voanews.com/a/2667762.html=Keep One's Eyes Peeled
111 | http://learningenglish.voanews.com/a/2553166.html=Back to Square One
112 | http://learningenglish.voanews.com/a/2553157.html=Know the Ropes
113 | http://learningenglish.voanews.com/a/2553143.html=The Apple Doesn't Fall Far From the Tree
114 | http://learningenglish.voanews.com/a/2553116.html=Another One Bites the Dust
115 | http://learningenglish.voanews.com/a/2553133.html=Last-Ditch Effort
116 | http://learningenglish.voanews.com/a/2510605.html=Poker Face
117 | http://learningenglish.voanews.com/a/2510600.html=Blow Off Steam
118 | http://learningenglish.voanews.com/a/2510597.html=Up the Creek Without a Paddle
119 | http://learningenglish.voanews.com/a/2510754.html=Junk Food
120 | http://learningenglish.voanews.com/a/2510756.html=Fly by the Seat of One's Pants
121 | http://learningenglish.voanews.com/a/2494877.html=Food for Thought
122 | http://learningenglish.voanews.com/a/2494873.html=Got up on the Wrong Side of the Bed
123 | http://learningenglish.voanews.com/a/2494876.html=Fair Weather Friend
124 | http://learningenglish.voanews.com/a/2494875.html=Clear the Air
125 | http://learningenglish.voanews.com/a/2494874.html=Calling Someone's Bluff
126 | http://learningenglish.voanews.com/a/2479292.html=Barking Up the Wrong Tree
127 | http://learningenglish.voanews.com/a/2479288.html=Back-Seat Driver
128 | http://learningenglish.voanews.com/a/2479287.html=Apple of One's Eye
129 | http://learningenglish.voanews.com/a/2479286.html=As the Crow Flies
130 | http://learningenglish.voanews.com/a/2479290.html=Burned Out
131 | http://learningenglish.voanews.com/a/1679564.html=It's Up To You
132 | http://learningenglish.voanews.com/a/1666400.html=Miss The Point
133 | http://learningenglish.voanews.com/a/1698437.html=Go With The Flow
134 | http://learningenglish.voanews.com/a/1679557.html=A Piece of Cake
135 | http://learningenglish.voanews.com/a/1698912.html=Get On My Nerves
136 | http://learningenglish.voanews.com/a/1651145.html=What's Up?
137 | http://learningenglish.voanews.com/a/1664388.html=Grab a Bite
138 | http://learningenglish.voanews.com/a/1665136.html=A Day Late and a Dollar Short
139 | http://learningenglish.voanews.com/a/1665197.html=Dressed to Kill
140 | http://learningenglish.voanews.com/a/1665309.html=My Two Cents
141 | http://learningenglish.voanews.com/a/1665362.html=Burn The Midnight Oil
142 | http://learningenglish.voanews.com/a/1665601.html=Actions Speak Louder Than Words
143 | http://learningenglish.voanews.com/a/1666266.html=Keep Your Chin Up
144 | http://learningenglish.voanews.com/a/1670596.html=Bad Taste in My Mouth
145 | http://learningenglish.voanews.com/a/1670827.html=If The Shoe Fits
146 | http://learningenglish.voanews.com/a/1672261.html=A Game Plan
147 | http://learningenglish.voanews.com/a/1675045.html=In Your Dreams
148 | http://learningenglish.voanews.com/a/1675057.html=Stay in Touch
149 | http://learningenglish.voanews.com/a/1675069.html=Draw a Blank
150 | http://learningenglish.voanews.com/a/1675076.html=Get out of Here
151 | http://learningenglish.voanews.com/a/1675089.html=Costs a Pretty Penny
152 | http://learningenglish.voanews.com/a/1678615.html=Once in a Blue Moon
153 | http://learningenglish.voanews.com/a/1679513.html=A Breath of Fresh Air
154 | http://learningenglish.voanews.com/a/1684038.html=Call It A Day
155 | http://learningenglish.voanews.com/a/1698742.html=Cut to the Chase
156 | http://learningenglish.voanews.com/a/1698759.html=Cash Cow
157 | http://learningenglish.voanews.com/a/1698806.html=No Pain, No Gain
158 | http://learningenglish.voanews.com/a/1698818.html=Heart of Gold
159 | http://learningenglish.voanews.com/a/1698839.html=Under The Weather
160 | http://learningenglish.voanews.com/a/1698852.html=Ears Are Burning
161 | http://learningenglish.voanews.com/a/1698857.html=Off The Top Of My Head
162 | http://learningenglish.voanews.com/a/1698863.html=Hear a Pin Drop
163 | http://learningenglish.voanews.com/a/1698950.html=Play It By Ear
164 | http://learningenglish.voanews.com/a/1664509.html=Cats and Dogs
165 | http://learningenglish.voanews.com/a/1664835.html=It's Been Ages
166 | http://learningenglish.voanews.com/a/1670728.html=A Basket Case
167 | http://learningenglish.voanews.com/a/1679544.html=Break a Leg
168 | http://learningenglish.voanews.com/a/1684020.html=Make Up Your Mind
169 | http://learningenglish.voanews.com/a/1698727.html=Break the Bank
170 | http://learningenglish.voanews.com/a/1698775.html=Dime a Dozen
171 | http://learningenglish.voanews.com/a/1698787.html=24 - 7
172 | http://learningenglish.voanews.com/a/1698827.html=An Arm and a Leg
173 | http://learningenglish.voanews.com/a/1684043.html=Get Cold Feet
174 | http://learningenglish.voanews.com/a/1684030.html=Hit The Sack
175 | http://learningenglish.voanews.com/a/1684094.html=Treat With Kid Gloves
176 | http://learningenglish.voanews.com/a/1679529.html=Bite My Tongue
--------------------------------------------------------------------------------