├── .gitignore
├── README.md
├── conf
├── fetch.xml
├── letv_conf_temp.xml
├── template.xml
├── test_youku_dm.xml
├── wasu_conf.xml
├── youku_conf.xml
└── youku_conf_temp.xml
├── pom.xml
├── setpath.bat
├── src
├── main
│ ├── java
│ │ └── org
│ │ │ └── sbs
│ │ │ ├── goodcrawler
│ │ │ ├── bootstrap
│ │ │ │ ├── BootStrap.java
│ │ │ │ ├── CrawlerStatus.java
│ │ │ │ ├── foreman
│ │ │ │ │ ├── ExtractForeman.java
│ │ │ │ │ ├── FetchForeman.java
│ │ │ │ │ ├── Foreman.java
│ │ │ │ │ ├── StoreForeman.java
│ │ │ │ │ └── package-info.java
│ │ │ │ └── package-info.java
│ │ │ ├── conf
│ │ │ │ ├── Configurable.java
│ │ │ │ ├── Configuration.java
│ │ │ │ ├── GlobalConstants.java
│ │ │ │ ├── JobConfigurationManager.java
│ │ │ │ ├── PropertyConfigurationHelper.java
│ │ │ │ ├── Worker.java
│ │ │ │ └── package-info.java
│ │ │ ├── exception
│ │ │ │ ├── ConfigurationException.java
│ │ │ │ ├── ExtractException.java
│ │ │ │ ├── QueueException.java
│ │ │ │ └── package-info.java
│ │ │ ├── extractor
│ │ │ │ ├── DefaultExtractWorker.java
│ │ │ │ ├── DefaultExtractor.java
│ │ │ │ ├── ExtractResult.java
│ │ │ │ ├── ExtractWorker.java
│ │ │ │ ├── Extractor.java
│ │ │ │ ├── GCElement.java
│ │ │ │ ├── GCPage.java
│ │ │ │ ├── htmlelment
│ │ │ │ │ ├── AbstractHtmlElement.java
│ │ │ │ │ ├── CommonHtmlElement.java
│ │ │ │ │ ├── HtmlAnchorElementOfPage.java
│ │ │ │ │ ├── HtmlAnchorElementOfString.java
│ │ │ │ │ ├── HtmlElementExtractType.java
│ │ │ │ │ ├── HtmlElementType.java
│ │ │ │ │ ├── HtmlPageElement.java
│ │ │ │ │ └── package-info.java
│ │ │ │ ├── package-info.java
│ │ │ │ ├── selector
│ │ │ │ │ ├── AbstractElementCssSelector.java
│ │ │ │ │ ├── DateElementCssSelector.java
│ │ │ │ │ ├── FileElementCssSelector.java
│ │ │ │ │ ├── IFConditions.java
│ │ │ │ │ ├── IntegerElementCssSelector.java
│ │ │ │ │ ├── ListElementCssSelector.java
│ │ │ │ │ ├── NumericaElementCssSelector.java
│ │ │ │ │ ├── PageElementSelector.java
│ │ │ │ │ ├── SelectPageElement.java
│ │ │ │ │ ├── SelectorAttr.java
│ │ │ │ │ ├── SelectorType.java
│ │ │ │ │ ├── SetElementCssSelector.java
│ │ │ │ │ ├── StringElementCssSelector.java
│ │ │ │ │ ├── action
│ │ │ │ │ │ ├── EmptyAction.java
│ │ │ │ │ │ ├── FileSelectAction.java
│ │ │ │ │ │ ├── IntegerSelectorAction.java
│ │ │ │ │ │ ├── ListSelectorAction.java
│ │ │ │ │ │ ├── SelectorAction.java
│ │ │ │ │ │ ├── StringSelectorAction.java
│ │ │ │ │ │ ├── file
│ │ │ │ │ │ │ ├── DownLoadFileAction.java
│ │ │ │ │ │ │ ├── DownLoadImageResizeAction.java
│ │ │ │ │ │ │ └── FileActionType.java
│ │ │ │ │ │ ├── integer
│ │ │ │ │ │ │ ├── IntegerAbsAction.java
│ │ │ │ │ │ │ ├── IntegerActionType.java
│ │ │ │ │ │ │ ├── IntegerBetweenAction.java
│ │ │ │ │ │ │ └── package-info.java
│ │ │ │ │ │ ├── list
│ │ │ │ │ │ │ ├── ListFilterAction.java
│ │ │ │ │ │ │ └── package-info.java
│ │ │ │ │ │ ├── package-info.java
│ │ │ │ │ │ └── string
│ │ │ │ │ │ │ ├── ActionFactory.java
│ │ │ │ │ │ │ ├── StringActionType.java
│ │ │ │ │ │ │ ├── StringAfterAction.java
│ │ │ │ │ │ │ ├── StringAfterLastAction.java
│ │ │ │ │ │ │ ├── StringBeforeAction.java
│ │ │ │ │ │ │ ├── StringBeforeLastAction.java
│ │ │ │ │ │ │ ├── StringBetweenAction.java
│ │ │ │ │ │ │ ├── StringFilterAction.java
│ │ │ │ │ │ │ ├── StringPerfixAction.java
│ │ │ │ │ │ │ ├── StringReplaceAction.java
│ │ │ │ │ │ │ ├── StringSplitAction.java
│ │ │ │ │ │ │ ├── StringSubAction.java
│ │ │ │ │ │ │ ├── StringSuffixAction.java
│ │ │ │ │ │ │ └── package-info.java
│ │ │ │ │ ├── exception
│ │ │ │ │ │ ├── DownLoadException.java
│ │ │ │ │ │ ├── IntegerBetweenExpressionException.java
│ │ │ │ │ │ ├── SelectorConfigException.java
│ │ │ │ │ │ └── package-info.java
│ │ │ │ │ ├── expression
│ │ │ │ │ │ ├── GrExpression.java
│ │ │ │ │ │ ├── SimpleExpression.java
│ │ │ │ │ │ └── SimpleExpressionExtent.java
│ │ │ │ │ ├── factory
│ │ │ │ │ │ └── ElementCssSelectorFactory.java
│ │ │ │ │ └── package-info.java
│ │ │ │ └── template
│ │ │ │ │ ├── ExtractTemplate.java
│ │ │ │ │ └── package-info.java
│ │ │ ├── fetcher
│ │ │ │ ├── AjaxCallFetcher.java
│ │ │ │ ├── CustomFetchStatus.java
│ │ │ │ ├── DefaultFetchWorker.java
│ │ │ │ ├── FailedPageBackup.java
│ │ │ │ ├── FetchStatus.java
│ │ │ │ ├── FetchWorker.java
│ │ │ │ ├── Fetcher.java
│ │ │ │ ├── FetcherInstance.java
│ │ │ │ ├── FetcherType.java
│ │ │ │ ├── IdleConnectionMonitorThread.java
│ │ │ │ ├── PageFetcher.java
│ │ │ │ ├── ResynchronizingAjaxController.java
│ │ │ │ └── package-info.java
│ │ │ ├── jobconf
│ │ │ │ ├── ExtractConfig.java
│ │ │ │ ├── FetchConfig.java
│ │ │ │ ├── JobConfig.java
│ │ │ │ ├── StoreConfig.java
│ │ │ │ └── package-info.java
│ │ │ ├── page
│ │ │ │ ├── BinaryParseData.java
│ │ │ │ ├── ExtractedPage.java
│ │ │ │ ├── ExtractedUrlAnchorPair.java
│ │ │ │ ├── HtmlContentHandler.java
│ │ │ │ ├── HtmlParseData.java
│ │ │ │ ├── Page.java
│ │ │ │ ├── PageFetchResult.java
│ │ │ │ ├── ParseData.java
│ │ │ │ ├── Parser.java
│ │ │ │ ├── TextParseData.java
│ │ │ │ └── package-info.java
│ │ │ ├── plugin
│ │ │ │ ├── EsClient.java
│ │ │ │ ├── ExBulk.java
│ │ │ │ ├── IndexScanner.java
│ │ │ │ ├── ReIndex.java
│ │ │ │ ├── classloader
│ │ │ │ │ ├── CommonClassLoader.java
│ │ │ │ │ └── PluginClassLoader.java
│ │ │ │ ├── extract
│ │ │ │ │ ├── ExtractYouku.java
│ │ │ │ │ ├── Extractor66ys.java
│ │ │ │ │ ├── ExtractorDytt8.java
│ │ │ │ │ └── package-info.java
│ │ │ │ ├── package-info.java
│ │ │ │ └── storage
│ │ │ │ │ ├── ElasticSearchStorage.java
│ │ │ │ │ ├── Movie.java
│ │ │ │ │ ├── MovieSource.java
│ │ │ │ │ ├── Prepare.java
│ │ │ │ │ ├── p
│ │ │ │ │ ├── IESStoragePlugin.java
│ │ │ │ │ └── WasuEsStorePlugin.java
│ │ │ │ │ └── package-info.java
│ │ │ ├── schedule
│ │ │ │ ├── ReCraw.java
│ │ │ │ └── RecrawFetherWorkor.java
│ │ │ └── storage
│ │ │ │ ├── DefaultStoreWorker.java
│ │ │ │ ├── LocalFileStorage.java
│ │ │ │ ├── Storage.java
│ │ │ │ ├── StorageType.java
│ │ │ │ ├── StoreResult.java
│ │ │ │ ├── StoreWorker.java
│ │ │ │ └── package-info.java
│ │ │ ├── jetty
│ │ │ ├── JettyFactory.java
│ │ │ └── StartServer.java
│ │ │ ├── pendingqueue
│ │ │ ├── AbsPendingQueue.java
│ │ │ ├── PendRecraw.java
│ │ │ ├── PendingManager.java
│ │ │ ├── PendingPages.java
│ │ │ ├── PendingStore.java
│ │ │ ├── PendingUrls.java
│ │ │ └── package-info.java
│ │ │ ├── robotstxt
│ │ │ ├── HostDirectives.java
│ │ │ ├── RobotstxtConfig.java
│ │ │ ├── RobotstxtParser.java
│ │ │ ├── RobotstxtServer.java
│ │ │ └── RuleSet.java
│ │ │ ├── url
│ │ │ ├── TLDList.java
│ │ │ ├── URLCanonicalizer.java
│ │ │ ├── UlrFilters.java
│ │ │ ├── UrlResolver.java
│ │ │ ├── UrlSignatureSet.java
│ │ │ ├── WebURL.java
│ │ │ └── package-info.java
│ │ │ ├── util
│ │ │ ├── BinaryDateDwonLoader.java
│ │ │ ├── BloomFilter.java
│ │ │ ├── BloomfilterHelper.java
│ │ │ ├── CharUtil.java
│ │ │ ├── CheckIfUniqueUrl.java
│ │ │ ├── CheckIfUniqueUrlByBloomfilter.java
│ │ │ ├── CheckIfUniqueUrlByMd5.java
│ │ │ ├── ChineseSpelling.java
│ │ │ ├── DateTimeUtil.java
│ │ │ ├── EncryptUtils.java
│ │ │ ├── IO.java
│ │ │ ├── ImageCompress.java
│ │ │ ├── ImgUtil.java
│ │ │ ├── JsonUtil.java
│ │ │ ├── MD5Utils.java
│ │ │ ├── MapUtils.java
│ │ │ ├── MurmurHash.java
│ │ │ ├── PinyinUtil.java
│ │ │ ├── RegexList.java
│ │ │ ├── Simhash.java
│ │ │ ├── StringHelper.java
│ │ │ ├── StringUtil.java
│ │ │ ├── UrlUtils.java
│ │ │ ├── Util.java
│ │ │ ├── XmlConverUtil.java
│ │ │ ├── download
│ │ │ │ ├── DownLoadPool.java
│ │ │ │ ├── DownloadInfo.java
│ │ │ │ └── MultiThreadDownload.java
│ │ │ └── image
│ │ │ │ ├── ImageResize.java
│ │ │ │ └── ImageResizePool.java
│ │ │ └── web
│ │ │ ├── ContextListener.java
│ │ │ ├── CrawlerManager.java
│ │ │ ├── GoodServlet.java
│ │ │ ├── Start.java
│ │ │ ├── Status.java
│ │ │ ├── Stop.java
│ │ │ └── package-info.java
│ ├── resources
│ │ ├── conf.properties
│ │ ├── default_mapping.json
│ │ ├── job_conf.xml
│ │ ├── log4j.xml
│ │ ├── logback.xml
│ │ ├── mapping.json
│ │ ├── tld-names.txt
│ │ └── webdefault-windows.xml
│ └── webapp
│ │ ├── META-INF
│ │ └── MANIFEST.MF
│ │ ├── WEB-INF
│ │ └── web.xml
│ │ └── index.jsp
└── test
│ └── java
│ └── org
│ └── sbs
│ ├── AppTest.java
│ ├── ListLinks.java
│ ├── T.java
│ ├── extract
│ ├── TestWasu.java
│ ├── TestYouku.java
│ └── Tester.java
│ └── htmlunit
│ ├── HtmlUnitTest.java
│ ├── element
│ ├── GcElementTest.java
│ └── package-info.java
│ └── package-info.java
└── start.bat
/.gitignore:
--------------------------------------------------------------------------------
1 | /target
2 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | goodcrawler(web crawler) 网络爬虫
2 | ===========
3 |
4 | ---- standalone version https://github.com/shenbaise/goodcrawler/tree/standalone
5 |
6 | This project is under development.
7 |
8 | It aims to be a good web crawler for Java.
9 |
10 |
11 | LICENSE
12 | -------------------
13 | Apache License, Version 2.0
14 | http://www.apache.org/licenses/LICENSE-2.0 ( TXT or HTML )
15 |
16 | shenbaise1001@126.com
17 |
--------------------------------------------------------------------------------
/setpath.bat:
--------------------------------------------------------------------------------
1 | @set LOCALCLASSPATH=%LOCALCLASSPATH%;%1
--------------------------------------------------------------------------------
/src/main/java/org/sbs/goodcrawler/bootstrap/CrawlerStatus.java:
--------------------------------------------------------------------------------
1 | /**
2 | * ######################## SHENBAISE'S WORK ##########################
3 | * Licensed to the Apache Software Foundation (ASF) under one or more
4 | * contributor license agreements. See the NOTICE file distributed with
5 | * this work for additional information regarding copyright ownership.
6 | * The ASF licenses this file to You under the Apache License, Version 2.0
7 | * (the "License"); you may not use this file except in compliance with
8 | * the License. You may obtain a copy of the License at
9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0
11 | *
12 | * Unless required by applicable law or agreed to in writing, software
13 | * distributed under the License is distributed on an "AS IS" BASIS,
14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | * See the License for the specific language governing permissions and
16 | * limitations under the License.
17 | */
18 | package org.sbs.goodcrawler.bootstrap;
19 |
20 | import org.sbs.pendingqueue.PendingPages;
21 | import org.sbs.pendingqueue.PendingStore;
22 | import org.sbs.pendingqueue.PendingUrls;
23 |
24 | /**
25 | * @author whiteme
26 | * @date 2013-07-31
27 | * @desc Crawler running status.
28 | */
29 | public class CrawlerStatus {
30 | // Global running flag; initialised to false and never modified inside this class.
31 | public static boolean running = false;
32 | // With the appends below commented out, this currently always returns an empty string.
33 | public static String getStatus(){
34 | StringBuilder sb = new StringBuilder();
35 | // sb.append(PendingUrls.getInstance().pendingStatus()).append("
")
36 | // .append(PendingPages.getInstace().pendingStatus()).append("
")
37 | // .append(PendingStore.getInstance().pendingStatus()).append("
");
38 | return sb.toString();
39 | }
40 | // Delegates to BootStrap.getJobsNames().
41 | public static String getJobsNames(){
42 | return BootStrap.getJobsNames();
43 | }
44 | }
45 |
--------------------------------------------------------------------------------
/src/main/java/org/sbs/goodcrawler/bootstrap/foreman/ExtractForeman.java:
--------------------------------------------------------------------------------
1 | /**
2 | * ########################## GoodCrawler ############################
3 | * Licensed to the Apache Software Foundation (ASF) under one or more
4 | * contributor license agreements. See the NOTICE file distributed with
5 | * this work for additional information regarding copyright ownership.
6 | * The ASF licenses this file to You under the Apache License, Version 2.0
7 | * (the "License"); you may not use this file except in compliance with
8 | * the License. You may obtain a copy of the License at
9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0
11 | *
12 | * Unless required by applicable law or agreed to in writing, software
13 | * distributed under the License is distributed on an "AS IS" BASIS,
14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | * See the License for the specific language governing permissions and
16 | * limitations under the License.
17 | */
18 | package org.sbs.goodcrawler.bootstrap.foreman;
19 |
20 | import java.util.concurrent.ExecutorService;
21 | import java.util.concurrent.Executors;
22 |
23 | import org.sbs.goodcrawler.extractor.DefaultExtractWorker;
24 | import org.sbs.goodcrawler.extractor.DefaultExtractor;
25 | import org.sbs.goodcrawler.jobconf.ExtractConfig;
26 |
27 | /**
28 | * @author shenbaise(shenbaise@outlook.com)
29 | * @date 2013-7-3
30 | * Foreman (supervisor) of the extract workers.
31 | */
32 | public class ExtractForeman extends Foreman{
33 |
34 | public void start(ExtractConfig conf){
35 | int threadNum = conf.getThreadNum();
36 | ExecutorService executor = Executors.newFixedThreadPool(threadNum);
37 | for(int i=0;i doExtract(Page page) {
76 | return extractor.onExtract(page);
77 | }
78 | }
79 |
--------------------------------------------------------------------------------
/src/main/java/org/sbs/goodcrawler/extractor/ExtractResult.java:
--------------------------------------------------------------------------------
1 | /**
2 | * ########################## GoodCrawler ############################
3 | * Licensed to the Apache Software Foundation (ASF) under one or more
4 | * contributor license agreements. See the NOTICE file distributed with
5 | * this work for additional information regarding copyright ownership.
6 | * The ASF licenses this file to You under the Apache License, Version 2.0
7 | * (the "License"); you may not use this file except in compliance with
8 | * the License. You may obtain a copy of the License at
9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0
11 | *
12 | * Unless required by applicable law or agreed to in writing, software
13 | * distributed under the License is distributed on an "AS IS" BASIS,
14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | * See the License for the specific language governing permissions and
16 | * limitations under the License.
17 | */
18 | package org.sbs.goodcrawler.extractor;
19 |
20 | /**
21 | * @author shenbaise(shenbaise@outlook.com)
22 | * @date 2013-7-2
23 | * Result of extracting information from a page.
24 | */
25 | public enum ExtractResult {
26 | success,failed,ignored
27 | }
28 |
--------------------------------------------------------------------------------
/src/main/java/org/sbs/goodcrawler/extractor/GCElement.java:
--------------------------------------------------------------------------------
1 | /**
2 | * ########################## GoodCrawler ############################
3 | * Licensed to the Apache Software Foundation (ASF) under one or more
4 | * contributor license agreements. See the NOTICE file distributed with
5 | * this work for additional information regarding copyright ownership.
6 | * The ASF licenses this file to You under the Apache License, Version 2.0
7 | * (the "License"); you may not use this file except in compliance with
8 | * the License. You may obtain a copy of the License at
9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0
11 | *
12 | * Unless required by applicable law or agreed to in writing, software
13 | * distributed under the License is distributed on an "AS IS" BASIS,
14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | * See the License for the specific language governing permissions and
16 | * limitations under the License.
17 | */
18 | package org.sbs.goodcrawler.extractor;
19 |
20 | /**
21 | * @author shenbaise(shenbaise1001@126.com)
22 | * @desc Element interface; it declares no methods of its own.
23 | */
24 | public interface GCElement {
25 |
26 | }
27 |
--------------------------------------------------------------------------------
/src/main/java/org/sbs/goodcrawler/extractor/GCPage.java:
--------------------------------------------------------------------------------
1 | /**
2 | * ########################## GoodCrawler ############################
3 | * Licensed to the Apache Software Foundation (ASF) under one or more
4 | * contributor license agreements. See the NOTICE file distributed with
5 | * this work for additional information regarding copyright ownership.
6 | * The ASF licenses this file to You under the Apache License, Version 2.0
7 | * (the "License"); you may not use this file except in compliance with
8 | * the License. You may obtain a copy of the License at
9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0
11 | *
12 | * Unless required by applicable law or agreed to in writing, software
13 | * distributed under the License is distributed on an "AS IS" BASIS,
14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | * See the License for the specific language governing permissions and
16 | * limitations under the License.
17 | */
18 | package org.sbs.goodcrawler.extractor;
19 |
20 | /**
21 | * @author shenbaise(shenbaise1001@126.com)
22 | * @desc Uses GCPage in place of UrlElementCssSelector, and also adds htmlunit's HTMLPage.
23 | */
24 | public interface GCPage {
25 |
26 | }
27 |
--------------------------------------------------------------------------------
/src/main/java/org/sbs/goodcrawler/extractor/htmlelment/CommonHtmlElement.java:
--------------------------------------------------------------------------------
1 | /**
2 | * ########################## GoodCrawler ############################
3 | * Licensed to the Apache Software Foundation (ASF) under one or more
4 | * contributor license agreements. See the NOTICE file distributed with
5 | * this work for additional information regarding copyright ownership.
6 | * The ASF licenses this file to You under the Apache License, Version 2.0
7 | * (the "License"); you may not use this file except in compliance with
8 | * the License. You may obtain a copy of the License at
9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0
11 | *
12 | * Unless required by applicable law or agreed to in writing, software
13 | * distributed under the License is distributed on an "AS IS" BASIS,
14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | * See the License for the specific language governing permissions and
16 | * limitations under the License.
17 | */
18 | package org.sbs.goodcrawler.extractor.htmlelment;
19 |
20 | import java.net.URL;
21 | import java.util.HashMap;
22 | import java.util.Map;
23 |
24 | import org.sbs.goodcrawler.fetcher.AjaxCallFetcher;
25 |
26 | /**
27 | * @author shenbaise(shenbaise1001@126.com)
28 | * @desc 通用的HtmlElement提取器
29 | */
30 | public class CommonHtmlElement extends AbstractHtmlElement
50 |
51 |
--------------------------------------------------------------------------------
/src/test/java/org/sbs/AppTest.java:
--------------------------------------------------------------------------------
1 | package org.sbs;
2 |
3 | import junit.framework.Test;
4 | import junit.framework.TestCase;
5 | import junit.framework.TestSuite;
6 |
7 | /**
8 | * Unit test for simple App.
9 | */
10 | public class AppTest
11 | extends TestCase
12 | {
13 | /**
14 | * Create the test case
15 | *
16 | * @param testName name of the test case
17 | */
18 | public AppTest( String testName )
19 | {
20 | super( testName );
21 | }
22 |
23 | /**
24 | * @return the suite of tests being tested
25 | */
26 | public static Test suite()
27 | {
28 | return new TestSuite( AppTest.class );
29 | }
30 |
31 | /**
32 | * Rigorous Test :-) (placeholder: it only asserts {@code true}).
33 | */
34 | public void testApp()
35 | {
36 | assertTrue( true );
37 | }
38 | }
39 |
--------------------------------------------------------------------------------
/src/test/java/org/sbs/ListLinks.java:
--------------------------------------------------------------------------------
1 | package org.sbs;
2 |
3 | import java.io.IOException;
4 |
5 | import org.jsoup.Jsoup;
6 | import org.jsoup.nodes.Document;
7 | import org.jsoup.nodes.Element;
8 | import org.jsoup.safety.Whitelist;
9 | import org.jsoup.select.Elements;
10 |
11 | public class ListLinks {
12 | public static void main(String[] args) throws IOException {
13 | String url = "http://www.66e.cc/bd/20110611/7de42f3f410b41b99f55854553be25e1.htm";
14 | // Fetch and parse the page with jsoup.
15 | Document doc = Jsoup.connect(url).get();
16 | // Print each <p> element piece by piece to stderr with all tags removed (Whitelist.none()). NOTE(review): the split delimiter below looks garbled in this listing - confirm against the original file.
17 | Elements elements = doc.select("p");
18 | for(Element e:elements){
19 | // System.out.println(e.text());
20 | String[] ss = e.toString().split("
");
21 | for(String s:ss){
22 | System.err.println(Jsoup.clean(s, Whitelist.none()));
23 | }
24 | }
25 | // Print the text of every anchor whose href starts with "ftp"; the inner select is run on each anchor again before the anchor's own text is printed.
26 | Elements links = doc.select("a[href^=ftp]");
27 | for(Element e:links){
28 | Elements xxElements = e.select("a[href^=ftp]");
29 | for(Element x:xxElements){
30 | System.out.println(x.text());
31 | }
32 | System.out.println(e.text());
33 | }
34 | }
35 | }
--------------------------------------------------------------------------------
/src/test/java/org/sbs/T.java:
--------------------------------------------------------------------------------
1 | package org.sbs;
2 |
3 | import java.io.IOException;
4 | import java.util.regex.Matcher;
5 | import java.util.regex.Pattern;
6 |
7 | public class T {
8 |
9 | /**
10 | * Starts a console watcher thread that exits the JVM when "quit" is typed on stdin.
11 | * @param args command-line arguments (unused)
12 | */
13 | public static void main(String[] args) {
14 |
15 | // Reads stdin in chunks of up to 128 bytes until end of stream or "quit".
16 | new Thread(new Runnable() {
17 |
18 | @Override
19 | public void run() {
20 | byte[] b = new byte[128];
21 | while(true){
22 | try {
23 | int i = System.in.read(b);
24 | if(i < 0){
25 | // End of stream: stop instead of spinning forever on -1.
26 | return;
27 | }
28 | // Decode only the bytes just read; the rest of b may hold stale data from an earlier read.
29 | String input = new String(b, 0, i);
30 | input = input.replace("\n", "").replace("\r", "").trim();
31 | if(input.equalsIgnoreCase("quit")){
32 | System.exit(0);
33 | }
34 | } catch (IOException e) {
35 | e.printStackTrace();
36 | }
37 | }
38 | }
39 | }).start();
40 | // String textString = "片名:床的另一边发布时间:2013-07-18评分:7.2";
41 | // Pattern pattern = Pattern.compile("(\\d{1}[.]\\d{1,2})");
42 | // Matcher m = pattern.matcher(textString);
43 | // while(m.find()){
44 | // System.out.println(m.group(1));
45 | // }
46 |
47 | // pattern = Pattern.compile("http://.*.wasu.cn/Play/show/id/\\d+");
48 | // System.out.println(pattern.matcher("http://www.wasu.cn/Play/show/id/216708").matches());
49 | // System.out.println(Pattern.matches("http://.*.wasu.cn/Play/show/id/\\d+", "http://www.wasu.cn/Play/show/id/216708"));
50 | //
51 |
52 | }
53 |
54 | }
55 |
--------------------------------------------------------------------------------
/src/test/java/org/sbs/extract/TestWasu.java:
--------------------------------------------------------------------------------
1 | package org.sbs.extract;
2 |
3 | import java.io.File;
4 | import java.io.IOException;
5 | import java.net.MalformedURLException;
6 | import java.net.URL;
7 | import java.util.Map;
8 |
9 | import org.jsoup.Jsoup;
10 | import org.jsoup.nodes.Document;
11 | import org.junit.Test;
12 | import org.sbs.goodcrawler.bootstrap.foreman.FetchForeman;
13 | import org.sbs.goodcrawler.exception.ConfigurationException;
14 | import org.sbs.goodcrawler.exception.ExtractException;
15 | import org.sbs.goodcrawler.jobconf.ExtractConfig;
16 | import org.sbs.goodcrawler.jobconf.FetchConfig;
17 |
18 | public class TestWasu {
19 | // Manual extraction checks against wasu.cn pages; each test passes the Wasu config file and one page URL to Tester.test.
20 | private static String confFile = "conf/wasu_conf.xml";
21 | // TV series: 新闺蜜时代
22 | private static String tv = "http://www.wasu.cn/Tele/index/id/1146253";
23 | // http://v.youku.com/v_show/id_XNTc4NzczNDM2.html
24 | // Movie: 赤裸特工
25 | private static String movie = "http://www.wasu.cn/Play/show/id/216708";
26 | // http://v.youku.com/v_show/id_XNjY4ODk3MjI4.html
27 | // Variety show: 唱出爱火花_20140220_张潇洋 开门见山 女王驾到
28 | private static String zy = "http://www.wasu.cn/Play/show/id/2194006";
29 | // Animation: 海绵宝宝
30 | private static String dm = "http://www.wasu.cn/Tele/index/id/351623";
31 |
32 | private static String zwdajs = "http://www.wasu.cn/Tele/index/id/1007994";
33 | // Note: tv() uses a hard-coded URL rather than the tv field above.
34 | @Test
35 | public void tv(){
36 | new Tester().test(confFile, "http://www.wasu.cn/Tele/index/id/2169706");
37 | }
38 |
39 | @Test
40 | public void movie(){
41 | new Tester().test(confFile, movie);
42 | }
43 |
44 | @Test
45 | public void zy(){
46 | new Tester().test(confFile, zy);
47 | }
48 |
49 | @Test
50 | public void dm(){
51 |
52 | new Tester().test(confFile,dm);
53 | }
54 | // Fetches one page directly with jsoup (100000 ms timeout) and prints the second match of ".play_seat a".
55 | @Test
56 | public void t(){
57 | try {
58 | Document elements = Jsoup.parse(new URL("http://www.wasu.cn/Play/show/id/2339162"), 100000);
59 | // Category
60 | // System.out.println(elements.select(".play_information_t .r .one a"));// title
61 | // System.out.println(elements.select(".play_information_t .r .two a "));// area
62 | // System.out.println(elements.select(".play_information_t .r .three a")); // type
63 | // System.out.println(elements.select(".play_information_t .r .four a")); // director
64 | // System.out.println(elements.select(".play_information_t .r .five .r a")); // actor
65 | // System.out.println(elements.select(".play_information_b .one b"));
66 |
67 | System.out.println(elements.select(".play_seat a").get(1));
68 | } catch (IOException e) {
69 | e.printStackTrace();
70 | }
71 | }
72 | }
73 |
--------------------------------------------------------------------------------
/src/test/java/org/sbs/extract/TestYouku.java:
--------------------------------------------------------------------------------
1 | package org.sbs.extract;
2 |
3 | import java.io.File;
4 | import java.io.IOException;
5 | import java.net.URL;
6 | import java.util.Map;
7 |
8 | import org.jsoup.Jsoup;
9 | import org.jsoup.nodes.Document;
10 | import org.junit.Test;
11 | import org.sbs.goodcrawler.bootstrap.foreman.FetchForeman;
12 | import org.sbs.goodcrawler.exception.ConfigurationException;
13 | import org.sbs.goodcrawler.exception.ExtractException;
14 | import org.sbs.goodcrawler.jobconf.ExtractConfig;
15 | import org.sbs.goodcrawler.jobconf.FetchConfig;
16 |
17 | public class TestYouku {
18 | // Manual extraction checks against youku.com pages; each test passes a config file and one page URL to Tester.test.
19 | private static String confFile = "conf/youku_conf.xml";
20 | // TV series: 精忠岳飞 2013
21 | private static String tv = "http://www.youku.com/show_page/id_zd4edea60e0d011df97c0.html";
22 | // http://v.youku.com/v_show/id_XNTc4NzczNDM2.html
23 | // Movie: 上帝保佑美国 2012
24 | private static String movie = "http://www.youku.com/show_page/id_z0fca04b0bceb11e0bf93.html";
25 | // http://v.youku.com/v_show/id_XNjY4ODk3MjI4.html
26 | // Variety show: 快乐大本营
27 | private static String zy = "http://www.youku.com/show_page/id_zd18a7caa2d4311e29498.html";
28 | // Animation: 柯南 (note: dm() below uses its own config file and URL instead of this field)
29 | private static String dm = "http://www.youku.com/show_page/id_zcc003400962411de83b1.html";
30 | @Test
31 | public void tv(){
32 | new Tester().test(confFile, tv);
33 | }
34 |
35 | @Test
36 | public void movie(){
37 | new Tester().test(confFile, movie);
38 | }
39 |
40 | @Test
41 | public void zy(){
42 | new Tester().test(confFile, zy);
43 | }
44 |
45 | @Test
46 | public void dm(){
47 | new Tester().test("conf/test_youku_dm.xml", "http://v.youku.com/v_show/id_XMzk1NjM1MjAw.html");
48 | }
49 | }
50 |
--------------------------------------------------------------------------------
/src/test/java/org/sbs/extract/Tester.java:
--------------------------------------------------------------------------------
1 | package org.sbs.extract;
2 |
3 | import java.io.File;
4 | import java.io.IOException;
5 | import java.net.URL;
6 | import java.util.Map;
7 |
8 | import org.jsoup.Jsoup;
9 | import org.jsoup.nodes.Document;
10 | import org.sbs.goodcrawler.bootstrap.foreman.FetchForeman;
11 | import org.sbs.goodcrawler.exception.ConfigurationException;
12 | import org.sbs.goodcrawler.exception.ExtractException;
13 | import org.sbs.goodcrawler.jobconf.ExtractConfig;
14 | import org.sbs.goodcrawler.jobconf.FetchConfig;
15 |
16 | public class Tester {
17 | // Parses conFile as UTF-8, loads the extract and fetch configs from it, starts a FetchForeman, then prints the map returned by extractConfig.getContentSeprator for the page fetched from url (10000 ms timeout). Exceptions are only printed, never rethrown.
18 | public void test(String conFile,String url){
19 | ExtractConfig extractConfig = new ExtractConfig();
20 | FetchConfig fetchConfig = new FetchConfig();
21 | Document document;
22 | try {
23 | document = Jsoup.parse(new File(conFile), "utf-8");
24 | System.out.println(extractConfig.loadConfig(document).toString());
25 | FetchForeman fetchForeman = new FetchForeman();
26 | fetchForeman.start(fetchConfig.loadConfig(document));
27 | Map r=extractConfig
28 | .getContentSeprator(Jsoup.parse(new URL(url), 10000),url);
29 | System.out.println(r);
30 | }catch (IOException e) {
31 | e.printStackTrace();
32 | } catch (ConfigurationException e) {
33 | e.printStackTrace();
34 | } catch (ExtractException e) {
35 | e.printStackTrace();
36 | }
37 | }
38 |
39 | }
40 |
--------------------------------------------------------------------------------
/src/test/java/org/sbs/htmlunit/element/GcElementTest.java:
--------------------------------------------------------------------------------
1 | /**
2 | * @project goodcrawler
3 | * @file GcElementTest.java
4 | * @time 2013-12-19 16:58:46
5 | * @author shenbaise(shenbaise1001@126.com)
6 | * @description
7 | */
8 | package org.sbs.htmlunit.element;
9 |
10 | import org.junit.Test;
11 | import org.sbs.goodcrawler.extractor.GCElement;
12 | import org.sbs.goodcrawler.extractor.selector.AbstractElementCssSelector;
13 | import org.sbs.goodcrawler.extractor.selector.StringElementCssSelector;
14 |
15 | /**
16 | * @author shenbaise(shenbaise1001@126.com)
17 | * @desc Checks which types a StringElementCssSelector is an instance of.
18 | */
19 | public class GcElementTest {
20 | // Uses plain Java assert statements, which are only evaluated when the JVM runs with assertions enabled (-ea).
21 | @Test
22 | public void insTest(){
23 | StringElementCssSelector secsCssSelector = new StringElementCssSelector("", "", "", true, 0,"");
24 | System.out.println(secsCssSelector instanceof GCElement);
25 | assert secsCssSelector instanceof AbstractElementCssSelector;
26 | assert secsCssSelector instanceof GCElement;
27 | // assert secsCssSelector instanceof AbstractHtmlElement;
28 | }
29 | }
30 |
--------------------------------------------------------------------------------
/src/test/java/org/sbs/htmlunit/element/package-info.java:
--------------------------------------------------------------------------------
1 | /**
2 | * @project goodcrawler
3 | * @file package-info.java
4 | * @time 2013-12-19 16:57:57
5 | * @author shenbaise(shenbaise1001@126.com)
6 | * @description
7 | */
8 | /**
9 | * @author shenbaise(shenbaise1001@126.com)
10 | * @desc
11 | */
12 | package org.sbs.htmlunit.element;
--------------------------------------------------------------------------------
/src/test/java/org/sbs/htmlunit/package-info.java:
--------------------------------------------------------------------------------
1 | /**
2 | * ########################## GoodCrawler ############################
3 | * Licensed to the Apache Software Foundation (ASF) under one or more
4 | * contributor license agreements. See the NOTICE file distributed with
5 | * this work for additional information regarding copyright ownership.
6 | * The ASF licenses this file to You under the Apache License, Version 2.0
7 | * (the "License"); you may not use this file except in compliance with
8 | * the License. You may obtain a copy of the License at
9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0
11 | *
12 | * Unless required by applicable law or agreed to in writing, software
13 | * distributed under the License is distributed on an "AS IS" BASIS,
14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | * See the License for the specific language governing permissions and
16 | * limitations under the License.
17 | */
18 | package org.sbs.htmlunit;
--------------------------------------------------------------------------------
/start.bat:
--------------------------------------------------------------------------------
1 | @echo off
2 | @set LOCALCLASSPATH=./target/classes/
3 | @for %%i in (".\WebContent\WEB-INF\lib\*.jar") do call "setpath.bat" %%i
4 | rem setpath.bat appends each jar matched above to LOCALCLASSPATH.
5 | rem Prepend LOCALCLASSPATH to CLASSPATH, then start org.sbs.jetty.StartServer with a JDWP debug agent on port 8787.
6 | set CLASSPATH=%LOCALCLASSPATH%;%CLASSPATH%
7 | echo on
8 | rem java -Dxport="%1" -Xmx512m com.jetty.MyServer %2 %3 %4
9 | java -Xms512m -Xmx512m -Dxport=8080 -Xrunjdwp:transport=dt_socket,server=y,suspend=n,address=8787 org.sbs.jetty.StartServer
--------------------------------------------------------------------------------