├── .gitignore
├── setpath.bat
├── src
├── main
│ ├── webapp
│ │ ├── META-INF
│ │ │ └── MANIFEST.MF
│ │ ├── index.jsp
│ │ └── WEB-INF
│ │ │ └── web.xml
│ ├── java
│ │ └── org
│ │ │ └── sbs
│ │ │ ├── goodcrawler
│ │ │ ├── extractor
│ │ │ │ ├── htmlelment
│ │ │ │ │ ├── package-info.java
│ │ │ │ │ ├── HtmlElementExtractType.java
│ │ │ │ │ ├── HtmlElementType.java
│ │ │ │ │ ├── HtmlPageElement.java
│ │ │ │ │ ├── HtmlAnchorElementOfPage.java
│ │ │ │ │ ├── HtmlAnchorElementOfString.java
│ │ │ │ │ └── CommonHtmlElement.java
│ │ │ │ ├── selector
│ │ │ │ │ ├── SelectPageElement.java
│ │ │ │ │ ├── package-info.java
│ │ │ │ │ ├── action
│ │ │ │ │ │ ├── package-info.java
│ │ │ │ │ │ ├── list
│ │ │ │ │ │ │ ├── package-info.java
│ │ │ │ │ │ │ └── ListFilterAction.java
│ │ │ │ │ │ ├── string
│ │ │ │ │ │ │ ├── package-info.java
│ │ │ │ │ │ │ ├── StringActionType.java
│ │ │ │ │ │ │ ├── StringPerfixAction.java
│ │ │ │ │ │ │ ├── StringSuffixAction.java
│ │ │ │ │ │ │ ├── StringAfterLastAction.java
│ │ │ │ │ │ │ ├── StringBeforeAction.java
│ │ │ │ │ │ │ ├── StringBeforeLastAction.java
│ │ │ │ │ │ │ ├── StringAfterAction.java
│ │ │ │ │ │ │ └── StringReplaceAction.java
│ │ │ │ │ │ ├── integer
│ │ │ │ │ │ │ ├── package-info.java
│ │ │ │ │ │ │ ├── IntegerActionType.java
│ │ │ │ │ │ │ ├── IntegerAbsAction.java
│ │ │ │ │ │ │ └── IntegerBetweenAction.java
│ │ │ │ │ │ ├── SelectorAction.java
│ │ │ │ │ │ ├── IntegerSelectorAction.java
│ │ │ │ │ │ ├── EmptyAction.java
│ │ │ │ │ │ ├── StringSelectorAction.java
│ │ │ │ │ │ ├── file
│ │ │ │ │ │ │ └── FileActionType.java
│ │ │ │ │ │ ├── ListSelectorAction.java
│ │ │ │ │ │ └── FileSelectAction.java
│ │ │ │ │ ├── exception
│ │ │ │ │ │ ├── package-info.java
│ │ │ │ │ │ ├── SelectorConfigException.java
│ │ │ │ │ │ ├── DownLoadException.java
│ │ │ │ │ │ └── IntegerBetweenExpressionException.java
│ │ │ │ │ ├── expression
│ │ │ │ │ │ ├── SimpleExpression.java
│ │ │ │ │ │ ├── GrExpression.java
│ │ │ │ │ │ └── SimpleExpressionExtent.java
│ │ │ │ │ ├── SelectorAttr.java
│ │ │ │ │ └── SelectorType.java
│ │ │ │ ├── package-info.java
│ │ │ │ ├── template
│ │ │ │ │ ├── package-info.java
│ │ │ │ │ └── ExtractTemplate.java
│ │ │ │ ├── GCElement.java
│ │ │ │ ├── ExtractResult.java
│ │ │ │ ├── GCPage.java
│ │ │ │ └── DefaultExtractWorker.java
│ │ │ ├── page
│ │ │ │ ├── ExtractedUrlAnchorPair.java
│ │ │ │ ├── ParseData.java
│ │ │ │ ├── package-info.java
│ │ │ │ ├── BinaryParseData.java
│ │ │ │ ├── TextParseData.java
│ │ │ │ ├── HtmlParseData.java
│ │ │ │ └── Parser.java
│ │ │ ├── plugin
│ │ │ │ ├── storage
│ │ │ │ │ ├── p
│ │ │ │ │ │ └── IESStoragePlugin.java
│ │ │ │ │ ├── package-info.java
│ │ │ │ │ ├── Prepare.java
│ │ │ │ │ └── MovieSource.java
│ │ │ │ ├── package-info.java
│ │ │ │ ├── extract
│ │ │ │ │ └── package-info.java
│ │ │ │ ├── classloader
│ │ │ │ │ └── PluginClassLoader.java
│ │ │ │ ├── ReIndex.java
│ │ │ │ └── IndexScanner.java
│ │ │ ├── conf
│ │ │ │ ├── package-info.java
│ │ │ │ ├── Configuration.java
│ │ │ │ ├── Configurable.java
│ │ │ │ ├── Worker.java
│ │ │ │ └── GlobalConstants.java
│ │ │ ├── jobconf
│ │ │ │ ├── package-info.java
│ │ │ │ └── JobConfig.java
│ │ │ ├── bootstrap
│ │ │ │ ├── package-info.java
│ │ │ │ ├── foreman
│ │ │ │ │ ├── package-info.java
│ │ │ │ │ ├── Foreman.java
│ │ │ │ │ ├── ExtractForeman.java
│ │ │ │ │ ├── FetchForeman.java
│ │ │ │ │ └── StoreForeman.java
│ │ │ │ └── CrawlerStatus.java
│ │ │ ├── exception
│ │ │ │ ├── package-info.java
│ │ │ │ ├── ExtractException.java
│ │ │ │ ├── QueueException.java
│ │ │ │ └── ConfigurationException.java
│ │ │ ├── fetcher
│ │ │ │ ├── package-info.java
│ │ │ │ ├── FetcherType.java
│ │ │ │ ├── FetchStatus.java
│ │ │ │ ├── FetcherInstance.java
│ │ │ │ └── IdleConnectionMonitorThread.java
│ │ │ ├── storage
│ │ │ │ ├── package-info.java
│ │ │ │ ├── StorageType.java
│ │ │ │ ├── Storage.java
│ │ │ │ └── StoreResult.java
│ │ │ └── schedule
│ │ │ │ └── RecrawFetherWorkor.java
│ │ │ ├── url
│ │ │ ├── package-info.java
│ │ │ ├── TLDList.java
│ │ │ ├── UrlSignatureSet.java
│ │ │ └── UlrFilters.java
│ │ │ ├── pendingqueue
│ │ │ ├── package-info.java
│ │ │ ├── PendingPages.java
│ │ │ ├── PendRecraw.java
│ │ │ ├── PendingStore.java
│ │ │ └── PendingUrls.java
│ │ │ ├── web
│ │ │ ├── package-info.java
│ │ │ ├── ContextListener.java
│ │ │ ├── GoodServlet.java
│ │ │ ├── Stop.java
│ │ │ ├── Status.java
│ │ │ ├── CrawlerManager.java
│ │ │ └── Start.java
│ │ │ ├── util
│ │ │ ├── CheckIfUniqueUrl.java
│ │ │ ├── CheckIfUniqueUrlByMd5.java
│ │ │ ├── UrlUtils.java
│ │ │ ├── download
│ │ │ │ └── DownLoadPool.java
│ │ │ ├── image
│ │ │ │ └── ImageResizePool.java
│ │ │ ├── IO.java
│ │ │ ├── CheckIfUniqueUrlByBloomfilter.java
│ │ │ ├── JsonUtil.java
│ │ │ ├── Simhash.java
│ │ │ ├── CharUtil.java
│ │ │ └── StringHelper.java
│ │ │ ├── jetty
│ │ │ ├── StartServer.java
│ │ │ └── JettyFactory.java
│ │ │ └── robotstxt
│ │ │ ├── RuleSet.java
│ │ │ ├── RobotstxtConfig.java
│ │ │ └── HostDirectives.java
│ └── resources
│ │ ├── conf.properties
│ │ ├── logback.xml
│ │ ├── default_mapping.json
│ │ └── mapping.json
└── test
│ └── java
│ └── org
│ └── sbs
│ ├── htmlunit
│ ├── element
│ │ ├── package-info.java
│ │ └── GcElementTest.java
│ └── package-info.java
│ ├── AppTest.java
│ ├── ListLinks.java
│ ├── extract
│ ├── Tester.java
│ ├── TestYouku.java
│ └── TestWasu.java
│ └── T.java
├── start.bat
└── README.md
/.gitignore:
--------------------------------------------------------------------------------
1 | /target
2 |
--------------------------------------------------------------------------------
/setpath.bat:
--------------------------------------------------------------------------------
1 | @rem Appends the first argument to LOCALCLASSPATH, separated by a semicolon.
2 | @set LOCALCLASSPATH=%LOCALCLASSPATH%;%1
--------------------------------------------------------------------------------
/src/main/webapp/META-INF/MANIFEST.MF:
--------------------------------------------------------------------------------
1 | Manifest-Version: 1.0
2 | Class-Path:
3 |
4 |
--------------------------------------------------------------------------------
/src/test/java/org/sbs/htmlunit/element/package-info.java:
--------------------------------------------------------------------------------
1 | /**
2 | * @工程 goodcrawler
3 | * @文件 package-info.java
4 | * @时间 2013年12月19日 下午4:57:57
5 | * @作者 shenbaise(shenbaise1001@126.com)
6 | * @描述
7 | */
8 | /**
9 | * @author shenbaise(shenbaise1001@126.com)
10 | * @desc
11 | */
12 | package org.sbs.htmlunit.element;
--------------------------------------------------------------------------------
/src/main/java/org/sbs/goodcrawler/extractor/htmlelment/package-info.java:
--------------------------------------------------------------------------------
1 | /**
2 | * @工程 goodcrawler
3 | * @文件 package-info.java
4 | * @时间 2013年12月16日 下午6:39:16
5 | * @作者 shenbaise(shenbaise1001@126.com)
6 | * @描述
7 | */
8 | /**
9 | * @author shenbaise(shenbaise1001@126.com)
10 | * @desc Usage of the HtmlElement objects that correspond to HtmlUnit
11 | */
12 | package org.sbs.goodcrawler.extractor.htmlelment;
--------------------------------------------------------------------------------
/start.bat:
--------------------------------------------------------------------------------
1 | @echo off
2 | rem Starts org.sbs.jetty.StartServer with a fixed 512 MB heap and a JDWP debug socket on port 8787.
3 | rem The classpath is ./target/classes/ plus every jar under .\WebContent\WEB-INF\lib,
4 | rem each appended to LOCALCLASSPATH by setpath.bat.
5 | @set LOCALCLASSPATH=./target/classes/
6 | @for %%i in (".\WebContent\WEB-INF\lib\*.jar") do call "setpath.bat" %%i
7 |
8 | set CLASSPATH=%LOCALCLASSPATH%;%CLASSPATH%
9 | rem Earlier launch command, kept for reference. It was prefixed with "#", which cmd does not
10 | rem treat as a comment marker, so it was executed as a (failing) command on every start:
11 | rem java -Dxport="%1" -Xmx512m com.jetty.MyServer %2 %3 %4
12 | echo on
13 | java -Xms512m -Xmx512m -Dxport=8080 -Xrunjdwp:transport=dt_socket,server=y,suspend=n,address=8787 org.sbs.jetty.StartServer
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | goodcrawler(web crawler) 网络爬虫
2 | ===========
3 |
4 | ---- standalone version https://github.com/shenbaise/goodcrawler/tree/standalone
5 |
6 | This project is under development.
7 |
8 | It aims to be a good web crawler for Java.
9 |
10 |
11 | LICENSE
12 | -------------------
13 | Apache License, Version 2.0
14 | http://www.apache.org/licenses/LICENSE-2.0 ( TXT or HTML )
15 |
16 | shenbaise1001@126.com
17 |
--------------------------------------------------------------------------------
/src/main/java/org/sbs/goodcrawler/extractor/htmlelment/HtmlElementExtractType.java:
--------------------------------------------------------------------------------
1 | /**
2 | * @工程 goodcrawler
3 | * @文件 HtmlElementExtractType.java
4 | * @时间 2013年12月18日 下午5:13:58
5 | * @作者 shenbaise(shenbaise1001@126.com)
6 | * @描述
7 | */
8 | package org.sbs.goodcrawler.extractor.htmlelment;
9 |
10 | /**
11 |  * Ways of locating an HTML element for extraction: by XPath or by id.
12 |  *
13 |  * @author shenbaise(shenbaise1001@126.com)
14 |  */
15 | public enum HtmlElementExtractType {
16 |     xpath,
17 |     id
18 | }
19 |
--------------------------------------------------------------------------------
/src/main/java/org/sbs/goodcrawler/extractor/selector/SelectPageElement.java:
--------------------------------------------------------------------------------
1 | /**
2 | * @工程 goodcrawler
3 | * @文件 SelectPageElement.java
4 | * @时间 2013年12月19日 下午5:32:37
5 | * @作者 shenbaise(shenbaise1001@126.com)
6 | * @描述
7 | */
8 | package org.sbs.goodcrawler.extractor.selector;
9 |
10 | import org.sbs.goodcrawler.extractor.GCPage;
11 |
12 | /**
13 | * @author shenbaise(shenbaise1001@126.com)
14 | * @desc Currently an empty {@link GCPage} implementation that declares no members of its own.
15 | */
16 | public class SelectPageElement implements GCPage {
17 |
18 | }
19 |
--------------------------------------------------------------------------------
/src/main/java/org/sbs/goodcrawler/page/ExtractedUrlAnchorPair.java:
--------------------------------------------------------------------------------
1 | package org.sbs.goodcrawler.page;
2 |
3 | /**
4 |  * Mutable holder for two strings, {@code href} and {@code anchor}.
5 |  * Both values are {@code null} until assigned through their setters.
6 |  */
7 | public class ExtractedUrlAnchorPair {
8 |
9 |     private String href;
10 |     private String anchor;
11 |
12 |     /** @return the current href value, or {@code null} if none has been set */
13 |     public String getHref() {
14 |         return this.href;
15 |     }
16 |
17 |     /** @param href the href value to store */
18 |     public void setHref(String href) {
19 |         this.href = href;
20 |     }
21 |
22 |     /** @return the current anchor value, or {@code null} if none has been set */
23 |     public String getAnchor() {
24 |         return this.anchor;
25 |     }
26 |
27 |     /** @param anchor the anchor value to store */
28 |     public void setAnchor(String anchor) {
29 |         this.anchor = anchor;
30 |     }
31 | }
32 |
--------------------------------------------------------------------------------
/src/main/resources/conf.properties:
--------------------------------------------------------------------------------
1 | # Size of the queue of URLs waiting to be processed
2 | pending.urls.queue.size=500000
3 | # Size of the queue of pages waiting to be processed
4 | pending.pages.queue.size=100000
5 | # Size of the queue of pages whose processing failed
6 | failed.pages.queue.size=1000
7 | # NOTE(review): presumably the size of the queue of pages awaiting storage - confirm against the code that reads this key
8 | pending.store.pages.queue.size=2000
9 | # Backup path for pages that failed parsing
10 | failed.pages.backup.path=./failed-pages/
11 | # Whether to ignore erroneous pages or pages that failed parsing
12 | ignore.failed.pages=false
13 | status.save.path=./status
14 |
--------------------------------------------------------------------------------
/src/main/java/org/sbs/goodcrawler/plugin/storage/p/IESStoragePlugin.java:
--------------------------------------------------------------------------------
1 | /**
2 | *
3 | */
4 | package org.sbs.goodcrawler.plugin.storage.p;
5 |
6 | import org.sbs.goodcrawler.jobconf.StoreConfig;
7 | import org.sbs.goodcrawler.page.ExtractedPage;
8 |
9 | /**
10 |  * Extension point for ES storage. Subclasses implement {@link #process} to handle an
11 |  * extracted page; the {@link StoreConfig} is held here and exposed through a getter and setter.
12 |  *
13 |  * @author shenbaise(shenbaise1001@126.com)
14 |  */
15 | public abstract class IESStoragePlugin {
16 |     /** Store configuration; {@code null} until {@link #setConfig} is called. */
17 |     protected StoreConfig config;
18 |
19 |     /** @return the configuration currently assigned to this plugin, possibly {@code null} */
20 |     public StoreConfig getConfig() {
21 |         return config;
22 |     }
23 |
24 |     /** @param config the configuration this plugin should use */
25 |     public void setConfig(StoreConfig config) {
26 |         this.config = config;
27 |     }
28 |
29 |     /**
30 |      * Processes one extracted page.
31 |      *
32 |      * @param page the page handed to the plugin
33 |      * @return the page to continue with (NOTE(review): whether this may be the same instance or null is not visible here)
34 |      */
35 |     public abstract ExtractedPage<?, ?> process(ExtractedPage<?, ?> page);
36 | }
37 |
--------------------------------------------------------------------------------
/src/test/java/org/sbs/AppTest.java:
--------------------------------------------------------------------------------
1 | package org.sbs;
2 |
3 | import junit.framework.Test;
4 | import junit.framework.TestCase;
5 | import junit.framework.TestSuite;
6 |
7 | /**
8 |  * Minimal JUnit 3 test case whose single test always passes.
9 |  */
10 | public class AppTest extends TestCase {
11 |
12 |     /**
13 |      * Creates the test case.
14 |      *
15 |      * @param testName name of the test case
16 |      */
17 |     public AppTest(String testName) {
18 |         super(testName);
19 |     }
20 |
21 |     /**
22 |      * @return a suite built from the tests declared in this class
23 |      */
24 |     public static Test suite() {
25 |         return new TestSuite(AppTest.class);
26 |     }
27 |
28 |     /**
29 |      * Asserts a constant {@code true}.
30 |      */
31 |     public void testApp() {
32 |         assertTrue(true);
33 |     }
34 | }
35 |
--------------------------------------------------------------------------------
/src/test/java/org/sbs/htmlunit/package-info.java:
--------------------------------------------------------------------------------
1 | /**
2 | * ########################## GoodCrawler ############################
3 | * Licensed to the Apache Software Foundation (ASF) under one or more
4 | * contributor license agreements. See the NOTICE file distributed with
5 | * this work for additional information regarding copyright ownership.
6 | * The ASF licenses this file to You under the Apache License, Version 2.0
7 | * (the "License"); you may not use this file except in compliance with
8 | * the License. You may obtain a copy of the License at
9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0
11 | *
12 | * Unless required by applicable law or agreed to in writing, software
13 | * distributed under the License is distributed on an "AS IS" BASIS,
14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | * See the License for the specific language governing permissions and
16 | * limitations under the License.
17 | */
18 | package org.sbs.htmlunit;
--------------------------------------------------------------------------------
/src/main/java/org/sbs/url/package-info.java:
--------------------------------------------------------------------------------
1 | /**
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 | /**
18 | * @author shenbaise(shenbaise@outlook.com)
19 | * @date 2013-6-29
20 | */
21 | package org.sbs.url;
--------------------------------------------------------------------------------
/src/main/java/org/sbs/pendingqueue/package-info.java:
--------------------------------------------------------------------------------
1 | /**
2 | * ########################## GoodCrawler ############################
3 | * Licensed to the Apache Software Foundation (ASF) under one or more
4 | * contributor license agreements. See the NOTICE file distributed with
5 | * this work for additional information regarding copyright ownership.
6 | * The ASF licenses this file to You under the Apache License, Version 2.0
7 | * (the "License"); you may not use this file except in compliance with
8 | * the License. You may obtain a copy of the License at
9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0
11 | *
12 | * Unless required by applicable law or agreed to in writing, software
13 | * distributed under the License is distributed on an "AS IS" BASIS,
14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | * See the License for the specific language governing permissions and
16 | * limitations under the License.
17 | */
18 | package org.sbs.pendingqueue;
--------------------------------------------------------------------------------
/src/test/java/org/sbs/htmlunit/element/GcElementTest.java:
--------------------------------------------------------------------------------
1 | /**
2 | * @工程 goodcrawler
3 | * @文件 GcElementTest.java
4 | * @时间 2013年12月19日 下午4:58:46
5 | * @作者 shenbaise(shenbaise1001@126.com)
6 | * @描述
7 | */
8 | package org.sbs.htmlunit.element;
9 |
10 | import org.junit.Test;
11 | import org.sbs.goodcrawler.extractor.GCElement;
12 | import org.sbs.goodcrawler.extractor.selector.AbstractElementCssSelector;
13 | import org.sbs.goodcrawler.extractor.selector.StringElementCssSelector;
14 |
15 | /**
16 |  * Checks the runtime type relationships of {@link StringElementCssSelector}.
17 |  *
18 |  * @author shenbaise(shenbaise1001@126.com)
19 |  */
20 | public class GcElementTest {
21 |
22 |     /**
23 |      * Verifies that a StringElementCssSelector is both an AbstractElementCssSelector and a GCElement.
24 |      * JUnit assertions are used instead of the {@code assert} keyword, which is skipped unless the
25 |      * JVM runs with {@code -ea} and would therefore let this test pass unconditionally.
26 |      */
27 |     @Test
28 |     public void insTest() {
29 |         StringElementCssSelector secsCssSelector = new StringElementCssSelector("", "", "", true, 0, "");
30 |         System.out.println(secsCssSelector instanceof GCElement);
31 |         org.junit.Assert.assertTrue(secsCssSelector instanceof AbstractElementCssSelector);
32 |         org.junit.Assert.assertTrue(secsCssSelector instanceof GCElement);
33 |         // assert secsCssSelector instanceof AbstractHtmlElement;
34 |     }
35 | }
36 |
--------------------------------------------------------------------------------
/src/main/java/org/sbs/goodcrawler/page/ParseData.java:
--------------------------------------------------------------------------------
1 | /**
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | package org.sbs.goodcrawler.page;
19 |
20 | /**
21 |  * Interface whose only declared member is a redeclaration of {@link Object#toString()}.
22 |  */
23 | public interface ParseData {
24 |
25 |     /** @return the string form of this object */
26 |     @Override
27 |     String toString();
28 | }
29 |
--------------------------------------------------------------------------------
/src/main/java/org/sbs/web/package-info.java:
--------------------------------------------------------------------------------
1 | /**
2 | * ######################## SHENBAISE'S WORK ##########################
3 | * Licensed to the Apache Software Foundation (ASF) under one or more
4 | * contributor license agreements. See the NOTICE file distributed with
5 | * this work for additional information regarding copyright ownership.
6 | * The ASF licenses this file to You under the Apache License, Version 2.0
7 | * (the "License"); you may not use this file except in compliance with
8 | * the License. You may obtain a copy of the License at
9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0
11 | *
12 | * Unless required by applicable law or agreed to in writing, software
13 | * distributed under the License is distributed on an "AS IS" BASIS,
14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | * See the License for the specific language governing permissions and
16 | * limitations under the License.
17 | */
18 | /**
19 | * @author whiteme
20 | * @date 2013年7月29日
21 | * @desc
22 | */
23 | package org.sbs.web;
--------------------------------------------------------------------------------
/src/main/java/org/sbs/goodcrawler/conf/package-info.java:
--------------------------------------------------------------------------------
1 | /**
2 | * ########################## GoodCrawler ############################
3 | * Licensed to the Apache Software Foundation (ASF) under one or more
4 | * contributor license agreements. See the NOTICE file distributed with
5 | * this work for additional information regarding copyright ownership.
6 | * The ASF licenses this file to You under the Apache License, Version 2.0
7 | * (the "License"); you may not use this file except in compliance with
8 | * the License. You may obtain a copy of the License at
9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0
11 | *
12 | * Unless required by applicable law or agreed to in writing, software
13 | * distributed under the License is distributed on an "AS IS" BASIS,
14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | * See the License for the specific language governing permissions and
16 | * limitations under the License.
17 | */
18 | /**
19 | * @author shenbaise(shenbaise@outlook.com)
20 | * @date 2013-6-29
21 | */
22 | package org.sbs.goodcrawler.conf;
--------------------------------------------------------------------------------
/src/main/java/org/sbs/goodcrawler/jobconf/package-info.java:
--------------------------------------------------------------------------------
1 | /**
2 | * ######################## SHENBAISE'S WORK ##########################
3 | * Licensed to the Apache Software Foundation (ASF) under one or more
4 | * contributor license agreements. See the NOTICE file distributed with
5 | * this work for additional information regarding copyright ownership.
6 | * The ASF licenses this file to You under the Apache License, Version 2.0
7 | * (the "License"); you may not use this file except in compliance with
8 | * the License. You may obtain a copy of the License at
9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0
11 | *
12 | * Unless required by applicable law or agreed to in writing, software
13 | * distributed under the License is distributed on an "AS IS" BASIS,
14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | * See the License for the specific language governing permissions and
16 | * limitations under the License.
17 | */
18 | /**
19 | * @author whiteme
20 | * @date 2013年10月13日
21 | * @desc
22 | */
23 | package org.sbs.goodcrawler.jobconf;
--------------------------------------------------------------------------------
/src/main/java/org/sbs/goodcrawler/page/package-info.java:
--------------------------------------------------------------------------------
1 | /**
2 | * ########################## GoodCrawler ############################
3 | * Licensed to the Apache Software Foundation (ASF) under one or more
4 | * contributor license agreements. See the NOTICE file distributed with
5 | * this work for additional information regarding copyright ownership.
6 | * The ASF licenses this file to You under the Apache License, Version 2.0
7 | * (the "License"); you may not use this file except in compliance with
8 | * the License. You may obtain a copy of the License at
9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0
11 | *
12 | * Unless required by applicable law or agreed to in writing, software
13 | * distributed under the License is distributed on an "AS IS" BASIS,
14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | * See the License for the specific language governing permissions and
16 | * limitations under the License.
17 | */
18 | /**
19 | * @author shenbaise(shenbaise@outlook.com)
20 | * @date 2013-6-30
21 | */
22 | package org.sbs.goodcrawler.page;
--------------------------------------------------------------------------------
/src/main/java/org/sbs/goodcrawler/plugin/package-info.java:
--------------------------------------------------------------------------------
1 | /**
2 | * ########################## GoodCrawler ############################
3 | * Licensed to the Apache Software Foundation (ASF) under one or more
4 | * contributor license agreements. See the NOTICE file distributed with
5 | * this work for additional information regarding copyright ownership.
6 | * The ASF licenses this file to You under the Apache License, Version 2.0
7 | * (the "License"); you may not use this file except in compliance with
8 | * the License. You may obtain a copy of the License at
9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0
11 | *
12 | * Unless required by applicable law or agreed to in writing, software
13 | * distributed under the License is distributed on an "AS IS" BASIS,
14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | * See the License for the specific language governing permissions and
16 | * limitations under the License.
17 | */
18 | /**
19 | * @author shenbaise(shenbaise@outlook.com)
20 | * @date 2013-7-6
21 | */
22 | package org.sbs.goodcrawler.plugin;
--------------------------------------------------------------------------------
/src/main/java/org/sbs/goodcrawler/bootstrap/package-info.java:
--------------------------------------------------------------------------------
1 | /**
2 | * ########################## GoodCrawler ############################
3 | * Licensed to the Apache Software Foundation (ASF) under one or more
4 | * contributor license agreements. See the NOTICE file distributed with
5 | * this work for additional information regarding copyright ownership.
6 | * The ASF licenses this file to You under the Apache License, Version 2.0
7 | * (the "License"); you may not use this file except in compliance with
8 | * the License. You may obtain a copy of the License at
9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0
11 | *
12 | * Unless required by applicable law or agreed to in writing, software
13 | * distributed under the License is distributed on an "AS IS" BASIS,
14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | * See the License for the specific language governing permissions and
16 | * limitations under the License.
17 | */
18 | /**
19 | * @author shenbaise(shenbaise@outlook.com)
20 | * @date 2013-7-3
21 | */
22 | package org.sbs.goodcrawler.bootstrap;
--------------------------------------------------------------------------------
/src/main/java/org/sbs/goodcrawler/exception/package-info.java:
--------------------------------------------------------------------------------
1 | /**
2 | * ########################## GoodCrawler ############################
3 | * Licensed to the Apache Software Foundation (ASF) under one or more
4 | * contributor license agreements. See the NOTICE file distributed with
5 | * this work for additional information regarding copyright ownership.
6 | * The ASF licenses this file to You under the Apache License, Version 2.0
7 | * (the "License"); you may not use this file except in compliance with
8 | * the License. You may obtain a copy of the License at
9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0
11 | *
12 | * Unless required by applicable law or agreed to in writing, software
13 | * distributed under the License is distributed on an "AS IS" BASIS,
14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | * See the License for the specific language governing permissions and
16 | * limitations under the License.
17 | */
18 | /**
19 | * @author shenbaise(shenbaise@outlook.com)
20 | * @date 2013-6-30
21 | */
22 | package org.sbs.goodcrawler.exception;
--------------------------------------------------------------------------------
/src/main/java/org/sbs/goodcrawler/extractor/package-info.java:
--------------------------------------------------------------------------------
1 | /**
2 | * ########################## GoodCrawler ############################
3 | * Licensed to the Apache Software Foundation (ASF) under one or more
4 | * contributor license agreements. See the NOTICE file distributed with
5 | * this work for additional information regarding copyright ownership.
6 | * The ASF licenses this file to You under the Apache License, Version 2.0
7 | * (the "License"); you may not use this file except in compliance with
8 | * the License. You may obtain a copy of the License at
9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0
11 | *
12 | * Unless required by applicable law or agreed to in writing, software
13 | * distributed under the License is distributed on an "AS IS" BASIS,
14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | * See the License for the specific language governing permissions and
16 | * limitations under the License.
17 | */
18 | /**
19 | * @author shenbaise(shenbaise@outlook.com)
20 | * @date 2013-7-2
21 | */
22 | package org.sbs.goodcrawler.extractor;
--------------------------------------------------------------------------------
/src/main/java/org/sbs/goodcrawler/fetcher/package-info.java:
--------------------------------------------------------------------------------
1 | /**
2 | * ########################## GoodCrawler ############################
3 | * Licensed to the Apache Software Foundation (ASF) under one or more
4 | * contributor license agreements. See the NOTICE file distributed with
5 | * this work for additional information regarding copyright ownership.
6 | * The ASF licenses this file to You under the Apache License, Version 2.0
7 | * (the "License"); you may not use this file except in compliance with
8 | * the License. You may obtain a copy of the License at
9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0
11 | *
12 | * Unless required by applicable law or agreed to in writing, software
13 | * distributed under the License is distributed on an "AS IS" BASIS,
14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | * See the License for the specific language governing permissions and
16 | * limitations under the License.
17 | */
18 | /**
19 | * @author shenbaise(shenbaise@outlook.com)
20 | * @date 2013-6-30
21 | */
22 | package org.sbs.goodcrawler.fetcher;
--------------------------------------------------------------------------------
/src/main/java/org/sbs/goodcrawler/storage/package-info.java:
--------------------------------------------------------------------------------
1 | /**
2 | * ########################## GoodCrawler ############################
3 | * Licensed to the Apache Software Foundation (ASF) under one or more
4 | * contributor license agreements. See the NOTICE file distributed with
5 | * this work for additional information regarding copyright ownership.
6 | * The ASF licenses this file to You under the Apache License, Version 2.0
7 | * (the "License"); you may not use this file except in compliance with
8 | * the License. You may obtain a copy of the License at
9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0
11 | *
12 | * Unless required by applicable law or agreed to in writing, software
13 | * distributed under the License is distributed on an "AS IS" BASIS,
14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | * See the License for the specific language governing permissions and
16 | * limitations under the License.
17 | */
18 | /**
19 | * @author shenbaise(shenbaise@outlook.com)
20 | * @date 2013-6-29
21 | */
22 | package org.sbs.goodcrawler.storage;
--------------------------------------------------------------------------------
/src/main/java/org/sbs/goodcrawler/plugin/extract/package-info.java:
--------------------------------------------------------------------------------
1 | /**
2 | * ########################## GoodCrawler ############################
3 | * Licensed to the Apache Software Foundation (ASF) under one or more
4 | * contributor license agreements. See the NOTICE file distributed with
5 | * this work for additional information regarding copyright ownership.
6 | * The ASF licenses this file to You under the Apache License, Version 2.0
7 | * (the "License"); you may not use this file except in compliance with
8 | * the License. You may obtain a copy of the License at
9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0
11 | *
12 | * Unless required by applicable law or agreed to in writing, software
13 | * distributed under the License is distributed on an "AS IS" BASIS,
14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | * See the License for the specific language governing permissions and
16 | * limitations under the License.
17 | */
18 | /**
19 | * @author shenbaise(shenbaise@outlook.com)
20 | * @date 2013-7-7
21 | */
22 | package org.sbs.goodcrawler.plugin.extract;
--------------------------------------------------------------------------------
/src/main/java/org/sbs/goodcrawler/plugin/storage/package-info.java:
--------------------------------------------------------------------------------
1 | /**
2 | * ########################## GoodCrawler ############################
3 | * Licensed to the Apache Software Foundation (ASF) under one or more
4 | * contributor license agreements. See the NOTICE file distributed with
5 | * this work for additional information regarding copyright ownership.
6 | * The ASF licenses this file to You under the Apache License, Version 2.0
7 | * (the "License"); you may not use this file except in compliance with
8 | * the License. You may obtain a copy of the License at
9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0
11 | *
12 | * Unless required by applicable law or agreed to in writing, software
13 | * distributed under the License is distributed on an "AS IS" BASIS,
14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | * See the License for the specific language governing permissions and
16 | * limitations under the License.
17 | */
18 | /**
19 | * @author shenbaise(shenbaise@outlook.com)
20 | * @date 2013-7-7
21 | */
22 | package org.sbs.goodcrawler.plugin.storage;
--------------------------------------------------------------------------------
/src/main/java/org/sbs/goodcrawler/bootstrap/foreman/package-info.java:
--------------------------------------------------------------------------------
1 | /**
2 | * ########################## GoodCrawler ############################
3 | * Licensed to the Apache Software Foundation (ASF) under one or more
4 | * contributor license agreements. See the NOTICE file distributed with
5 | * this work for additional information regarding copyright ownership.
6 | * The ASF licenses this file to You under the Apache License, Version 2.0
7 | * (the "License"); you may not use this file except in compliance with
8 | * the License. You may obtain a copy of the License at
9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0
11 | *
12 | * Unless required by applicable law or agreed to in writing, software
13 | * distributed under the License is distributed on an "AS IS" BASIS,
14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | * See the License for the specific language governing permissions and
16 | * limitations under the License.
17 | */
18 | /**
19 | * @author shenbaise(shenbaise@outlook.com)
20 | * @date 2013-7-3
21 | */
22 | package org.sbs.goodcrawler.bootstrap.foreman;
--------------------------------------------------------------------------------
/src/main/java/org/sbs/goodcrawler/extractor/selector/package-info.java:
--------------------------------------------------------------------------------
1 | /**
2 | * ######################## SHENBAISE'S WORK ##########################
3 | * Licensed to the Apache Software Foundation (ASF) under one or more
4 | * contributor license agreements. See the NOTICE file distributed with
5 | * this work for additional information regarding copyright ownership.
6 | * The ASF licenses this file to You under the Apache License, Version 2.0
7 | * (the "License"); you may not use this file except in compliance with
8 | * the License. You may obtain a copy of the License at
9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0
11 | *
12 | * Unless required by applicable law or agreed to in writing, software
13 | * distributed under the License is distributed on an "AS IS" BASIS,
14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | * See the License for the specific language governing permissions and
16 | * limitations under the License.
17 | */
18 | /**
19 | * @author whiteme
20 | * @date 2013年10月11日
21 | * @desc
22 | */
23 | package org.sbs.goodcrawler.extractor.selector;
--------------------------------------------------------------------------------
/src/main/java/org/sbs/goodcrawler/extractor/template/package-info.java:
--------------------------------------------------------------------------------
1 | /**
2 | * ######################## SHENBAISE'S WORK ##########################
3 | * Licensed to the Apache Software Foundation (ASF) under one or more
4 | * contributor license agreements. See the NOTICE file distributed with
5 | * this work for additional information regarding copyright ownership.
6 | * The ASF licenses this file to You under the Apache License, Version 2.0
7 | * (the "License"); you may not use this file except in compliance with
8 | * the License. You may obtain a copy of the License at
9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0
11 | *
12 | * Unless required by applicable law or agreed to in writing, software
13 | * distributed under the License is distributed on an "AS IS" BASIS,
14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | * See the License for the specific language governing permissions and
16 | * limitations under the License.
17 | */
18 | /**
19 | * @author whiteme
20 | * @date 2013年10月13日
21 | * @desc
22 | */
23 | package org.sbs.goodcrawler.extractor.template;
--------------------------------------------------------------------------------
/src/main/java/org/sbs/goodcrawler/extractor/selector/action/package-info.java:
--------------------------------------------------------------------------------
1 | /**
2 | * ######################## SHENBAISE'S WORK ##########################
3 | * Licensed to the Apache Software Foundation (ASF) under one or more
4 | * contributor license agreements. See the NOTICE file distributed with
5 | * this work for additional information regarding copyright ownership.
6 | * The ASF licenses this file to You under the Apache License, Version 2.0
7 | * (the "License"); you may not use this file except in compliance with
8 | * the License. You may obtain a copy of the License at
9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0
11 | *
12 | * Unless required by applicable law or agreed to in writing, software
13 | * distributed under the License is distributed on an "AS IS" BASIS,
14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | * See the License for the specific language governing permissions and
16 | * limitations under the License.
17 | */
18 | /**
19 | * @author whiteme
20 | * @date 2013年10月11日
21 | * @desc
22 | */
23 | package org.sbs.goodcrawler.extractor.selector.action;
--------------------------------------------------------------------------------
/src/main/java/org/sbs/goodcrawler/extractor/selector/exception/package-info.java:
--------------------------------------------------------------------------------
1 | /**
2 | * ######################## SHENBAISE'S WORK ##########################
3 | * Licensed to the Apache Software Foundation (ASF) under one or more
4 | * contributor license agreements. See the NOTICE file distributed with
5 | * this work for additional information regarding copyright ownership.
6 | * The ASF licenses this file to You under the Apache License, Version 2.0
7 | * (the "License"); you may not use this file except in compliance with
8 | * the License. You may obtain a copy of the License at
9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0
11 | *
12 | * Unless required by applicable law or agreed to in writing, software
13 | * distributed under the License is distributed on an "AS IS" BASIS,
14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | * See the License for the specific language governing permissions and
16 | * limitations under the License.
17 | */
18 | /**
19 | * @author whiteme
20 | * @date 2013年10月13日
21 | * @desc
22 | */
23 | package org.sbs.goodcrawler.extractor.selector.exception;
--------------------------------------------------------------------------------
/src/main/java/org/sbs/goodcrawler/extractor/selector/action/list/package-info.java:
--------------------------------------------------------------------------------
1 | /**
2 | * ######################## SHENBAISE'S WORK ##########################
3 | * Licensed to the Apache Software Foundation (ASF) under one or more
4 | * contributor license agreements. See the NOTICE file distributed with
5 | * this work for additional information regarding copyright ownership.
6 | * The ASF licenses this file to You under the Apache License, Version 2.0
7 | * (the "License"); you may not use this file except in compliance with
8 | * the License. You may obtain a copy of the License at
9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0
11 | *
12 | * Unless required by applicable law or agreed to in writing, software
13 | * distributed under the License is distributed on an "AS IS" BASIS,
14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | * See the License for the specific language governing permissions and
16 | * limitations under the License.
17 | */
18 | /**
19 | * @author whiteme
20 | * @date 2013年10月13日
21 | * @desc
22 | */
23 | package org.sbs.goodcrawler.extractor.selector.action.list;
--------------------------------------------------------------------------------
/src/main/java/org/sbs/goodcrawler/extractor/selector/action/string/package-info.java:
--------------------------------------------------------------------------------
1 | /**
2 | * ######################## SHENBAISE'S WORK ##########################
3 | * Licensed to the Apache Software Foundation (ASF) under one or more
4 | * contributor license agreements. See the NOTICE file distributed with
5 | * this work for additional information regarding copyright ownership.
6 | * The ASF licenses this file to You under the Apache License, Version 2.0
7 | * (the "License"); you may not use this file except in compliance with
8 | * the License. You may obtain a copy of the License at
9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0
11 | *
12 | * Unless required by applicable law or agreed to in writing, software
13 | * distributed under the License is distributed on an "AS IS" BASIS,
14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | * See the License for the specific language governing permissions and
16 | * limitations under the License.
17 | */
18 | /**
19 | * @author whiteme
20 | * @date 2013年10月11日
21 | * @desc
22 | */
23 | package org.sbs.goodcrawler.extractor.selector.action.string;
--------------------------------------------------------------------------------
/src/main/java/org/sbs/goodcrawler/extractor/selector/action/integer/package-info.java:
--------------------------------------------------------------------------------
1 | /**
2 | * ######################## SHENBAISE'S WORK ##########################
3 | * Licensed to the Apache Software Foundation (ASF) under one or more
4 | * contributor license agreements. See the NOTICE file distributed with
5 | * this work for additional information regarding copyright ownership.
6 | * The ASF licenses this file to You under the Apache License, Version 2.0
7 | * (the "License"); you may not use this file except in compliance with
8 | * the License. You may obtain a copy of the License at
9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0
11 | *
12 | * Unless required by applicable law or agreed to in writing, software
13 | * distributed under the License is distributed on an "AS IS" BASIS,
14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | * See the License for the specific language governing permissions and
16 | * limitations under the License.
17 | */
18 | /**
19 | * @author whiteme
20 | * @date 2013年10月13日
21 | * @desc
22 | */
23 | package org.sbs.goodcrawler.extractor.selector.action.integer;
--------------------------------------------------------------------------------
/src/main/java/org/sbs/goodcrawler/fetcher/FetcherType.java:
--------------------------------------------------------------------------------
1 | /**
2 | * ######################## SHENBAISE'S WORK ##########################
3 | * Licensed to the Apache Software Foundation (ASF) under one or more
4 | * contributor license agreements. See the NOTICE file distributed with
5 | * this work for additional information regarding copyright ownership.
6 | * The ASF licenses this file to You under the Apache License, Version 2.0
7 | * (the "License"); you may not use this file except in compliance with
8 | * the License. You may obtain a copy of the License at
9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0
11 | *
12 | * Unless required by applicable law or agreed to in writing, software
13 | * distributed under the License is distributed on an "AS IS" BASIS,
14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | * See the License for the specific language governing permissions and
16 | * limitations under the License.
17 | */
18 | package org.sbs.goodcrawler.fetcher;
19 |
/**
 * Identifiers for the available fetcher types.
 *
 * @author whiteme
 * @date 2013-08-03
 */
public enum FetcherType {
    /** The only fetcher type declared so far. */
    defaultFetcher;
}
28 |
--------------------------------------------------------------------------------
/src/main/java/org/sbs/goodcrawler/extractor/GCElement.java:
--------------------------------------------------------------------------------
1 | /**
2 | * ########################## GoodCrawler ############################
3 | * Licensed to the Apache Software Foundation (ASF) under one or more
4 | * contributor license agreements. See the NOTICE file distributed with
5 | * this work for additional information regarding copyright ownership.
6 | * The ASF licenses this file to You under the Apache License, Version 2.0
7 | * (the "License"); you may not use this file except in compliance with
8 | * the License. You may obtain a copy of the License at
9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0
11 | *
12 | * Unless required by applicable law or agreed to in writing, software
13 | * distributed under the License is distributed on an "AS IS" BASIS,
14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | * See the License for the specific language governing permissions and
16 | * limitations under the License.
17 | */
18 | package org.sbs.goodcrawler.extractor;
19 |
/**
 * Element interface. It declares no members.
 *
 * @author shenbaise(shenbaise1001@126.com)
 */
public interface GCElement {
}
27 |
--------------------------------------------------------------------------------
/src/main/java/org/sbs/goodcrawler/plugin/storage/Prepare.java:
--------------------------------------------------------------------------------
1 | /**
2 | * ######################## SHENBAISE'S WORK ##########################
3 | * Licensed to the Apache Software Foundation (ASF) under one or more
4 | * contributor license agreements. See the NOTICE file distributed with
5 | * this work for additional information regarding copyright ownership.
6 | * The ASF licenses this file to You under the Apache License, Version 2.0
7 | * (the "License"); you may not use this file except in compliance with
8 | * the License. You may obtain a copy of the License at
9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0
11 | *
12 | * Unless required by applicable law or agreed to in writing, software
13 | * distributed under the License is distributed on an "AS IS" BASIS,
14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | * See the License for the specific language governing permissions and
16 | * limitations under the License.
17 | */
18 | package org.sbs.goodcrawler.plugin.storage;
19 |
/**
 * Placeholder class with no members yet.
 *
 * <p>The original author's note describes its purpose as "create the index
 * and mapping"; nothing is implemented here.
 *
 * @author whiteme
 * @date 2013-07-28
 */
public class Prepare {
}
28 |
--------------------------------------------------------------------------------
/src/main/java/org/sbs/goodcrawler/extractor/template/ExtractTemplate.java:
--------------------------------------------------------------------------------
1 | /**
2 | * ######################## SHENBAISE'S WORK ##########################
3 | * Licensed to the Apache Software Foundation (ASF) under one or more
4 | * contributor license agreements. See the NOTICE file distributed with
5 | * this work for additional information regarding copyright ownership.
6 | * The ASF licenses this file to You under the Apache License, Version 2.0
7 | * (the "License"); you may not use this file except in compliance with
8 | * the License. You may obtain a copy of the License at
9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0
11 | *
12 | * Unless required by applicable law or agreed to in writing, software
13 | * distributed under the License is distributed on an "AS IS" BASIS,
14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | * See the License for the specific language governing permissions and
16 | * limitations under the License.
17 | */
18 | package org.sbs.goodcrawler.extractor.template;
19 |
/**
 * Placeholder class for an extraction template; it has no members yet.
 *
 * @author whiteme
 * @date 2013-10-13
 */
public class ExtractTemplate {
}
28 |
--------------------------------------------------------------------------------
/src/test/java/org/sbs/ListLinks.java:
--------------------------------------------------------------------------------
1 | package org.sbs;
2 |
3 | import java.io.IOException;
4 |
5 | import org.jsoup.Jsoup;
6 | import org.jsoup.nodes.Document;
7 | import org.jsoup.nodes.Element;
8 | import org.jsoup.safety.Whitelist;
9 | import org.jsoup.select.Elements;
10 |
11 | public class ListLinks {
12 | public static void main(String[] args) throws IOException {
13 | String url = "http://www.66e.cc/bd/20110611/7de42f3f410b41b99f55854553be25e1.htm";
14 |
15 | Document doc = Jsoup.connect(url).get();
16 |
17 | Elements elements = doc.select("p");
18 | for(Element e:elements){
19 | // System.out.println(e.text());
20 | String[] ss = e.toString().split(" ");
21 | for(String s:ss){
22 | System.err.println(Jsoup.clean(s, Whitelist.none()));
23 | }
24 | }
25 |
26 | Elements links = doc.select("a[href^=ftp]");
27 | for(Element e:links){
28 | Elements xxElements = e.select("a[href^=ftp]");
29 | for(Element x:xxElements){
30 | System.out.println(x.text());
31 | }
32 | System.out.println(e.text());
33 | }
34 | }
35 | }
--------------------------------------------------------------------------------
/src/main/java/org/sbs/goodcrawler/fetcher/FetchStatus.java:
--------------------------------------------------------------------------------
1 | /**
2 | * ########################## GoodCrawler ############################
3 | * Licensed to the Apache Software Foundation (ASF) under one or more
4 | * contributor license agreements. See the NOTICE file distributed with
5 | * this work for additional information regarding copyright ownership.
6 | * The ASF licenses this file to You under the Apache License, Version 2.0
7 | * (the "License"); you may not use this file except in compliance with
8 | * the License. You may obtain a copy of the License at
9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0
11 | *
12 | * Unless required by applicable law or agreed to in writing, software
13 | * distributed under the License is distributed on an "AS IS" BASIS,
14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | * See the License for the specific language governing permissions and
16 | * limitations under the License.
17 | */
18 | package org.sbs.goodcrawler.fetcher;
19 |
/**
 * Result of fetching a page.
 *
 * @author shenbaise(shenbaise@outlook.com)
 * @date 2013-6-30
 */
public enum FetchStatus {
    success,
    failed,
    ignored;
}
28 |
--------------------------------------------------------------------------------
/src/main/java/org/sbs/goodcrawler/storage/StorageType.java:
--------------------------------------------------------------------------------
1 | /**
2 | * ########################## GoodCrawler ############################
3 | * Licensed to the Apache Software Foundation (ASF) under one or more
4 | * contributor license agreements. See the NOTICE file distributed with
5 | * this work for additional information regarding copyright ownership.
6 | * The ASF licenses this file to You under the Apache License, Version 2.0
7 | * (the "License"); you may not use this file except in compliance with
8 | * the License. You may obtain a copy of the License at
9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0
11 | *
12 | * Unless required by applicable law or agreed to in writing, software
13 | * distributed under the License is distributed on an "AS IS" BASIS,
14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | * See the License for the specific language governing permissions and
16 | * limitations under the License.
17 | */
18 | package org.sbs.goodcrawler.storage;
19 |
/**
 * Identifiers for the storage types known to the crawler.
 *
 * @author shenbaise(shenbaise@outlook.com)
 * @date 2013-6-29
 */
public enum StorageType {
    LocalFile,
    ElasticSearch,
    Mongodb,
    Hbase,
    Mysql;
}
27 |
--------------------------------------------------------------------------------
/src/main/java/org/sbs/goodcrawler/extractor/ExtractResult.java:
--------------------------------------------------------------------------------
1 | /**
2 | * ########################## GoodCrawler ############################
3 | * Licensed to the Apache Software Foundation (ASF) under one or more
4 | * contributor license agreements. See the NOTICE file distributed with
5 | * this work for additional information regarding copyright ownership.
6 | * The ASF licenses this file to You under the Apache License, Version 2.0
7 | * (the "License"); you may not use this file except in compliance with
8 | * the License. You may obtain a copy of the License at
9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0
11 | *
12 | * Unless required by applicable law or agreed to in writing, software
13 | * distributed under the License is distributed on an "AS IS" BASIS,
14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | * See the License for the specific language governing permissions and
16 | * limitations under the License.
17 | */
18 | package org.sbs.goodcrawler.extractor;
19 |
/**
 * Result of extracting information from a page.
 *
 * @author shenbaise(shenbaise@outlook.com)
 * @date 2013-7-2
 */
public enum ExtractResult {
    success,
    failed,
    ignored;
}
28 |
--------------------------------------------------------------------------------
/src/main/java/org/sbs/goodcrawler/extractor/GCPage.java:
--------------------------------------------------------------------------------
1 | /**
2 | * ########################## GoodCrawler ############################
3 | * Licensed to the Apache Software Foundation (ASF) under one or more
4 | * contributor license agreements. See the NOTICE file distributed with
5 | * this work for additional information regarding copyright ownership.
6 | * The ASF licenses this file to You under the Apache License, Version 2.0
7 | * (the "License"); you may not use this file except in compliance with
8 | * the License. You may obtain a copy of the License at
9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0
11 | *
12 | * Unless required by applicable law or agreed to in writing, software
13 | * distributed under the License is distributed on an "AS IS" BASIS,
14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | * See the License for the specific language governing permissions and
16 | * limitations under the License.
17 | */
18 | package org.sbs.goodcrawler.extractor;
19 |
/**
 * Page interface. It declares no members.
 *
 * <p>Original author's note: GCPage replaces UrlElementCssSelector, and
 * htmlunit's HTMLPage is added alongside it.
 *
 * @author shenbaise(shenbaise1001@126.com)
 */
public interface GCPage {
}
27 |
--------------------------------------------------------------------------------
/src/main/java/org/sbs/goodcrawler/extractor/selector/expression/SimpleExpression.java:
--------------------------------------------------------------------------------
1 | package org.sbs.goodcrawler.extractor.selector.expression;
/**
 * Simple binary test expression over two string operands.
 *
 * <p>Supported operators are {@code =}, {@code !=}, {@code >}, {@code >=},
 * {@code <} and {@code <=}. The equality operators compare the operands as
 * strings; the ordering operators first convert both operands to integers.
 *
 * @author whiteme
 * @date 2013-10-16
 */
public class SimpleExpression {
    String left;
    String right;
    String op;

    /**
     * @param left  left-hand operand
     * @param right right-hand operand
     * @param op    operator symbol, one of {@code = != > >= < <=}
     */
    public SimpleExpression(String left, String right, String op) {
        super();
        this.left = left;
        this.right = right;
        this.op = op;
    }

    /**
     * Evaluates the expression.
     *
     * @return the outcome of applying the operator to the two operands
     * @throws IllegalArgumentException if the operator is not supported
     * @throws NumberFormatException    if an ordering operator is used and an
     *                                  operand is not a valid integer
     * @throws Exception                declared for backward compatibility with
     *                                  existing callers and overriders
     */
    public boolean test() throws Exception {
        if ("=".equals(op)) {
            return operandsEqual();
        }
        if ("!=".equals(op)) {
            return !operandsEqual();
        }

        boolean greater = ">".equals(op);
        boolean greaterOrEqual = ">=".equals(op);
        boolean less = "<".equals(op);
        boolean lessOrEqual = "<=".equals(op);
        if (!(greater || greaterOrEqual || less || lessOrEqual)) {
            // Reject before parsing so an unknown operator is never masked by
            // a NumberFormatException from the operands.
            throw new IllegalArgumentException("无效的表达式:" + op);
        }

        // Ordering operators work on integer values, not on string order.
        int lhs = Integer.parseInt(left);
        int rhs = Integer.parseInt(right);
        if (greater) {
            return lhs > rhs;
        }
        if (greaterOrEqual) {
            return lhs >= rhs;
        }
        if (less) {
            return lhs < rhs;
        }
        return lhs <= rhs;
    }

    /** Null-safe string equality of the two operands (two nulls are equal). */
    private boolean operandsEqual() {
        return left == null ? right == null : left.equals(right);
    }
}
--------------------------------------------------------------------------------
/src/main/java/org/sbs/util/CheckIfUniqueUrl.java:
--------------------------------------------------------------------------------
1 | /**
2 | * ########################## GoodCrawler ############################
3 | * Licensed to the Apache Software Foundation (ASF) under one or more
4 | * contributor license agreements. See the NOTICE file distributed with
5 | * this work for additional information regarding copyright ownership.
6 | * The ASF licenses this file to You under the Apache License, Version 2.0
7 | * (the "License"); you may not use this file except in compliance with
8 | * the License. You may obtain a copy of the License at
9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0
11 | *
12 | * Unless required by applicable law or agreed to in writing, software
13 | * distributed under the License is distributed on an "AS IS" BASIS,
14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | * See the License for the specific language governing permissions and
16 | * limitations under the License.
17 | */
18 | package org.sbs.util;
19 |
/**
 * Used to check whether a URL has already been processed.
 *
 * @author shenbaise(shenbaise@outlook.com)
 * @date 2013-6-29
 */
public interface CheckIfUniqueUrl {

    /**
     * @param url the URL to check
     * @return the implementation's verdict on whether {@code url} is a duplicate
     */
    boolean isDuplicate(String url);
}
28 |
--------------------------------------------------------------------------------
/src/main/java/org/sbs/goodcrawler/extractor/selector/SelectorAttr.java:
--------------------------------------------------------------------------------
1 | /**
2 | * ######################## SHENBAISE'S WORK ##########################
3 | * Licensed to the Apache Software Foundation (ASF) under one or more
4 | * contributor license agreements. See the NOTICE file distributed with
5 | * this work for additional information regarding copyright ownership.
6 | * The ASF licenses this file to You under the Apache License, Version 2.0
7 | * (the "License"); you may not use this file except in compliance with
8 | * the License. You may obtain a copy of the License at
9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0
11 | *
12 | * Unless required by applicable law or agreed to in writing, software
13 | * distributed under the License is distributed on an "AS IS" BASIS,
14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | * See the License for the specific language governing permissions and
16 | * limitations under the License.
17 | */
18 | package org.sbs.goodcrawler.extractor.selector;
19 |
/**
 * Attribute identifiers used by selectors.
 *
 * <p>NOTE(review): the constants look like the element attribute (or
 * pseudo-attribute) a selector reads its value from; confirm against the
 * selector implementations.
 *
 * @author whiteme
 * @date 2013-08-03
 */
public enum SelectorAttr {
    src,
    href,
    text,
    value,
    id,
    title,
    tostring,
    alt,
    other;
}
28 |
--------------------------------------------------------------------------------
/src/main/java/org/sbs/goodcrawler/extractor/selector/action/integer/IntegerActionType.java:
--------------------------------------------------------------------------------
1 | /**
2 | * ######################## SHENBAISE'S WORK ##########################
3 | * Licensed to the Apache Software Foundation (ASF) under one or more
4 | * contributor license agreements. See the NOTICE file distributed with
5 | * this work for additional information regarding copyright ownership.
6 | * The ASF licenses this file to You under the Apache License, Version 2.0
7 | * (the "License"); you may not use this file except in compliance with
8 | * the License. You may obtain a copy of the License at
9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0
11 | *
12 | * Unless required by applicable law or agreed to in writing, software
13 | * distributed under the License is distributed on an "AS IS" BASIS,
14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | * See the License for the specific language governing permissions and
16 | * limitations under the License.
17 | */
18 | package org.sbs.goodcrawler.extractor.selector.action.integer;
19 |
/**
 * Identifiers for the integer selector actions.
 *
 * @author whiteme
 * @date 2013-10-18
 */
public enum IntegerActionType {
    abs, between;
}
29 |
--------------------------------------------------------------------------------
/src/main/java/org/sbs/goodcrawler/extractor/selector/exception/SelectorConfigException.java:
--------------------------------------------------------------------------------
1 | /**
2 | * ######################## SHENBAISE'S WORK ##########################
3 | * Licensed to the Apache Software Foundation (ASF) under one or more
4 | * contributor license agreements. See the NOTICE file distributed with
5 | * this work for additional information regarding copyright ownership.
6 | * The ASF licenses this file to You under the Apache License, Version 2.0
7 | * (the "License"); you may not use this file except in compliance with
8 | * the License. You may obtain a copy of the License at
9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0
11 | *
12 | * Unless required by applicable law or agreed to in writing, software
13 | * distributed under the License is distributed on an "AS IS" BASIS,
14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | * See the License for the specific language governing permissions and
16 | * limitations under the License.
17 | */
18 | package org.sbs.goodcrawler.extractor.selector.exception;
19 |
/**
 * Marks exceptions caused by selector configuration problems.
 *
 * <p>NOTE(review): despite its name this is declared as an interface with no
 * members, not as a {@link Throwable} subclass; confirm that is intended.
 *
 * @author whiteme
 * @date 2013-10-13
 */
public interface SelectorConfigException {
}
28 |
--------------------------------------------------------------------------------
/src/main/java/org/sbs/goodcrawler/extractor/selector/expression/GrExpression.java:
--------------------------------------------------------------------------------
1 | /**
2 | * ######################## SHENBAISE'S WORK ##########################
3 | * Licensed to the Apache Software Foundation (ASF) under one or more
4 | * contributor license agreements. See the NOTICE file distributed with
5 | * this work for additional information regarding copyright ownership.
6 | * The ASF licenses this file to You under the Apache License, Version 2.0
7 | * (the "License"); you may not use this file except in compliance with
8 | * the License. You may obtain a copy of the License at
9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0
11 | *
12 | * Unless required by applicable law or agreed to in writing, software
13 | * distributed under the License is distributed on an "AS IS" BASIS,
14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | * See the License for the specific language governing permissions and
16 | * limitations under the License.
17 | */
18 | package org.sbs.goodcrawler.extractor.selector.expression;
19 |
/**
 * Contract for a simple expression that can be evaluated to a boolean.
 *
 * @author whiteme
 * @date 2013-10-16
 */
public interface GrExpression {

    /**
     * Evaluates this expression.
     *
     * @return the boolean outcome of the evaluation
     */
    boolean test();
}
28 |
--------------------------------------------------------------------------------
/src/main/java/org/sbs/goodcrawler/extractor/htmlelment/HtmlElementType.java:
--------------------------------------------------------------------------------
1 | /**
2 | * ########################## GoodCrawler ############################
3 | * Licensed to the Apache Software Foundation (ASF) under one or more
4 | * contributor license agreements. See the NOTICE file distributed with
5 | * this work for additional information regarding copyright ownership.
6 | * The ASF licenses this file to You under the Apache License, Version 2.0
7 | * (the "License"); you may not use this file except in compliance with
8 | * the License. You may obtain a copy of the License at
9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0
11 | *
12 | * Unless required by applicable law or agreed to in writing, software
13 | * distributed under the License is distributed on an "AS IS" BASIS,
14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | * See the License for the specific language governing permissions and
16 | * limitations under the License.
17 | */
18 | package org.sbs.goodcrawler.extractor.htmlelment;
19 |
/**
 * Names of the HTML element kinds known to the extractor.
 *
 * <p>The declaration order is kept as-is so that ordinals do not change.
 *
 * @author shenbaise(shenbaise1001@126.com)
 */
public enum HtmlElementType {
    htmlAnchor,
    htmlButton,
    htmlEmbed,
    htmlForm,
    htmlImg,
    htmlInput
}
27 |
--------------------------------------------------------------------------------
/src/main/java/org/sbs/goodcrawler/extractor/selector/action/SelectorAction.java:
--------------------------------------------------------------------------------
1 | /**
2 | * ######################## SHENBAISE'S WORK ##########################
3 | * Licensed to the Apache Software Foundation (ASF) under one or more
4 | * contributor license agreements. See the NOTICE file distributed with
5 | * this work for additional information regarding copyright ownership.
6 | * The ASF licenses this file to You under the Apache License, Version 2.0
7 | * (the "License"); you may not use this file except in compliance with
8 | * the License. You may obtain a copy of the License at
9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0
11 | *
12 | * Unless required by applicable law or agreed to in writing, software
13 | * distributed under the License is distributed on an "AS IS" BASIS,
14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | * See the License for the specific language governing permissions and
16 | * limitations under the License.
17 | */
18 | package org.sbs.goodcrawler.extractor.selector.action;
19 |
/**
 * Common super-type for actions that further process the content picked by a
 * selector.
 *
 * <p>The interface itself declares no methods. A generic
 * {@code T doAction(T content)} signature was sketched here at some point but
 * was left commented out and is not part of the contract.
 *
 * @author whiteme
 * @date 2013-08-03
 */
public interface SelectorAction {
}
28 |
--------------------------------------------------------------------------------
/src/main/resources/logback.xml:
--------------------------------------------------------------------------------
1 |
2 |
4 | true
5 |
6 |
7 | logs/gc-log-%d{yyyy-MM-dd}.%i.log
8 |
9 | 1
10 | 3
11 | 10
12 |
14 | 50MB
15 |
16 |
17 |
18 | %date %level [%thread] %logger.%class{0}#%method [%file:%line] %msg%n
19 |
20 |
21 |
22 |
23 | %date %level [%thread] %logger.%class{0}#%method [%file:%line] %msg%n
24 |
25 |
26 |
27 |
28 |
29 |
30 |
31 |
32 |
33 |
--------------------------------------------------------------------------------
/src/main/java/org/sbs/web/ContextListener.java:
--------------------------------------------------------------------------------
1 | package org.sbs.web;
2 |
3 | import javax.servlet.ServletContext;
4 | import javax.servlet.ServletContextEvent;
5 | import javax.servlet.ServletContextListener;
6 | import javax.servlet.annotation.WebListener;
7 |
8 | /**
9 | * Application Lifecycle Listener implementation class ContextListener
10 | *
11 | */
12 | @WebListener
13 | public class ContextListener extends ServletContextEvent implements ServletContextListener {
14 |
15 | private static final long serialVersionUID = -2217410422446016104L;
16 |
17 | /**
18 | * @see ServletContextEvent#ServletContextEvent(ServletContext)
19 | */
20 | public ContextListener(ServletContext source) {
21 | super(source);
22 | // TODO Auto-generated constructor stub
23 | }
24 |
25 | /**
26 | * @see ServletContextListener#contextInitialized(ServletContextEvent)
27 | */
28 | public void contextInitialized(ServletContextEvent sce) {
29 | // TODO Auto-generated method stub
30 | }
31 |
32 | /**
33 | * @see ServletContextListener#contextDestroyed(ServletContextEvent)
34 | */
35 | public void contextDestroyed(ServletContextEvent sce) {
36 | // TODO Auto-generated method stub
37 | }
38 |
39 | }
40 |
--------------------------------------------------------------------------------
/src/main/java/org/sbs/goodcrawler/bootstrap/foreman/Foreman.java:
--------------------------------------------------------------------------------
1 | /**
2 | * ######################## SHENBAISE'S WORK ##########################
3 | * Licensed to the Apache Software Foundation (ASF) under one or more
4 | * contributor license agreements. See the NOTICE file distributed with
5 | * this work for additional information regarding copyright ownership.
6 | * The ASF licenses this file to You under the Apache License, Version 2.0
7 | * (the "License"); you may not use this file except in compliance with
8 | * the License. You may obtain a copy of the License at
9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0
11 | *
12 | * Unless required by applicable law or agreed to in writing, software
13 | * distributed under the License is distributed on an "AS IS" BASIS,
14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | * See the License for the specific language governing permissions and
16 | * limitations under the License.
17 | */
18 | package org.sbs.goodcrawler.bootstrap.foreman;
19 |
20 |
/**
 * Base class for foremen, holding one process-wide stop flag.
 *
 * @author whiteme
 * @date 2013-07-30
 */
public abstract class Foreman {

    /**
     * Process-wide stop request flag; {@code true} once {@link #stop()} has
     * been called (or the field was set directly).
     *
     * <p>Declared {@code volatile} so that a write made by one thread is
     * visible to other threads reading the flag without further locking.
     */
    public static volatile boolean stop = false;

    /**
     * Requests a stop by setting {@link #stop} to {@code true}.
     */
    public static void stop() {
        stop = true;
    }
}
33 |
--------------------------------------------------------------------------------
/src/main/java/org/sbs/goodcrawler/conf/Configuration.java:
--------------------------------------------------------------------------------
1 | /**
2 | * ########################## GoodCrawler ############################
3 | * Licensed to the Apache Software Foundation (ASF) under one or more
4 | * contributor license agreements. See the NOTICE file distributed with
5 | * this work for additional information regarding copyright ownership.
6 | * The ASF licenses this file to You under the Apache License, Version 2.0
7 | * (the "License"); you may not use this file except in compliance with
8 | * the License. You may obtain a copy of the License at
9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0
11 | *
12 | * Unless required by applicable law or agreed to in writing, software
13 | * distributed under the License is distributed on an "AS IS" BASIS,
14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | * See the License for the specific language governing permissions and
16 | * limitations under the License.
17 | */
18 | package org.sbs.goodcrawler.conf;
19 |
/**
 * Base type for configurations.
 *
 * <p>Carries two public, mutable name fields and forces every subclass to
 * supply its own textual representation.
 *
 * @author shenbaise(shenbaise@outlook.com)
 * @date 2013-6-30
 */
public abstract class Configuration {

    /** Name of the job. */
    public String jobName;

    /** Name of the index. */
    public String indexName;

    /**
     * Returns the textual form of this configuration; subclasses must implement it.
     *
     * @return a string describing this configuration
     */
    @Override
    public abstract String toString();
}
32 |
--------------------------------------------------------------------------------
/src/main/java/org/sbs/goodcrawler/extractor/selector/action/IntegerSelectorAction.java:
--------------------------------------------------------------------------------
1 | /**
2 | * ######################## SHENBAISE'S WORK ##########################
3 | * Licensed to the Apache Software Foundation (ASF) under one or more
4 | * contributor license agreements. See the NOTICE file distributed with
5 | * this work for additional information regarding copyright ownership.
6 | * The ASF licenses this file to You under the Apache License, Version 2.0
7 | * (the "License"); you may not use this file except in compliance with
8 | * the License. You may obtain a copy of the License at
9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0
11 | *
12 | * Unless required by applicable law or agreed to in writing, software
13 | * distributed under the License is distributed on an "AS IS" BASIS,
14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | * See the License for the specific language governing permissions and
16 | * limitations under the License.
17 | */
18 | package org.sbs.goodcrawler.extractor.selector.action;
19 |
20 | /**
21 | * @author whiteme
22 | * @date 2013年10月13日
23 | * @desc 整型选择器的处理接口
24 | */
25 | public abstract class IntegerSelectorAction implements SelectorAction {
26 | public abstract int doAction(Integer i);
27 | }
28 |
--------------------------------------------------------------------------------
/src/main/java/org/sbs/goodcrawler/extractor/selector/action/EmptyAction.java:
--------------------------------------------------------------------------------
1 | /**
2 | * ######################## SHENBAISE'S WORK ##########################
3 | * Licensed to the Apache Software Foundation (ASF) under one or more
4 | * contributor license agreements. See the NOTICE file distributed with
5 | * this work for additional information regarding copyright ownership.
6 | * The ASF licenses this file to You under the Apache License, Version 2.0
7 | * (the "License"); you may not use this file except in compliance with
8 | * the License. You may obtain a copy of the License at
9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0
11 | *
12 | * Unless required by applicable law or agreed to in writing, software
13 | * distributed under the License is distributed on an "AS IS" BASIS,
14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | * See the License for the specific language governing permissions and
16 | * limitations under the License.
17 | */
18 | package org.sbs.goodcrawler.extractor.selector.action;
19 |
20 |
21 | /**
22 | * @author whiteme
23 | * @date 2013年10月18日
24 | * @desc 什么也不做
25 | */
26 | public class EmptyAction implements SelectorAction{
27 | public Object doAction(Object content) {
28 | return content;
29 | }
30 | }
31 |
--------------------------------------------------------------------------------
/src/main/java/org/sbs/goodcrawler/extractor/selector/action/StringSelectorAction.java:
--------------------------------------------------------------------------------
1 | /**
2 | * ######################## SHENBAISE'S WORK ##########################
3 | * Licensed to the Apache Software Foundation (ASF) under one or more
4 | * contributor license agreements. See the NOTICE file distributed with
5 | * this work for additional information regarding copyright ownership.
6 | * The ASF licenses this file to You under the Apache License, Version 2.0
7 | * (the "License"); you may not use this file except in compliance with
8 | * the License. You may obtain a copy of the License at
9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0
11 | *
12 | * Unless required by applicable law or agreed to in writing, software
13 | * distributed under the License is distributed on an "AS IS" BASIS,
14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | * See the License for the specific language governing permissions and
16 | * limitations under the License.
17 | */
18 | package org.sbs.goodcrawler.extractor.selector.action;
19 |
20 |
21 | /**
22 | * @author whiteme
23 | * @date 2013年10月11日
24 | * @desc 字符选择器的处理接口
25 | */
26 | public abstract class StringSelectorAction implements SelectorAction {
27 | public abstract String doAction(String content);
28 | }
29 |
--------------------------------------------------------------------------------
/src/main/java/org/sbs/goodcrawler/page/BinaryParseData.java:
--------------------------------------------------------------------------------
1 | /**
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | package org.sbs.goodcrawler.page;
19 |
20 | public class BinaryParseData implements ParseData {
21 |
22 | private static BinaryParseData instance = new BinaryParseData();
23 |
24 | public static BinaryParseData getInstance() {
25 | return instance;
26 | }
27 |
28 | @Override
29 | public String toString() {
30 | return "[Binary parse data can not be dumped as string]";
31 | }
32 | }
33 |
--------------------------------------------------------------------------------
/src/main/java/org/sbs/web/GoodServlet.java:
--------------------------------------------------------------------------------
1 | package org.sbs.web;
2 |
3 | import java.io.IOException;
4 | import javax.servlet.ServletException;
5 | import javax.servlet.annotation.WebServlet;
6 | import javax.servlet.http.HttpServlet;
7 | import javax.servlet.http.HttpServletRequest;
8 | import javax.servlet.http.HttpServletResponse;
9 |
10 | /**
11 | * Servlet implementation class GoodServlet
12 | */
13 | @WebServlet("/gcrawler")
14 | public class GoodServlet extends HttpServlet {
15 | private static final long serialVersionUID = 1L;
16 |
17 | /**
18 | * @see HttpServlet#HttpServlet()
19 | */
20 | public GoodServlet() {
21 | super();
22 | // TODO Auto-generated constructor stub
23 | }
24 |
25 | /**
26 | * @see HttpServlet#doGet(HttpServletRequest request, HttpServletResponse response)
27 | */
28 | protected void doGet(HttpServletRequest request, HttpServletResponse response) throws ServletException, IOException {
29 | // TODO Auto-generated method stub
30 | }
31 |
32 | /**
33 | * @see HttpServlet#doPost(HttpServletRequest request, HttpServletResponse response)
34 | */
35 | protected void doPost(HttpServletRequest request, HttpServletResponse response) throws ServletException, IOException {
36 | // TODO Auto-generated method stub
37 | }
38 |
39 | }
40 |
--------------------------------------------------------------------------------
/src/main/java/org/sbs/goodcrawler/extractor/selector/action/file/FileActionType.java:
--------------------------------------------------------------------------------
1 | /**
2 | * ######################## SHENBAISE'S WORK ##########################
3 | * Licensed to the Apache Software Foundation (ASF) under one or more
4 | * contributor license agreements. See the NOTICE file distributed with
5 | * this work for additional information regarding copyright ownership.
6 | * The ASF licenses this file to You under the Apache License, Version 2.0
7 | * (the "License"); you may not use this file except in compliance with
8 | * the License. You may obtain a copy of the License at
9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0
11 | *
12 | * Unless required by applicable law or agreed to in writing, software
13 | * distributed under the License is distributed on an "AS IS" BASIS,
14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | * See the License for the specific language governing permissions and
16 | * limitations under the License.
17 | */
18 | package org.sbs.goodcrawler.extractor.selector.action.file;
19 |
/**
 * The kinds of processing available for file-type elements.
 *
 * @author whiteme
 * @date 2013-10-22
 */
public enum FileActionType {

    /** Download the file. */
    download,

    /** Download and compress the file; meant for images. */
    download_resize
}
35 |
--------------------------------------------------------------------------------
/src/main/java/org/sbs/url/TLDList.java:
--------------------------------------------------------------------------------
1 | package org.sbs.url;
2 |
3 | import java.io.BufferedReader;
4 | import java.io.InputStream;
5 | import java.io.InputStreamReader;
6 | import java.util.HashSet;
7 | import java.util.Set;
8 |
/**
 * Set of top-level-domain names loaded once from the {@code tld-names.txt}
 * class-path resource.
 *
 * <p>Blank lines and lines starting with {@code //} in the resource are
 * skipped; every other line is stored after trimming.
 */
public class TLDList {

    /** Name of the class-path resource holding one TLD per line. */
    private static final String TLD_NAMES_FILE_NAME = "tld-names.txt";

    /** The single shared instance, created when the class is initialised. */
    private static final TLDList instance = new TLDList();

    /** The TLD names read from the resource. */
    private final Set<String> tldSet = new HashSet<>();

    /**
     * Loads the TLD names from the class path.
     *
     * @throws IllegalStateException if the resource cannot be found. This
     *         replaces the former {@code System.exit(-1)} so that a missing
     *         file fails this class only and does not terminate the whole JVM
     *         (for example a servlet container hosting the application).
     */
    private TLDList() {
        InputStream stream = TLDList.class.getClassLoader().getResourceAsStream(TLD_NAMES_FILE_NAME);
        if (stream == null) {
            throw new IllegalStateException("Couldn't find " + TLD_NAMES_FILE_NAME);
        }

        // try-with-resources closes the reader (and the underlying stream) even
        // when reading fails; the charset is explicit so that decoding does not
        // depend on the platform default.
        try (BufferedReader reader = new BufferedReader(
                new InputStreamReader(stream, java.nio.charset.StandardCharsets.UTF_8))) {
            String line;
            while ((line = reader.readLine()) != null) {
                line = line.trim();
                if (line.isEmpty() || line.startsWith("//")) {
                    continue;
                }
                tldSet.add(line);
            }
        } catch (Exception e) {
            // Best effort, as before: keep whatever was read so far.
            e.printStackTrace();
        }
    }

    /**
     * Returns the shared instance.
     *
     * @return the instance created at class-initialisation time
     */
    public static TLDList getInstance() {
        return instance;
    }

    /**
     * Tells whether the given string is one of the loaded TLD names.
     *
     * @param str the candidate name; compared exactly as given
     * @return {@code true} if {@code str} is in the loaded set
     */
    public boolean contains(String str) {
        return tldSet.contains(str);
    }

}
49 |
--------------------------------------------------------------------------------
/src/main/java/org/sbs/goodcrawler/conf/Configurable.java:
--------------------------------------------------------------------------------
1 | /**
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | package org.sbs.goodcrawler.conf;
19 |
20 |
21 | /**
22 | * @author shenbaise(shenbaise@outlook.com)
23 | * @date 2013-6-30
24 | */
25 | public abstract class Configurable {
26 |
27 | protected static Configuration config;
28 |
29 | protected Configurable(Configuration config) {
30 | Configurable.config = config;
31 | }
32 |
33 | public Configuration getConfig() {
34 | return config;
35 | }
36 | }
37 |
--------------------------------------------------------------------------------
/src/main/java/org/sbs/goodcrawler/extractor/selector/action/ListSelectorAction.java:
--------------------------------------------------------------------------------
1 | /**
2 | * ######################## SHENBAISE'S WORK ##########################
3 | * Licensed to the Apache Software Foundation (ASF) under one or more
4 | * contributor license agreements. See the NOTICE file distributed with
5 | * this work for additional information regarding copyright ownership.
6 | * The ASF licenses this file to You under the Apache License, Version 2.0
7 | * (the "License"); you may not use this file except in compliance with
8 | * the License. You may obtain a copy of the License at
9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0
11 | *
12 | * Unless required by applicable law or agreed to in writing, software
13 | * distributed under the License is distributed on an "AS IS" BASIS,
14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | * See the License for the specific language governing permissions and
16 | * limitations under the License.
17 | */
18 | package org.sbs.goodcrawler.extractor.selector.action;
19 |
20 | import java.util.List;
21 |
22 | /**
23 | * @author whiteme
24 | * @date 2013年10月13日
25 | * @desc 处理list的Action
26 | */
27 | public abstract class ListSelectorAction implements SelectorAction {
28 | public abstract List> doAction(List> list);
29 | }
30 |
--------------------------------------------------------------------------------
/src/main/java/org/sbs/goodcrawler/page/TextParseData.java:
--------------------------------------------------------------------------------
1 | /**
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | package org.sbs.goodcrawler.page;
19 |
20 | public class TextParseData implements ParseData {
21 |
22 | private String textContent;
23 |
24 | public String getTextContent() {
25 | return textContent;
26 | }
27 |
28 | public void setTextContent(String textContent) {
29 | this.textContent = textContent;
30 | }
31 |
32 | @Override
33 | public String toString() {
34 | return textContent;
35 | }
36 |
37 | }
38 |
--------------------------------------------------------------------------------
/src/test/java/org/sbs/extract/Tester.java:
--------------------------------------------------------------------------------
1 | package org.sbs.extract;
2 |
3 | import java.io.File;
4 | import java.io.IOException;
5 | import java.net.URL;
6 | import java.util.Map;
7 |
8 | import org.jsoup.Jsoup;
9 | import org.jsoup.nodes.Document;
10 | import org.sbs.goodcrawler.bootstrap.foreman.FetchForeman;
11 | import org.sbs.goodcrawler.exception.ConfigurationException;
12 | import org.sbs.goodcrawler.exception.ExtractException;
13 | import org.sbs.goodcrawler.jobconf.ExtractConfig;
14 | import org.sbs.goodcrawler.jobconf.FetchConfig;
15 |
16 | public class Tester {
17 |
18 | public void test(String conFile,String url){
19 | ExtractConfig extractConfig = new ExtractConfig();
20 | FetchConfig fetchConfig = new FetchConfig();
21 | Document document;
22 | try {
23 | document = Jsoup.parse(new File(conFile), "utf-8");
24 | System.out.println(extractConfig.loadConfig(document).toString());
25 | FetchForeman fetchForeman = new FetchForeman();
26 | fetchForeman.start(fetchConfig.loadConfig(document));
27 | Map r=extractConfig
28 | .getContentSeprator(Jsoup.parse(new URL(url), 10000),url);
29 | System.out.println(r);
30 | }catch (IOException e) {
31 | e.printStackTrace();
32 | } catch (ConfigurationException e) {
33 | e.printStackTrace();
34 | } catch (ExtractException e) {
35 | e.printStackTrace();
36 | }
37 | }
38 |
39 | }
40 |
--------------------------------------------------------------------------------
/src/main/java/org/sbs/goodcrawler/extractor/selector/action/string/StringActionType.java:
--------------------------------------------------------------------------------
1 | /**
2 | * ######################## SHENBAISE'S WORK ##########################
3 | * Licensed to the Apache Software Foundation (ASF) under one or more
4 | * contributor license agreements. See the NOTICE file distributed with
5 | * this work for additional information regarding copyright ownership.
6 | * The ASF licenses this file to You under the Apache License, Version 2.0
7 | * (the "License"); you may not use this file except in compliance with
8 | * the License. You may obtain a copy of the License at
9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0
11 | *
12 | * Unless required by applicable law or agreed to in writing, software
13 | * distributed under the License is distributed on an "AS IS" BASIS,
14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | * See the License for the specific language governing permissions and
16 | * limitations under the License.
17 | */
18 | package org.sbs.goodcrawler.extractor.selector.action.string;
19 |
/**
 * The kinds of string processing that can be applied.
 *
 * <p>The constant {@code perfix} keeps its existing spelling; constant names
 * and their order are part of the public contract of this enum.
 *
 * @author whiteme
 * @date 2013-10-17
 */
public enum StringActionType {
    after, afterLast,
    before, beforeLast,
    between,
    filter,
    replace,
    split,
    sub,
    suffix,
    perfix
}
38 |
--------------------------------------------------------------------------------
/src/main/java/org/sbs/pendingqueue/PendingPages.java:
--------------------------------------------------------------------------------
1 | /**
2 | * ########################## GoodCrawler ############################
3 | * Licensed to the Apache Software Foundation (ASF) under one or more
4 | * contributor license agreements. See the NOTICE file distributed with
5 | * this work for additional information regarding copyright ownership.
6 | * The ASF licenses this file to You under the Apache License, Version 2.0
7 | * (the "License"); you may not use this file except in compliance with
8 | * the License. You may obtain a copy of the License at
9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0
11 | *
12 | * Unless required by applicable law or agreed to in writing, software
13 | * distributed under the License is distributed on an "AS IS" BASIS,
14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | * See the License for the specific language governing permissions and
16 | * limitations under the License.
17 | */
18 | package org.sbs.pendingqueue;
19 |
20 | import org.sbs.goodcrawler.page.Page;
21 |
22 | /**
23 | * @author shenbaise(shenbaise@outlook.com)
24 | * @date 2013-6-29 等待处理的页面
25 | */
26 | public class PendingPages extends AbsPendingQueue {
27 | private static final long serialVersionUID = -5671808882701246813L;
28 | protected PendingPages(String jobName) {
29 | super(jobName);
30 | }
31 | }
32 |
--------------------------------------------------------------------------------
/src/main/java/org/sbs/util/CheckIfUniqueUrlByMd5.java:
--------------------------------------------------------------------------------
1 | /**
2 | * ########################## GoodCrawler ############################
3 | * Licensed to the Apache Software Foundation (ASF) under one or more
4 | * contributor license agreements. See the NOTICE file distributed with
5 | * this work for additional information regarding copyright ownership.
6 | * The ASF licenses this file to You under the Apache License, Version 2.0
7 | * (the "License"); you may not use this file except in compliance with
8 | * the License. You may obtain a copy of the License at
9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0
11 | *
12 | * Unless required by applicable law or agreed to in writing, software
13 | * distributed under the License is distributed on an "AS IS" BASIS,
14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | * See the License for the specific language governing permissions and
16 | * limitations under the License.
17 | */
18 | package org.sbs.util;
19 |
20 | import org.sbs.url.UrlSignatureSet;
21 |
22 | /**
23 | * @author shenbaise(shenbaise@outlook.com)
24 | * @date 2013-6-29
25 | * 通过检测Url的md5信息比较Url是否重复
26 | */
27 | @Deprecated
28 | public class CheckIfUniqueUrlByMd5 implements CheckIfUniqueUrl{
29 |
30 | @Override
31 | public boolean isDuplicate(String url) {
32 | return UrlSignatureSet.duplicate(url);
33 | }
34 | }
35 |
--------------------------------------------------------------------------------
/src/main/java/org/sbs/goodcrawler/conf/Worker.java:
--------------------------------------------------------------------------------
1 | /**
2 | * ########################## GoodCrawler ############################
3 | * Licensed to the Apache Software Foundation (ASF) under one or more
4 | * contributor license agreements. See the NOTICE file distributed with
5 | * this work for additional information regarding copyright ownership.
6 | * The ASF licenses this file to You under the Apache License, Version 2.0
7 | * (the "License"); you may not use this file except in compliance with
8 | * the License. You may obtain a copy of the License at
9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0
11 | *
12 | * Unless required by applicable law or agreed to in writing, software
13 | * distributed under the License is distributed on an "AS IS" BASIS,
14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | * See the License for the specific language governing permissions and
16 | * limitations under the License.
17 | */
18 | package org.sbs.goodcrawler.conf;
19 |
/**
 * Base class for runnable workers, holding one stop flag shared by all of them.
 *
 * <p>Both accessors take the {@code Worker} class lock, so a stop request made
 * on one thread is seen by the others.
 *
 * @author shenbaise(shenbaise@outlook.com)
 * @date 2013-7-1
 */
public abstract class Worker implements Runnable {

    /** {@code true} once {@link #stop()} has been called. */
    private static boolean stop = false;

    /**
     * Requests that work stop.
     */
    public static void stop() {
        synchronized (Worker.class) {
            stop = true;
        }
    }

    /**
     * Tells whether a stop has been requested.
     *
     * @return {@code true} if {@link #stop()} has been called
     */
    public static boolean isStop() {
        synchronized (Worker.class) {
            return stop;
        }
    }
}
39 |
--------------------------------------------------------------------------------
/src/main/java/org/sbs/pendingqueue/PendRecraw.java:
--------------------------------------------------------------------------------
1 | /**
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 | package org.sbs.pendingqueue;
18 |
19 | import org.sbs.url.WebURL;
20 |
21 |
22 | /**
23 | * @author shenbaise(shenbaise@outlook.com)
24 | * 定时更新的url--更新中的电视剧等
25 | */
26 | public class PendRecraw extends AbsPendingQueue {
27 | private static final long serialVersionUID = -2733220512896685281L;
28 |
29 | protected PendRecraw(String jobName) {
30 | super(jobName);
31 | }
32 |
33 | public static void main(String[] args) {
34 | System.out.println(new PendRecraw("hello").pendingStatus());
35 | }
36 | }
37 |
--------------------------------------------------------------------------------
/src/main/java/org/sbs/goodcrawler/extractor/htmlelment/HtmlPageElement.java:
--------------------------------------------------------------------------------
1 | /**
2 | * ########################## GoodCrawler ############################
3 | * Licensed to the Apache Software Foundation (ASF) under one or more
4 | * contributor license agreements. See the NOTICE file distributed with
5 | * this work for additional information regarding copyright ownership.
6 | * The ASF licenses this file to You under the Apache License, Version 2.0
7 | * (the "License"); you may not use this file except in compliance with
8 | * the License. You may obtain a copy of the License at
9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0
11 | *
12 | * Unless required by applicable law or agreed to in writing, software
13 | * distributed under the License is distributed on an "AS IS" BASIS,
14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | * See the License for the specific language governing permissions and
16 | * limitations under the License.
17 | */
18 | package org.sbs.goodcrawler.extractor.htmlelment;
19 |
20 | import java.lang.ref.WeakReference;
21 |
22 | import org.sbs.goodcrawler.extractor.GCPage;
23 | import org.sbs.goodcrawler.fetcher.AjaxCallFetcher;
24 |
25 | /**
26 | * @author shenbaise(shenbaise1001@126.com)
27 | * @desc
28 | */
29 | public class HtmlPageElement implements GCPage {
30 | private WeakReference fetcher;
31 |
32 |
33 |
34 | }
35 |
--------------------------------------------------------------------------------
/src/main/java/org/sbs/pendingqueue/PendingStore.java:
--------------------------------------------------------------------------------
1 | /**
2 | * ########################## GoodCrawler ############################
3 | * Licensed to the Apache Software Foundation (ASF) under one or more
4 | * contributor license agreements. See the NOTICE file distributed with
5 | * this work for additional information regarding copyright ownership.
6 | * The ASF licenses this file to You under the Apache License, Version 2.0
7 | * (the "License"); you may not use this file except in compliance with
8 | * the License. You may obtain a copy of the License at
9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0
11 | *
12 | * Unless required by applicable law or agreed to in writing, software
13 | * distributed under the License is distributed on an "AS IS" BASIS,
14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | * See the License for the specific language governing permissions and
16 | * limitations under the License.
17 | */
18 | package org.sbs.pendingqueue;
19 |
20 | import org.sbs.goodcrawler.page.ExtractedPage;
21 |
22 | /**
23 | * @author shenbaise(shenbaise@outlook.com)
24 | * @date 2013-6-30
25 | */
26 | @SuppressWarnings("rawtypes")
27 | public class PendingStore extends AbsPendingQueue {
28 |
29 | private static final long serialVersionUID = 7211446103736928404L;
30 |
31 | protected PendingStore(String jobName) {
32 | super(jobName);
33 | }
34 | }
35 |
--------------------------------------------------------------------------------
/src/main/java/org/sbs/goodcrawler/exception/ExtractException.java:
--------------------------------------------------------------------------------
1 | /**
2 | * ######################## SHENBAISE'S WORK ##########################
3 | * Licensed to the Apache Software Foundation (ASF) under one or more
4 | * contributor license agreements. See the NOTICE file distributed with
5 | * this work for additional information regarding copyright ownership.
6 | * The ASF licenses this file to You under the Apache License, Version 2.0
7 | * (the "License"); you may not use this file except in compliance with
8 | * the License. You may obtain a copy of the License at
9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0
11 | *
12 | * Unless required by applicable law or agreed to in writing, software
13 | * distributed under the License is distributed on an "AS IS" BASIS,
14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | * See the License for the specific language governing permissions and
16 | * limitations under the License.
17 | */
18 | package org.sbs.goodcrawler.exception;
19 |
/**
 * Checked exception for failures during extraction.
 *
 * <p>It adds nothing to {@link Exception} beyond its own type, so callers can
 * catch it separately from other failures.
 *
 * @author whiteme
 * @date 2013-10-20
 */
public class ExtractException extends Exception {

    private static final long serialVersionUID = 7761968909463699377L;

    /**
     * Creates an exception with a detail message and no cause.
     *
     * @param message description of the failure
     */
    public ExtractException(String message) {
        super(message);
    }

    /**
     * Creates an exception with a detail message and the underlying cause.
     *
     * @param message description of the failure
     * @param cause   the throwable that triggered this exception
     */
    public ExtractException(String message, Throwable cause) {
        super(message, cause);
    }
}
38 |
--------------------------------------------------------------------------------
/src/main/java/org/sbs/goodcrawler/extractor/selector/action/list/ListFilterAction.java:
--------------------------------------------------------------------------------
1 | /**
2 | * ######################## SHENBAISE'S WORK ##########################
3 | * Licensed to the Apache Software Foundation (ASF) under one or more
4 | * contributor license agreements. See the NOTICE file distributed with
5 | * this work for additional information regarding copyright ownership.
6 | * The ASF licenses this file to You under the Apache License, Version 2.0
7 | * (the "License"); you may not use this file except in compliance with
8 | * the License. You may obtain a copy of the License at
9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0
11 | *
12 | * Unless required by applicable law or agreed to in writing, software
13 | * distributed under the License is distributed on an "AS IS" BASIS,
14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | * See the License for the specific language governing permissions and
16 | * limitations under the License.
17 | */
18 | package org.sbs.goodcrawler.extractor.selector.action.list;
19 |
20 | import java.util.List;
21 |
22 | import org.sbs.goodcrawler.extractor.selector.action.ListSelectorAction;
23 |
24 | /**
25 | * @author whiteme
26 | * @date 2013年10月13日
27 | * @desc
28 | */
29 | public class ListFilterAction extends ListSelectorAction {
30 |
31 | @Override
32 | public List> doAction(List> list) {
33 | return list;
34 | }
35 |
36 | }
37 |
--------------------------------------------------------------------------------
/src/main/java/org/sbs/pendingqueue/PendingUrls.java:
--------------------------------------------------------------------------------
1 | /**
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 | package org.sbs.pendingqueue;
18 |
19 | import org.sbs.url.WebURL;
20 |
21 |
22 | /**
23 | * @author shenbaise(shenbaise@outlook.com)
24 | * @date 2013-6-29
25 | * @desc 待处理的Urls队列
26 | */
27 | public class PendingUrls extends AbsPendingQueue {
28 | private static final long serialVersionUID = -2733220512896685281L;
29 |
30 | protected PendingUrls(String jobName) {
31 | super(jobName);
32 | }
33 |
34 | public static void main(String[] args) {
35 | System.out.println(new PendingUrls("hello").pendingStatus());
36 | }
37 | }
38 |
--------------------------------------------------------------------------------
/src/main/java/org/sbs/goodcrawler/extractor/selector/action/integer/IntegerAbsAction.java:
--------------------------------------------------------------------------------
1 | /**
2 | * ######################## SHENBAISE'S WORK ##########################
3 | * Licensed to the Apache Software Foundation (ASF) under one or more
4 | * contributor license agreements. See the NOTICE file distributed with
5 | * this work for additional information regarding copyright ownership.
6 | * The ASF licenses this file to You under the Apache License, Version 2.0
7 | * (the "License"); you may not use this file except in compliance with
8 | * the License. You may obtain a copy of the License at
9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0
11 | *
12 | * Unless required by applicable law or agreed to in writing, software
13 | * distributed under the License is distributed on an "AS IS" BASIS,
14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | * See the License for the specific language governing permissions and
16 | * limitations under the License.
17 | */
18 | package org.sbs.goodcrawler.extractor.selector.action.integer;
19 |
20 | import org.sbs.goodcrawler.extractor.selector.action.IntegerSelectorAction;
21 |
22 | /**
23 | * @author whiteme
24 | * @date 2013年10月13日
25 | * @desc 求绝对值
26 | */
27 | public class IntegerAbsAction extends IntegerSelectorAction {
28 |
29 | /**
30 | * 求绝对值
31 | */
32 | @Override
33 | public int doAction(Integer i) {
34 | return Math.abs(i);
35 | }
36 |
37 | }
38 |
--------------------------------------------------------------------------------
/src/test/java/org/sbs/T.java:
--------------------------------------------------------------------------------
1 | package org.sbs;
2 |
3 | import java.io.IOException;
4 | import java.util.regex.Matcher;
5 | import java.util.regex.Pattern;
6 |
/**
 * Scratch program: watches standard input on a background thread and shuts
 * the JVM down when the user enters {@code quit}.
 */
public class T {

    /**
     * Starts the console-watching thread and returns immediately.
     *
     * <p>The thread stops on its own when standard input reaches end of
     * stream or fails with an I/O error, instead of spinning forever.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        new Thread(new Runnable() {

            @Override
            public void run() {
                byte[] buffer = new byte[128];
                while (true) {
                    try {
                        int length = System.in.read(buffer);
                        if (length < 0) {
                            // End of stream: nothing more can ever arrive, so
                            // looping again would only burn CPU.
                            return;
                        }
                        if (isQuitCommand(buffer, length)) {
                            System.exit(0);
                        }
                    } catch (IOException e) {
                        // A broken stdin will not recover; report once and stop.
                        e.printStackTrace();
                        return;
                    }
                }
            }
        }).start();
    }

    /**
     * Decides whether the bytes just read spell the {@code quit} command.
     *
     * <p>Only the first {@code length} bytes are decoded, so leftovers from an
     * earlier, longer line still sitting in the buffer cannot leak into the
     * comparison. Line terminators and surrounding whitespace are ignored and
     * the comparison is case-insensitive. Bytes are decoded with the platform
     * default charset, as console input normally is.
     *
     * @param buffer read buffer; only indices {@code [0, length)} are examined
     * @param length number of valid bytes in {@code buffer}, as returned by
     *               {@code InputStream.read(byte[])}
     * @return {@code true} if the decoded text equals "quit"
     */
    static boolean isQuitCommand(byte[] buffer, int length) {
        String input = new String(buffer, 0, length);
        input = input.replace("\n", "").replace("\r", "").trim();
        return input.equalsIgnoreCase("quit");
    }

}
55 |
--------------------------------------------------------------------------------
/src/main/java/org/sbs/goodcrawler/extractor/selector/exception/DownLoadException.java:
--------------------------------------------------------------------------------
1 | /**
2 | * ######################## SHENBAISE'S WORK ##########################
3 | * Licensed to the Apache Software Foundation (ASF) under one or more
4 | * contributor license agreements. See the NOTICE file distributed with
5 | * this work for additional information regarding copyright ownership.
6 | * The ASF licenses this file to You under the Apache License, Version 2.0
7 | * (the "License"); you may not use this file except in compliance with
8 | * the License. You may obtain a copy of the License at
9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0
11 | *
12 | * Unless required by applicable law or agreed to in writing, software
13 | * distributed under the License is distributed on an "AS IS" BASIS,
14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | * See the License for the specific language governing permissions and
16 | * limitations under the License.
17 | */
18 | package org.sbs.goodcrawler.extractor.selector.exception;
19 |
/**
 * Checked exception for file download failures.
 *
 * @author whiteme
 * @date 2013-10-20
 */
public class DownLoadException extends Exception {

    private static final long serialVersionUID = 6548227413938390848L;

    /**
     * Creates an exception with a detail message and no cause.
     *
     * @param message description of the failure
     */
    public DownLoadException(String message) {
        super(message);
    }

    /**
     * Creates an exception with a detail message and the underlying cause.
     *
     * @param message description of the failure
     * @param cause   the throwable that triggered this exception
     */
    public DownLoadException(String message, Throwable cause) {
        super(message, cause);
    }
}
36 |
--------------------------------------------------------------------------------
/src/main/webapp/index.jsp:
--------------------------------------------------------------------------------
1 | <%@ page language="java" contentType="text/html; charset=UTF-8"
2 | pageEncoding="UTF-8"%>
3 |
4 |
5 |
6 |
7 | Goodcrawler Controller Panel
8 |
9 |
10 |
11 |
12 |