├── .gitignore ├── README.md ├── conf ├── fetch.xml ├── letv_conf_temp.xml ├── template.xml ├── test_youku_dm.xml ├── wasu_conf.xml ├── youku_conf.xml └── youku_conf_temp.xml ├── pom.xml ├── setpath.bat ├── src ├── main │ ├── java │ │ └── org │ │ │ └── sbs │ │ │ ├── goodcrawler │ │ │ ├── bootstrap │ │ │ │ ├── BootStrap.java │ │ │ │ ├── CrawlerStatus.java │ │ │ │ ├── foreman │ │ │ │ │ ├── ExtractForeman.java │ │ │ │ │ ├── FetchForeman.java │ │ │ │ │ ├── Foreman.java │ │ │ │ │ ├── StoreForeman.java │ │ │ │ │ └── package-info.java │ │ │ │ └── package-info.java │ │ │ ├── conf │ │ │ │ ├── Configurable.java │ │ │ │ ├── Configuration.java │ │ │ │ ├── GlobalConstants.java │ │ │ │ ├── JobConfigurationManager.java │ │ │ │ ├── PropertyConfigurationHelper.java │ │ │ │ ├── Worker.java │ │ │ │ └── package-info.java │ │ │ ├── exception │ │ │ │ ├── ConfigurationException.java │ │ │ │ ├── ExtractException.java │ │ │ │ ├── QueueException.java │ │ │ │ └── package-info.java │ │ │ ├── extractor │ │ │ │ ├── DefaultExtractWorker.java │ │ │ │ ├── DefaultExtractor.java │ │ │ │ ├── ExtractResult.java │ │ │ │ ├── ExtractWorker.java │ │ │ │ ├── Extractor.java │ │ │ │ ├── GCElement.java │ │ │ │ ├── GCPage.java │ │ │ │ ├── htmlelment │ │ │ │ │ ├── AbstractHtmlElement.java │ │ │ │ │ ├── CommonHtmlElement.java │ │ │ │ │ ├── HtmlAnchorElementOfPage.java │ │ │ │ │ ├── HtmlAnchorElementOfString.java │ │ │ │ │ ├── HtmlElementExtractType.java │ │ │ │ │ ├── HtmlElementType.java │ │ │ │ │ ├── HtmlPageElement.java │ │ │ │ │ └── package-info.java │ │ │ │ ├── package-info.java │ │ │ │ ├── selector │ │ │ │ │ ├── AbstractElementCssSelector.java │ │ │ │ │ ├── DateElementCssSelector.java │ │ │ │ │ ├── FileElementCssSelector.java │ │ │ │ │ ├── IFConditions.java │ │ │ │ │ ├── IntegerElementCssSelector.java │ │ │ │ │ ├── ListElementCssSelector.java │ │ │ │ │ ├── NumericaElementCssSelector.java │ │ │ │ │ ├── PageElementSelector.java │ │ │ │ │ ├── SelectPageElement.java │ │ │ │ │ ├── SelectorAttr.java │ │ │ │ │ ├── 
SelectorType.java │ │ │ │ │ ├── SetElementCssSelector.java │ │ │ │ │ ├── StringElementCssSelector.java │ │ │ │ │ ├── action │ │ │ │ │ │ ├── EmptyAction.java │ │ │ │ │ │ ├── FileSelectAction.java │ │ │ │ │ │ ├── IntegerSelectorAction.java │ │ │ │ │ │ ├── ListSelectorAction.java │ │ │ │ │ │ ├── SelectorAction.java │ │ │ │ │ │ ├── StringSelectorAction.java │ │ │ │ │ │ ├── file │ │ │ │ │ │ │ ├── DownLoadFileAction.java │ │ │ │ │ │ │ ├── DownLoadImageResizeAction.java │ │ │ │ │ │ │ └── FileActionType.java │ │ │ │ │ │ ├── integer │ │ │ │ │ │ │ ├── IntegerAbsAction.java │ │ │ │ │ │ │ ├── IntegerActionType.java │ │ │ │ │ │ │ ├── IntegerBetweenAction.java │ │ │ │ │ │ │ └── package-info.java │ │ │ │ │ │ ├── list │ │ │ │ │ │ │ ├── ListFilterAction.java │ │ │ │ │ │ │ └── package-info.java │ │ │ │ │ │ ├── package-info.java │ │ │ │ │ │ └── string │ │ │ │ │ │ │ ├── ActionFactory.java │ │ │ │ │ │ │ ├── StringActionType.java │ │ │ │ │ │ │ ├── StringAfterAction.java │ │ │ │ │ │ │ ├── StringAfterLastAction.java │ │ │ │ │ │ │ ├── StringBeforeAction.java │ │ │ │ │ │ │ ├── StringBeforeLastAction.java │ │ │ │ │ │ │ ├── StringBetweenAction.java │ │ │ │ │ │ │ ├── StringFilterAction.java │ │ │ │ │ │ │ ├── StringPerfixAction.java │ │ │ │ │ │ │ ├── StringReplaceAction.java │ │ │ │ │ │ │ ├── StringSplitAction.java │ │ │ │ │ │ │ ├── StringSubAction.java │ │ │ │ │ │ │ ├── StringSuffixAction.java │ │ │ │ │ │ │ └── package-info.java │ │ │ │ │ ├── exception │ │ │ │ │ │ ├── DownLoadException.java │ │ │ │ │ │ ├── IntegerBetweenExpressionException.java │ │ │ │ │ │ ├── SelectorConfigException.java │ │ │ │ │ │ └── package-info.java │ │ │ │ │ ├── expression │ │ │ │ │ │ ├── GrExpression.java │ │ │ │ │ │ ├── SimpleExpression.java │ │ │ │ │ │ └── SimpleExpressionExtent.java │ │ │ │ │ ├── factory │ │ │ │ │ │ └── ElementCssSelectorFactory.java │ │ │ │ │ └── package-info.java │ │ │ │ └── template │ │ │ │ │ ├── ExtractTemplate.java │ │ │ │ │ └── package-info.java │ │ │ ├── fetcher │ │ │ │ ├── 
AjaxCallFetcher.java │ │ │ │ ├── CustomFetchStatus.java │ │ │ │ ├── DefaultFetchWorker.java │ │ │ │ ├── FailedPageBackup.java │ │ │ │ ├── FetchStatus.java │ │ │ │ ├── FetchWorker.java │ │ │ │ ├── Fetcher.java │ │ │ │ ├── FetcherInstance.java │ │ │ │ ├── FetcherType.java │ │ │ │ ├── IdleConnectionMonitorThread.java │ │ │ │ ├── PageFetcher.java │ │ │ │ ├── ResynchronizingAjaxController.java │ │ │ │ └── package-info.java │ │ │ ├── jobconf │ │ │ │ ├── ExtractConfig.java │ │ │ │ ├── FetchConfig.java │ │ │ │ ├── JobConfig.java │ │ │ │ ├── StoreConfig.java │ │ │ │ └── package-info.java │ │ │ ├── page │ │ │ │ ├── BinaryParseData.java │ │ │ │ ├── ExtractedPage.java │ │ │ │ ├── ExtractedUrlAnchorPair.java │ │ │ │ ├── HtmlContentHandler.java │ │ │ │ ├── HtmlParseData.java │ │ │ │ ├── Page.java │ │ │ │ ├── PageFetchResult.java │ │ │ │ ├── ParseData.java │ │ │ │ ├── Parser.java │ │ │ │ ├── TextParseData.java │ │ │ │ └── package-info.java │ │ │ ├── plugin │ │ │ │ ├── EsClient.java │ │ │ │ ├── ExBulk.java │ │ │ │ ├── IndexScanner.java │ │ │ │ ├── ReIndex.java │ │ │ │ ├── classloader │ │ │ │ │ ├── CommonClassLoader.java │ │ │ │ │ └── PluginClassLoader.java │ │ │ │ ├── extract │ │ │ │ │ ├── ExtractYouku.java │ │ │ │ │ ├── Extractor66ys.java │ │ │ │ │ ├── ExtractorDytt8.java │ │ │ │ │ └── package-info.java │ │ │ │ ├── package-info.java │ │ │ │ └── storage │ │ │ │ │ ├── ElasticSearchStorage.java │ │ │ │ │ ├── Movie.java │ │ │ │ │ ├── MovieSource.java │ │ │ │ │ ├── Prepare.java │ │ │ │ │ ├── p │ │ │ │ │ ├── IESStoragePlugin.java │ │ │ │ │ └── WasuEsStorePlugin.java │ │ │ │ │ └── package-info.java │ │ │ ├── schedule │ │ │ │ ├── ReCraw.java │ │ │ │ └── RecrawFetherWorkor.java │ │ │ └── storage │ │ │ │ ├── DefaultStoreWorker.java │ │ │ │ ├── LocalFileStorage.java │ │ │ │ ├── Storage.java │ │ │ │ ├── StorageType.java │ │ │ │ ├── StoreResult.java │ │ │ │ ├── StoreWorker.java │ │ │ │ └── package-info.java │ │ │ ├── jetty │ │ │ ├── JettyFactory.java │ │ │ └── StartServer.java │ │ │ ├── 
pendingqueue │ │ │ ├── AbsPendingQueue.java │ │ │ ├── PendRecraw.java │ │ │ ├── PendingManager.java │ │ │ ├── PendingPages.java │ │ │ ├── PendingStore.java │ │ │ ├── PendingUrls.java │ │ │ └── package-info.java │ │ │ ├── robotstxt │ │ │ ├── HostDirectives.java │ │ │ ├── RobotstxtConfig.java │ │ │ ├── RobotstxtParser.java │ │ │ ├── RobotstxtServer.java │ │ │ └── RuleSet.java │ │ │ ├── url │ │ │ ├── TLDList.java │ │ │ ├── URLCanonicalizer.java │ │ │ ├── UlrFilters.java │ │ │ ├── UrlResolver.java │ │ │ ├── UrlSignatureSet.java │ │ │ ├── WebURL.java │ │ │ └── package-info.java │ │ │ ├── util │ │ │ ├── BinaryDateDwonLoader.java │ │ │ ├── BloomFilter.java │ │ │ ├── BloomfilterHelper.java │ │ │ ├── CharUtil.java │ │ │ ├── CheckIfUniqueUrl.java │ │ │ ├── CheckIfUniqueUrlByBloomfilter.java │ │ │ ├── CheckIfUniqueUrlByMd5.java │ │ │ ├── ChineseSpelling.java │ │ │ ├── DateTimeUtil.java │ │ │ ├── EncryptUtils.java │ │ │ ├── IO.java │ │ │ ├── ImageCompress.java │ │ │ ├── ImgUtil.java │ │ │ ├── JsonUtil.java │ │ │ ├── MD5Utils.java │ │ │ ├── MapUtils.java │ │ │ ├── MurmurHash.java │ │ │ ├── PinyinUtil.java │ │ │ ├── RegexList.java │ │ │ ├── Simhash.java │ │ │ ├── StringHelper.java │ │ │ ├── StringUtil.java │ │ │ ├── UrlUtils.java │ │ │ ├── Util.java │ │ │ ├── XmlConverUtil.java │ │ │ ├── download │ │ │ │ ├── DownLoadPool.java │ │ │ │ ├── DownloadInfo.java │ │ │ │ └── MultiThreadDownload.java │ │ │ └── image │ │ │ │ ├── ImageResize.java │ │ │ │ └── ImageResizePool.java │ │ │ └── web │ │ │ ├── ContextListener.java │ │ │ ├── CrawlerManager.java │ │ │ ├── GoodServlet.java │ │ │ ├── Start.java │ │ │ ├── Status.java │ │ │ ├── Stop.java │ │ │ └── package-info.java │ ├── resources │ │ ├── conf.properties │ │ ├── default_mapping.json │ │ ├── job_conf.xml │ │ ├── log4j.xml │ │ ├── logback.xml │ │ ├── mapping.json │ │ ├── tld-names.txt │ │ └── webdefault-windows.xml │ └── webapp │ │ ├── META-INF │ │ └── MANIFEST.MF │ │ ├── WEB-INF │ │ └── web.xml │ │ └── index.jsp └── test │ └── java │ └── 
org │ └── sbs │ ├── AppTest.java │ ├── ListLinks.java │ ├── T.java │ ├── extract │ ├── TestWasu.java │ ├── TestYouku.java │ └── Tester.java │ └── htmlunit │ ├── HtmlUnitTest.java │ ├── element │ ├── GcElementTest.java │ └── package-info.java │ └── package-info.java └── start.bat /.gitignore: -------------------------------------------------------------------------------- 1 | /target 2 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | goodcrawler(web crawler) 网络爬虫 2 | =========== 3 | 4 | ---- standalone version https://github.com/shenbaise/goodcrawler/tree/standalone 5 | 6 | This project is under development. 7 | 8 | It aims to be a good web crawler for Java. 9 | 10 | 11 | LICENSE 12 | ------------------- 13 | Apache License, Version 2.0 14 | http://www.apache.org/licenses/LICENSE-2.0 ( TXT or HTML ) 15 | 16 | shenbaise1001@126.com 17 | -------------------------------------------------------------------------------- /setpath.bat: -------------------------------------------------------------------------------- 1 | @set LOCALCLASSPATH=%LOCALCLASSPATH%;%1 -------------------------------------------------------------------------------- /src/main/java/org/sbs/goodcrawler/bootstrap/CrawlerStatus.java: -------------------------------------------------------------------------------- 1 | /** 2 | * ######################## SHENBAISE'S WORK ########################## 3 | * Licensed to the Apache Software Foundation (ASF) under one or more 4 | * contributor license agreements. See the NOTICE file distributed with 5 | * this work for additional information regarding copyright ownership. 6 | * The ASF licenses this file to You under the Apache License, Version 2.0 7 | * (the "License"); you may not use this file except in compliance with 8 | * the License.
You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | */ 18 | package org.sbs.goodcrawler.bootstrap; 19 | 20 | import org.sbs.pendingqueue.PendingPages; 21 | import org.sbs.pendingqueue.PendingStore; 22 | import org.sbs.pendingqueue.PendingUrls; 23 | 24 | /** 25 | * @author whiteme 26 | * @date 2013-07-31 27 | * @desc Crawler running status. getStatus() currently returns an empty string because the calls that appended the pending-queue statuses are commented out; getJobsNames() delegates to BootStrap.getJobsNames(). 28 | */ 29 | public class CrawlerStatus { 30 | 31 | public static boolean running = false; 32 | 33 | public static String getStatus(){ 34 | StringBuilder sb = new StringBuilder(); 35 | // sb.append(PendingUrls.getInstance().pendingStatus()).append("


") 36 | // .append(PendingPages.getInstace().pendingStatus()).append("


") 37 | // .append(PendingStore.getInstance().pendingStatus()).append("


"); 38 | return sb.toString(); 39 | } 40 | 41 | public static String getJobsNames(){ 42 | return BootStrap.getJobsNames(); 43 | } 44 | } 45 | -------------------------------------------------------------------------------- /src/main/java/org/sbs/goodcrawler/bootstrap/foreman/ExtractForeman.java: -------------------------------------------------------------------------------- 1 | /** 2 | * ########################## GoodCrawler ############################ 3 | * Licensed to the Apache Software Foundation (ASF) under one or more 4 | * contributor license agreements. See the NOTICE file distributed with 5 | * this work for additional information regarding copyright ownership. 6 | * The ASF licenses this file to You under the Apache License, Version 2.0 7 | * (the "License"); you may not use this file except in compliance with 8 | * the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License.
17 | */ 18 | package org.sbs.goodcrawler.bootstrap.foreman; 19 | 20 | import java.util.concurrent.ExecutorService; 21 | import java.util.concurrent.Executors; 22 | 23 | import org.sbs.goodcrawler.extractor.DefaultExtractWorker; 24 | import org.sbs.goodcrawler.extractor.DefaultExtractor; 25 | import org.sbs.goodcrawler.jobconf.ExtractConfig; 26 | 27 | /** 28 | * @author shenbaise(shenbaise@outlook.com) 29 | * @date 2013-7-3 30 | * 提取工工头 31 | */ 32 | public class ExtractForeman extends Foreman{ 33 | 34 | public void start(ExtractConfig conf){ 35 | int threadNum = conf.getThreadNum(); 36 | ExecutorService executor = Executors.newFixedThreadPool(threadNum); 37 | for(int i=0;i doExtract(Page page) { 76 | return extractor.onExtract(page); 77 | } 78 | } 79 | -------------------------------------------------------------------------------- /src/main/java/org/sbs/goodcrawler/extractor/ExtractResult.java: -------------------------------------------------------------------------------- 1 | /** 2 | * ########################## GoodCrawler ############################ 3 | * Licensed to the Apache Software Foundation (ASF) under one or more 4 | * contributor license agreements. See the NOTICE file distributed with 5 | * this work for additional information regarding copyright ownership. 6 | * The ASF licenses this file to You under the Apache License, Version 2.0 7 | * (the "License"); you may not use this file except in compliance with 8 | * the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 
17 | */ 18 | package org.sbs.goodcrawler.extractor; 19 | 20 | /** 21 | * @author shenbaise(shenbaise@outlook.com) 22 | * @date 2013-7-2 23 | * Result of extracting information from a page 24 | */ 25 | public enum ExtractResult { 26 | success,failed,ignored 27 | } 28 | -------------------------------------------------------------------------------- /src/main/java/org/sbs/goodcrawler/extractor/GCElement.java: -------------------------------------------------------------------------------- 1 | /** 2 | * ########################## GoodCrawler ############################ 3 | * Licensed to the Apache Software Foundation (ASF) under one or more 4 | * contributor license agreements. See the NOTICE file distributed with 5 | * this work for additional information regarding copyright ownership. 6 | * The ASF licenses this file to You under the Apache License, Version 2.0 7 | * (the "License"); you may not use this file except in compliance with 8 | * the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | */ 18 | package org.sbs.goodcrawler.extractor; 19 | 20 | /** 21 | * @author shenbaise(shenbaise1001@126.com) 22 | * @desc Marker interface for elements; it declares no members 23 | */ 24 | public interface GCElement { 25 | 26 | } 27 | -------------------------------------------------------------------------------- /src/main/java/org/sbs/goodcrawler/extractor/GCPage.java: -------------------------------------------------------------------------------- 1 | /** 2 | * ########################## GoodCrawler ############################ 3 | * Licensed to the Apache Software Foundation (ASF) under one or more 4 | * contributor license agreements.
See the NOTICE file distributed with 5 | * this work for additional information regarding copyright ownership. 6 | * The ASF licenses this file to You under the Apache License, Version 2.0 7 | * (the "License"); you may not use this file except in compliance with 8 | * the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | */ 18 | package org.sbs.goodcrawler.extractor; 19 | 20 | /** 21 | * @author shenbaise(shenbaise1001@126.com) 22 | * @desc GCPage is used in place of UrlElementCssSelector.
It also adds htmlunit's HTMLPage. 23 | */ 24 | public interface GCPage { 25 | 26 | } 27 | -------------------------------------------------------------------------------- /src/main/java/org/sbs/goodcrawler/extractor/htmlelment/CommonHtmlElement.java: -------------------------------------------------------------------------------- 1 | /** 2 | * ########################## GoodCrawler ############################ 3 | * Licensed to the Apache Software Foundation (ASF) under one or more 4 | * contributor license agreements. See the NOTICE file distributed with 5 | * this work for additional information regarding copyright ownership. 6 | * The ASF licenses this file to You under the Apache License, Version 2.0 7 | * (the "License"); you may not use this file except in compliance with 8 | * the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License.
17 | */ 18 | package org.sbs.goodcrawler.extractor.htmlelment; 19 | 20 | import java.net.URL; 21 | import java.util.HashMap; 22 | import java.util.Map; 23 | 24 | import org.sbs.goodcrawler.fetcher.AjaxCallFetcher; 25 |
/**
 * General-purpose HtmlElement extractor. When the extraction type is xpath it asks
 * {@link AjaxCallFetcher#getElement} for the element addressed by {@code value} on
 * the current {@code page} and caches the answer in {@code content}.
 *
 * <p>The fields {@code page}, {@code newPage}, {@code type}, {@code value},
 * {@code name} and {@code action} are not declared in this class; presumably they
 * are inherited from {@link AbstractHtmlElement} (confirm there).
 *
 * <p>NOTE(review): generic type arguments look as if they were lost from this copy
 * of the source (raw {@code Map}/{@code HashMap}); the raw types are kept as found.
 *
 * @author shenbaise(shenbaise1001@126.com)
 */
public class CommonHtmlElement extends AbstractHtmlElement {

    private AjaxCallFetcher fetch = new AjaxCallFetcher();

    private Object content;

    /**
     * Returns the extracted element, re-using the cached value unless
     * {@code newPage} is set.
     *
     * @return the cached or freshly fetched element, or {@code null} when there is
     *         no page
     * @throws IllegalStateException if the extraction type is not xpath. The
     *         original code created this error but never threw it, so the
     *         misconfiguration was silently reported as {@code null}.
     */
    @Override
    public Object getContent() {
        if (page == null) {
            return null;
        }
        if (null != content && !newPage) {
            return content;
        }
        // Constant-first comparison so a missing type is reported as a
        // configuration error instead of a NullPointerException.
        if (!HtmlElementExtractType.xpath.equals(type)) {
            throw new IllegalStateException("需要使用xpath");
        }
        this.content = fetch.getElement(page, value);
        return this.content;
    }

    /**
     * Gets the URL of the asynchronous (AJAX) call.
     *
     * <p>NOTE(review): the call is only made when content is already cached and
     * {@code newPage} is false; in every other case {@code null} is returned.
     * That guard looks inverted, but callers may rely on it, so it is left as is —
     * confirm the intent.
     *
     * @return the URL reported by {@link AjaxCallFetcher#getAjaxCallUrl}, or
     *         {@code null}
     */
    public URL getAjaxUrl(){
        if (null != content && !newPage) {
            this.content = fetch.getAjaxCallUrl(page, value, type, action);
            return (URL) this.content;
        }
        return null;
    }

    /**
     * Returns the content keyed by this element's {@code name}.
     *
     * @return a single-entry map, or {@code null} when no content is available
     */
    @Override
    public Map getContentMap() {
        if (newPage) {
            // A new page invalidates the cache, so extract again first.
            getContent();
        }
        if (null == content) {
            return null;
        }
        Map m = new HashMap(1);
        m.put(name, this.content);
        return m;
    }
}
-------------------------------------------------------------------------------- /src/main/java/org/sbs/goodcrawler/extractor/htmlelment/HtmlAnchorElementOfPage.java: -------------------------------------------------------------------------------- 1 | /** 2 | * @工程 goodcrawler 3 | * @文件 HtmlAnchorElement.java 4 | * @时间 2013年12月18日 下午5:35:49 5 | * @作者 shenbaise(shenbaise1001@126.com) 6 | * @描述 7 | */ 8 | package org.sbs.goodcrawler.extractor.htmlelment; 9 | 10 | import java.io.IOException; 11 | import java.util.HashMap; 12 | import java.util.Map; 13 | 14 | import
com.gargoylesoftware.htmlunit.Page; 15 | import com.gargoylesoftware.htmlunit.html.HtmlAnchor; 16 |
/**
 * Anchor element whose content is the {@link Page} returned by clicking the anchor
 * that an XPath expression selects on the current page.
 *
 * <p>The fields {@code page}, {@code newPage}, {@code type}, {@code value},
 * {@code name} and {@code webClient} are not declared in this class; presumably
 * they are inherited from {@link AbstractHtmlElement} (confirm there).
 *
 * @author shenbaise(shenbaise1001@126.com)
 */
public class HtmlAnchorElementOfPage extends AbstractHtmlElement{

    private Page content;

    /**
     * Clicks the anchor selected by {@code value}, waits up to three seconds for
     * background JavaScript, and returns the resulting page. A cached page is
     * re-used unless {@code newPage} is set.
     *
     * @return the resulting page; {@code null} when there is no current page, no
     *         anchor matches the XPath, or the click fails with an I/O error
     * @throws IllegalStateException if the extraction type is not xpath. The
     *         original code created this error but never threw it.
     */
    @Override
    public Page getContent() {
        if (page == null) {
            return null;
        }
        if (null != content && !newPage) {
            return content;
        }
        if (!HtmlElementExtractType.xpath.equals(type)) {
            throw new IllegalStateException("需要使用xpath");
        }
        HtmlAnchor anchor = page.getFirstByXPath(value);
        if (anchor == null) {
            // No node matched the XPath; previously this was a NullPointerException.
            return null;
        }
        try {
            Page p = anchor.click();
            webClient.waitForBackgroundJavaScript(1000*3L);
            this.content = p;
            return p;
        } catch (IOException e) {
            // Best effort, as in the original: report the failure and yield null.
            e.printStackTrace();
        }
        return null;
    }

    /**
     * Returns the content keyed by this element's {@code name}.
     *
     * @return a single-entry map, or {@code null} when no content is available
     */
    @Override
    public Map getContentMap() {
        if (newPage) {
            // A new page invalidates the cache, so extract again first.
            getContent();
        }
        if (null == content) {
            return null;
        }
        Map m = new HashMap(1);
        m.put(name, this.content);
        return m;
    }
}
-------------------------------------------------------------------------------- /src/main/java/org/sbs/goodcrawler/extractor/htmlelment/HtmlAnchorElementOfString.java: -------------------------------------------------------------------------------- 1 | /** 2 | * @工程 goodcrawler 3 | * @文件 HtmlAnchorElement.java 4 | * @时间 2013年12月18日 下午5:35:49 5 | * @作者 shenbaise(shenbaise1001@126.com) 6 | * @描述 7 | */ 8 | package org.sbs.goodcrawler.extractor.htmlelment; 9 | 10 | import java.io.IOException; 11 | import java.net.URL; 12 | import java.util.HashMap; 13 | import java.util.Map; 14 | 15 | import org.sbs.goodcrawler.fetcher.ResynchronizingAjaxController; 16 | 17 | import com.gargoylesoftware.htmlunit.AjaxController; 18 | import com.gargoylesoftware.htmlunit.html.DomElement; 19 | import com.gargoylesoftware.htmlunit.html.HtmlAnchor; 20 | 21 | /** 22 | * @author
shenbaise(shenbaise1001@126.com) 23 | * @desc 该类获取由javascript生成的ajax异步调用的地址。 24 | */ 25 | public class HtmlAnchorElementOfString extends AbstractHtmlElement{ 26 | 27 | private String content; 28 | 29 | @Override 30 | public String getContent() { 31 | if(page!=null){ 32 | if (null != content && !newPage) { 33 | return content; 34 | } 35 | if(type .equals(HtmlElementExtractType.id)){ 36 | DomElement d = page.getElementById(value); 37 | d.getAttribute(""); 38 | }else if (type.equals(HtmlElementExtractType.xpath)) { 39 | HtmlAnchor anchor = page.getFirstByXPath(value); 40 | try { 41 | anchor.click(); 42 | AjaxController ac = webClient.getAjaxController(); 43 | if(ac instanceof ResynchronizingAjaxController){ 44 | ResynchronizingAjaxController rac = (ResynchronizingAjaxController)ac; 45 | URL url = rac.getResynchronizedCallUlr(2000); 46 | if(url !=null){ 47 | content = url.toString(); 48 | return content; 49 | } 50 | } 51 | } catch (IOException e) { 52 | e.printStackTrace(); 53 | } 54 | } 55 | } 56 | return null; 57 | } 58 | 59 | @Override 60 | public Map getContentMap() { 61 | if(newPage){ 62 | getContent(); 63 | } 64 | if(null==content) 65 | return null; 66 | Map m = new HashMap(1); 67 | m.put(name, this.content); 68 | return m; 69 | } 70 | } 71 | -------------------------------------------------------------------------------- /src/main/java/org/sbs/goodcrawler/extractor/htmlelment/HtmlElementExtractType.java: -------------------------------------------------------------------------------- 1 | /** 2 | * @工程 goodcrawler 3 | * @文件 HtmlElementExtractType.java 4 | * @时间 2013年12月18日 下午5:13:58 5 | * @作者 shenbaise(shenbaise1001@126.com) 6 | * @描述 7 | */ 8 | package org.sbs.goodcrawler.extractor.htmlelment; 9 | 10 | /** 11 | * @author shenbaise(shenbaise1001@126.com) 12 | * @desc 提取方式,xpath or id 13 | */ 14 | public enum HtmlElementExtractType { 15 | xpath,id 16 | } 17 | -------------------------------------------------------------------------------- 
/src/main/java/org/sbs/goodcrawler/extractor/htmlelment/HtmlElementType.java: -------------------------------------------------------------------------------- 1 | /** 2 | * ########################## GoodCrawler ############################ 3 | * Licensed to the Apache Software Foundation (ASF) under one or more 4 | * contributor license agreements. See the NOTICE file distributed with 5 | * this work for additional information regarding copyright ownership. 6 | * The ASF licenses this file to You under the Apache License, Version 2.0 7 | * (the "License"); you may not use this file except in compliance with 8 | * the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | */ 18 | package org.sbs.goodcrawler.extractor.htmlelment; 19 | 20 | /** 21 | * @author shenbaise(shenbaise1001@126.com) 22 | * @desc Kinds of HTML element: anchor, button, embed, form, img and input 23 | */ 24 | public enum HtmlElementType { 25 | htmlAnchor,htmlButton,htmlEmbed,htmlForm,htmlImg,htmlInput 26 | } 27 | -------------------------------------------------------------------------------- /src/main/java/org/sbs/goodcrawler/extractor/htmlelment/HtmlPageElement.java: -------------------------------------------------------------------------------- 1 | /** 2 | * ########################## GoodCrawler ############################ 3 | * Licensed to the Apache Software Foundation (ASF) under one or more 4 | * contributor license agreements. See the NOTICE file distributed with 5 | * this work for additional information regarding copyright ownership.
6 | * The ASF licenses this file to You under the Apache License, Version 2.0 7 | * (the "License"); you may not use this file except in compliance with 8 | * the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | */ 18 | package org.sbs.goodcrawler.extractor.htmlelment; 19 | 20 | import java.lang.ref.WeakReference; 21 | 22 | import org.sbs.goodcrawler.extractor.GCPage; 23 | import org.sbs.goodcrawler.fetcher.AjaxCallFetcher; 24 | 25 | /** 26 | * @author shenbaise(shenbaise1001@126.com) 27 | * @desc GCPage implementation that so far only declares a weak-reference field named fetcher and has no methods. NOTE(review): the type argument of WeakReference appears to be missing (AjaxCallFetcher is imported but unused) - confirm against the original file. 28 | */ 29 | public class HtmlPageElement implements GCPage { 30 | private WeakReference fetcher; 31 | 32 | 33 | 34 | } 35 | -------------------------------------------------------------------------------- /src/main/java/org/sbs/goodcrawler/extractor/htmlelment/package-info.java: -------------------------------------------------------------------------------- 1 | /** 2 | * @project goodcrawler 3 | * @file package-info.java 4 | * @time 2013-12-16 18:39:16 5 | * @author shenbaise(shenbaise1001@126.com) 6 | * @description 7 | */ 8 | /** 9 | * @author shenbaise(shenbaise1001@126.com) 10 | * @desc Application of the HtmlElement objects that correspond to htmlUnit 11 | */ 12 | package org.sbs.goodcrawler.extractor.htmlelment; -------------------------------------------------------------------------------- /src/main/java/org/sbs/goodcrawler/extractor/package-info.java: -------------------------------------------------------------------------------- 1 | /** 2 | * ########################## GoodCrawler ############################ 3 | * Licensed to the Apache Software Foundation (ASF) under one or more 4 | * contributor license
agreements. See the NOTICE file distributed with 5 | * this work for additional information regarding copyright ownership. 6 | * The ASF licenses this file to You under the Apache License, Version 2.0 7 | * (the "License"); you may not use this file except in compliance with 8 | * the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | */ 18 | /** 19 | * @author shenbaise(shenbaise@outlook.com) 20 | * @date 2013-7-2 21 | */ 22 | package org.sbs.goodcrawler.extractor; -------------------------------------------------------------------------------- /src/main/java/org/sbs/goodcrawler/extractor/selector/SelectPageElement.java: -------------------------------------------------------------------------------- 1 | /** 2 | * @project goodcrawler 3 | * @file PageElement.java 4 | * @time 2013-12-19 17:32:37 5 | * @author shenbaise(shenbaise1001@126.com) 6 | * @description 7 | */ 8 | package org.sbs.goodcrawler.extractor.selector; 9 | 10 | import org.sbs.goodcrawler.extractor.GCPage; 11 | 12 | /** 13 | * @author shenbaise(shenbaise1001@126.com) 14 | * @desc Empty GCPage implementation; it declares no fields or methods 15 | */ 16 | public class SelectPageElement implements GCPage { 17 | 18 | } 19 | -------------------------------------------------------------------------------- /src/main/java/org/sbs/goodcrawler/extractor/selector/SelectorAttr.java: -------------------------------------------------------------------------------- 1 | /** 2 | * ######################## SHENBAISE'S WORK ########################## 3 | * Licensed to the Apache Software Foundation (ASF) under one or more 4 | * contributor license agreements.
See the NOTICE file distributed with 5 | * this work for additional information regarding copyright ownership. 6 | * The ASF licenses this file to You under the Apache License, Version 2.0 7 | * (the "License"); you may not use this file except in compliance with 8 | * the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | */ 18 | package org.sbs.goodcrawler.extractor.selector; 19 | 20 | /** 21 | * @author whiteme 22 | * @date 2013-08-03 23 | * @desc Attribute names used by selectors (src, href, text, value, id, title, tostring, alt, other); presumably the attribute of a selected element to read - confirm against the selector implementations 24 | */ 25 | public enum SelectorAttr { 26 | src,href,text,value,id,title,tostring,alt,other 27 | } 28 | -------------------------------------------------------------------------------- /src/main/java/org/sbs/goodcrawler/extractor/selector/SelectorType.java: -------------------------------------------------------------------------------- 1 | /** 2 | * ######################## SHENBAISE'S WORK ########################## 3 | * Licensed to the Apache Software Foundation (ASF) under one or more 4 | * contributor license agreements. See the NOTICE file distributed with 5 | * this work for additional information regarding copyright ownership. 6 | * The ASF licenses this file to You under the Apache License, Version 2.0 7 | * (the "License"); you may not use this file except in compliance with 8 | * the License.
You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | */ 18 | package org.sbs.goodcrawler.extractor.selector; 19 | 20 | /** 21 | * @author whiteme 22 | * @date 2013-08-03 23 | * @desc Expected type of the element chosen by a selector 24 | */ 25 | public enum SelectorType { 26 | /** 27 | * String; can be matched with a regular expression 28 | */ 29 | $string, 30 | /** 31 | * Integer; may carry constraints such as a range or a size limit 32 | */ 33 | $int, 34 | /** 35 | * Selectors of this type are filled into a list 36 | */ 37 | $list, 38 | /** 39 | * Selectors of this type fill the selected and extracted content into a set 40 | */ 41 | $set, 42 | /** 43 | * A selector of this type indicates that its value is a URL. That URL is fetched and extracted again. 44 | *
Note: only selectors of this type can nest other selectors 45 | */ 46 | $url, 47 | /** 48 | * A selector of this type indicates that its selected content is numeric. 49 | *
This selector carries a format 50 | */ 51 | $numerica, 52 | /** 53 | * A selector of this type indicates that its selected content is a date.
This selector carries a format 54 | */ 55 | $date, 56 | /** 57 | * Dynamic selector that works via ajax 58 | */ 59 | $ajax, 60 | /** 61 | * File 62 | */ 63 | $file 64 | } 65 | -------------------------------------------------------------------------------- /src/main/java/org/sbs/goodcrawler/extractor/selector/action/EmptyAction.java: -------------------------------------------------------------------------------- 1 | /** 2 | * ######################## SHENBAISE'S WORK ########################## 3 | * Licensed to the Apache Software Foundation (ASF) under one or more 4 | * contributor license agreements. See the NOTICE file distributed with 5 | * this work for additional information regarding copyright ownership. 6 | * The ASF licenses this file to You under the Apache License, Version 2.0 7 | * (the "License"); you may not use this file except in compliance with 8 | * the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License.
17 | */ 18 | package org.sbs.goodcrawler.extractor.selector.action; 19 | 20 | 21 | /** 22 | * @author whiteme 23 | * @date 2013年10月18日 24 | * @desc 什么也不做 25 | */ 26 | public class EmptyAction implements SelectorAction{ 27 | public Object doAction(Object content) { 28 | return content; 29 | } 30 | } 31 | -------------------------------------------------------------------------------- /src/main/java/org/sbs/goodcrawler/extractor/selector/action/FileSelectAction.java: -------------------------------------------------------------------------------- 1 | /** 2 | * ######################## SHENBAISE'S WORK ########################## 3 | * Licensed to the Apache Software Foundation (ASF) under one or more 4 | * contributor license agreements. See the NOTICE file distributed with 5 | * this work for additional information regarding copyright ownership. 6 | * The ASF licenses this file to You under the Apache License, Version 2.0 7 | * (the "License"); you may not use this file except in compliance with 8 | * the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 
17 | */ 18 | package org.sbs.goodcrawler.extractor.selector.action; 19 | 20 | import java.util.Map; 21 | 22 | import org.sbs.goodcrawler.extractor.selector.exception.DownLoadException; 23 | 24 | /** 25 | * @author whiteme 26 | * @date 2013年10月20日 27 | * @desc FileSelectAction接口 28 | */ 29 | public abstract class FileSelectAction implements SelectorAction { 30 | /** 31 | * 返回文件下载后的本地路径 32 | * @param remoteFile 33 | * @return 34 | * @throws DownLoadException 35 | */ 36 | public abstract String doAction(Map result,String remoteFile) throws DownLoadException; 37 | } 38 | -------------------------------------------------------------------------------- /src/main/java/org/sbs/goodcrawler/extractor/selector/action/IntegerSelectorAction.java: -------------------------------------------------------------------------------- 1 | /** 2 | * ######################## SHENBAISE'S WORK ########################## 3 | * Licensed to the Apache Software Foundation (ASF) under one or more 4 | * contributor license agreements. See the NOTICE file distributed with 5 | * this work for additional information regarding copyright ownership. 6 | * The ASF licenses this file to You under the Apache License, Version 2.0 7 | * (the "License"); you may not use this file except in compliance with 8 | * the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 
17 | */ 18 | package org.sbs.goodcrawler.extractor.selector.action; 19 | 20 | /** 21 | * @author whiteme 22 | * @date 2013年10月13日 23 | * @desc 整型选择器的处理接口 24 | */ 25 | public abstract class IntegerSelectorAction implements SelectorAction { 26 | public abstract int doAction(Integer i); 27 | } 28 | -------------------------------------------------------------------------------- /src/main/java/org/sbs/goodcrawler/extractor/selector/action/ListSelectorAction.java: -------------------------------------------------------------------------------- 1 | /** 2 | * ######################## SHENBAISE'S WORK ########################## 3 | * Licensed to the Apache Software Foundation (ASF) under one or more 4 | * contributor license agreements. See the NOTICE file distributed with 5 | * this work for additional information regarding copyright ownership. 6 | * The ASF licenses this file to You under the Apache License, Version 2.0 7 | * (the "License"); you may not use this file except in compliance with 8 | * the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 
17 | */ 18 | package org.sbs.goodcrawler.extractor.selector.action; 19 | 20 | import java.util.List; 21 | 22 | /** 23 | * @author whiteme 24 | * @date 2013年10月13日 25 | * @desc 处理list的Action 26 | */ 27 | public abstract class ListSelectorAction implements SelectorAction { 28 | public abstract List doAction(List list); 29 | } 30 | -------------------------------------------------------------------------------- /src/main/java/org/sbs/goodcrawler/extractor/selector/action/SelectorAction.java: -------------------------------------------------------------------------------- 1 | /** 2 | * ######################## SHENBAISE'S WORK ########################## 3 | * Licensed to the Apache Software Foundation (ASF) under one or more 4 | * contributor license agreements. See the NOTICE file distributed with 5 | * this work for additional information regarding copyright ownership. 6 | * The ASF licenses this file to You under the Apache License, Version 2.0 7 | * (the "License"); you may not use this file except in compliance with 8 | * the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 
/**
 * Marker for further processing applied to the content a selector has chosen.
 *
 * <p>The interface declares no methods. A generic signature was sketched at
 * one point but is commented out (kept below for reference).
 *
 * @author whiteme
 * @date 2013-08-03
 */
public interface SelectorAction {
    // Sketch of a common signature, currently disabled:
    // public abstract T doAction(T content);
}
17 | */ 18 | package org.sbs.goodcrawler.extractor.selector.action; 19 | 20 | 21 | /** 22 | * @author whiteme 23 | * @date 2013年10月11日 24 | * @desc 字符选择器的处理接口 25 | */ 26 | public abstract class StringSelectorAction implements SelectorAction { 27 | public abstract String doAction(String content); 28 | } 29 | -------------------------------------------------------------------------------- /src/main/java/org/sbs/goodcrawler/extractor/selector/action/file/FileActionType.java: -------------------------------------------------------------------------------- 1 | /** 2 | * ######################## SHENBAISE'S WORK ########################## 3 | * Licensed to the Apache Software Foundation (ASF) under one or more 4 | * contributor license agreements. See the NOTICE file distributed with 5 | * this work for additional information regarding copyright ownership. 6 | * The ASF licenses this file to You under the Apache License, Version 2.0 7 | * (the "License"); you may not use this file except in compliance with 8 | * the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 
/**
 * The kinds of handling available for file-typed elements.
 *
 * @author whiteme
 * @date 2013-10-22
 */
public enum FileActionType {

    /** Download the file. */
    download,

    /** Download and compress; intended for images. */
    download_resize
}
17 | */ 18 | package org.sbs.goodcrawler.extractor.selector.action.integer; 19 | 20 | import org.sbs.goodcrawler.extractor.selector.action.IntegerSelectorAction; 21 | 22 | /** 23 | * @author whiteme 24 | * @date 2013年10月13日 25 | * @desc 求绝对值 26 | */ 27 | public class IntegerAbsAction extends IntegerSelectorAction { 28 | 29 | /** 30 | * 求绝对值 31 | */ 32 | @Override 33 | public int doAction(Integer i) { 34 | return Math.abs(i); 35 | } 36 | 37 | } 38 | -------------------------------------------------------------------------------- /src/main/java/org/sbs/goodcrawler/extractor/selector/action/integer/IntegerActionType.java: -------------------------------------------------------------------------------- 1 | /** 2 | * ######################## SHENBAISE'S WORK ########################## 3 | * Licensed to the Apache Software Foundation (ASF) under one or more 4 | * contributor license agreements. See the NOTICE file distributed with 5 | * this work for additional information regarding copyright ownership. 6 | * The ASF licenses this file to You under the Apache License, Version 2.0 7 | * (the "License"); you may not use this file except in compliance with 8 | * the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 
/**
 * Identifiers of the processing steps available for integer values.
 *
 * @author whiteme
 * @date 2013-10-18
 */
public enum IntegerActionType {

    /** Absolute value. */
    abs,

    /** Range ("between") handling. */
    between
}
17 | */ 18 | package org.sbs.goodcrawler.extractor.selector.action.integer; 19 | 20 | import org.apache.commons.lang3.StringUtils; 21 | import org.sbs.goodcrawler.extractor.selector.action.IntegerSelectorAction; 22 | import org.sbs.goodcrawler.extractor.selector.exception.IntegerBetweenExpressionException; 23 | 24 | /** 25 | * @author whiteme 26 | * @date 2013年10月13日 27 | * @desc 检测数值是否在某个区间内。如果超过区间则有默认值代替 28 | */ 29 | public class IntegerBetweenAction extends IntegerSelectorAction { 30 | private int max; 31 | private int min; 32 | private int def; 33 | /** 34 | * 构造器 35 | * @param exp 36 | * @param def 37 | * @throws IntegerBetweenExpressionException 38 | */ 39 | public IntegerBetweenAction(String exp,String def) throws IntegerBetweenExpressionException{ 40 | if(StringUtils.isNotBlank(exp)){ 41 | String ss[] = exp.split(","); 42 | if(ss.length!=2){ 43 | throw new IntegerBetweenExpressionException("数值区间表示错误"); 44 | }else { 45 | max = Integer.parseInt(ss[1]); 46 | min = Integer.parseInt(ss[0]); 47 | if(maxmax || i doAction(List list) { 33 | return list; 34 | } 35 | 36 | } 37 | -------------------------------------------------------------------------------- /src/main/java/org/sbs/goodcrawler/extractor/selector/action/list/package-info.java: -------------------------------------------------------------------------------- 1 | /** 2 | * ######################## SHENBAISE'S WORK ########################## 3 | * Licensed to the Apache Software Foundation (ASF) under one or more 4 | * contributor license agreements. See the NOTICE file distributed with 5 | * this work for additional information regarding copyright ownership. 6 | * The ASF licenses this file to You under the Apache License, Version 2.0 7 | * (the "License"); you may not use this file except in compliance with 8 | * the License. 
You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | */ 18 | /** 19 | * @author whiteme 20 | * @date 2013年10月13日 21 | * @desc 22 | */ 23 | package org.sbs.goodcrawler.extractor.selector.action.list; -------------------------------------------------------------------------------- /src/main/java/org/sbs/goodcrawler/extractor/selector/action/package-info.java: -------------------------------------------------------------------------------- 1 | /** 2 | * ######################## SHENBAISE'S WORK ########################## 3 | * Licensed to the Apache Software Foundation (ASF) under one or more 4 | * contributor license agreements. See the NOTICE file distributed with 5 | * this work for additional information regarding copyright ownership. 6 | * The ASF licenses this file to You under the Apache License, Version 2.0 7 | * (the "License"); you may not use this file except in compliance with 8 | * the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 
17 | */ 18 | /** 19 | * @author whiteme 20 | * @date 2013年10月11日 21 | * @desc 22 | */ 23 | package org.sbs.goodcrawler.extractor.selector.action; -------------------------------------------------------------------------------- /src/main/java/org/sbs/goodcrawler/extractor/selector/action/string/StringActionType.java: -------------------------------------------------------------------------------- 1 | /** 2 | * ######################## SHENBAISE'S WORK ########################## 3 | * Licensed to the Apache Software Foundation (ASF) under one or more 4 | * contributor license agreements. See the NOTICE file distributed with 5 | * this work for additional information regarding copyright ownership. 6 | * The ASF licenses this file to You under the Apache License, Version 2.0 7 | * (the "License"); you may not use this file except in compliance with 8 | * the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 
/**
 * Identifiers of the available ways to process a string.
 *
 * <p>Names and order are part of the public contract. The spelling
 * {@code perfix} is kept as-is because renaming the constant would change
 * what {@link #valueOf(String)} accepts.
 *
 * @author whiteme
 * @date 2013-10-17
 */
public enum StringActionType {
    after,
    afterLast,
    before,
    beforeLast,
    between,
    filter,
    replace,
    split,
    sub,
    suffix,
    perfix
}
17 | */ 18 | package org.sbs.goodcrawler.extractor.selector.action.string; 19 | 20 | import org.apache.commons.lang3.StringUtils; 21 | import org.sbs.goodcrawler.extractor.selector.action.StringSelectorAction; 22 | 23 | /** 24 | * @author whiteme 25 | * @date 2013年10月13日 26 | * @desc 截取给定字符串中某个字符串之后的部分 27 | */ 28 | public class StringAfterAction extends StringSelectorAction { 29 | 30 | /** 31 | * 分隔符 32 | */ 33 | private String separator; 34 | /** 35 | * 构造器 36 | * @param separator 37 | */ 38 | public StringAfterAction(String separator){ 39 | this.separator = separator; 40 | } 41 | 42 | @Override 43 | public String doAction(String content) { 44 | if(StringUtils.isNotBlank(content)){ 45 | return StringUtils.substringAfter(content, separator); 46 | } 47 | return ""; 48 | } 49 | 50 | public static void main(String[] args) { 51 | String s = "asdfsfh354^$#^WEEAf "; 52 | StringAfterAction action = new StringAfterAction("^$"); 53 | System.out.println(action.doAction(s)); 54 | } 55 | } 56 | -------------------------------------------------------------------------------- /src/main/java/org/sbs/goodcrawler/extractor/selector/action/string/StringAfterLastAction.java: -------------------------------------------------------------------------------- 1 | /** 2 | * ######################## SHENBAISE'S WORK ########################## 3 | * Licensed to the Apache Software Foundation (ASF) under one or more 4 | * contributor license agreements. See the NOTICE file distributed with 5 | * this work for additional information regarding copyright ownership. 6 | * The ASF licenses this file to You under the Apache License, Version 2.0 7 | * (the "License"); you may not use this file except in compliance with 8 | * the License. 
You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | */ 18 | package org.sbs.goodcrawler.extractor.selector.action.string; 19 | 20 | import org.apache.commons.lang3.StringUtils; 21 | import org.sbs.goodcrawler.extractor.selector.action.StringSelectorAction; 22 | 23 | /** 24 | * @author whiteme 25 | * @date 2013年10月13日 26 | * @desc 截取字符串中最后一个分隔符之后的部分 27 | */ 28 | public class StringAfterLastAction extends StringSelectorAction{ 29 | 30 | private String separator; 31 | 32 | public StringAfterLastAction(String separator){ 33 | this.separator = separator; 34 | } 35 | /** 36 | * 截取content中最后一个separator之后的部分 37 | */ 38 | @Override 39 | public String doAction(String content) { 40 | if(StringUtils.isNotBlank(content)){ 41 | return StringUtils.substringAfterLast(content, separator); 42 | } 43 | return ""; 44 | } 45 | 46 | } 47 | -------------------------------------------------------------------------------- /src/main/java/org/sbs/goodcrawler/extractor/selector/action/string/StringBeforeAction.java: -------------------------------------------------------------------------------- 1 | /** 2 | * ######################## SHENBAISE'S WORK ########################## 3 | * Licensed to the Apache Software Foundation (ASF) under one or more 4 | * contributor license agreements. See the NOTICE file distributed with 5 | * this work for additional information regarding copyright ownership. 6 | * The ASF licenses this file to You under the Apache License, Version 2.0 7 | * (the "License"); you may not use this file except in compliance with 8 | * the License. 
You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | */ 18 | package org.sbs.goodcrawler.extractor.selector.action.string; 19 | 20 | import org.apache.commons.lang3.StringUtils; 21 | import org.sbs.goodcrawler.extractor.selector.action.StringSelectorAction; 22 | 23 | /** 24 | * @author whiteme 25 | * @date 2013年10月13日 26 | * @desc 截取在某个字符(串)之前的部分的action 27 | */ 28 | public class StringBeforeAction extends StringSelectorAction { 29 | /** 30 | * 定位的String 31 | */ 32 | private String separator ; 33 | /** 34 | * 构造器 35 | * @param befString 36 | */ 37 | public StringBeforeAction(String separator){ 38 | this.separator = separator; 39 | } 40 | /** 41 | * 截取在beforeString之前的部分 42 | */ 43 | @Override 44 | public String doAction(String content) { 45 | if(StringUtils.isNotBlank(content)){ 46 | return StringUtils.substringBefore(content, this.separator); 47 | } 48 | return ""; 49 | } 50 | } 51 | -------------------------------------------------------------------------------- /src/main/java/org/sbs/goodcrawler/extractor/selector/action/string/StringBeforeLastAction.java: -------------------------------------------------------------------------------- 1 | /** 2 | * ######################## SHENBAISE'S WORK ########################## 3 | * Licensed to the Apache Software Foundation (ASF) under one or more 4 | * contributor license agreements. See the NOTICE file distributed with 5 | * this work for additional information regarding copyright ownership. 
6 | * The ASF licenses this file to You under the Apache License, Version 2.0 7 | * (the "License"); you may not use this file except in compliance with 8 | * the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | */ 18 | package org.sbs.goodcrawler.extractor.selector.action.string; 19 | 20 | import org.apache.commons.lang3.StringUtils; 21 | import org.sbs.goodcrawler.extractor.selector.action.StringSelectorAction; 22 | 23 | /** 24 | * @author whiteme 25 | * @date 2013年10月13日 26 | * @desc 截取最后一个分隔符之前的字符串 27 | */ 28 | public class StringBeforeLastAction extends StringSelectorAction { 29 | /** 30 | * 分割的字符串 31 | */ 32 | String separator; 33 | /** 34 | * 构造器 35 | * @param separator 36 | */ 37 | public StringBeforeLastAction(String separator){ 38 | this.separator = separator; 39 | } 40 | /** 41 | * 截取最后一个separator之前的字符串 42 | */ 43 | @Override 44 | public String doAction(String content) { 45 | if(StringUtils.isNotBlank(content)){ 46 | return StringUtils.substringBeforeLast(content, separator); 47 | } 48 | return ""; 49 | } 50 | 51 | } 52 | -------------------------------------------------------------------------------- /src/main/java/org/sbs/goodcrawler/extractor/selector/action/string/StringPerfixAction.java: -------------------------------------------------------------------------------- 1 | /** 2 | * ######################## SHENBAISE'S WORK ########################## 3 | * Licensed to the Apache Software Foundation (ASF) under one or more 4 | * contributor license agreements. 
See the NOTICE file distributed with 5 | * this work for additional information regarding copyright ownership. 6 | * The ASF licenses this file to You under the Apache License, Version 2.0 7 | * (the "License"); you may not use this file except in compliance with 8 | * the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | */ 18 | package org.sbs.goodcrawler.extractor.selector.action.string; 19 | 20 | import org.apache.commons.lang3.StringUtils; 21 | import org.sbs.goodcrawler.extractor.selector.action.StringSelectorAction; 22 | 23 | /** 24 | * @author whiteme 25 | * @date 2013年10月29日 26 | * @desc 前加字符串 27 | */ 28 | public class StringPerfixAction extends StringSelectorAction { 29 | /** 30 | * 字符串 31 | */ 32 | String perfix = ""; 33 | 34 | public StringPerfixAction(String perfix){ 35 | this.perfix = perfix; 36 | } 37 | 38 | @Override 39 | public String doAction(String content) { 40 | if(StringUtils.isNotBlank(content)){ 41 | return this.perfix + content; 42 | } 43 | return ""; 44 | } 45 | } 46 | -------------------------------------------------------------------------------- /src/main/java/org/sbs/goodcrawler/extractor/selector/action/string/StringReplaceAction.java: -------------------------------------------------------------------------------- 1 | /** 2 | * ######################## SHENBAISE'S WORK ########################## 3 | * Licensed to the Apache Software Foundation (ASF) under one or more 4 | * contributor license agreements. See the NOTICE file distributed with 5 | * this work for additional information regarding copyright ownership. 
6 | * The ASF licenses this file to You under the Apache License, Version 2.0 7 | * (the "License"); you may not use this file except in compliance with 8 | * the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | */ 18 | package org.sbs.goodcrawler.extractor.selector.action.string; 19 | 20 | import org.apache.commons.lang3.StringUtils; 21 | import org.sbs.goodcrawler.extractor.selector.action.StringSelectorAction; 22 | 23 | /** 24 | * @author whiteme 25 | * @date 2013年10月11日 26 | * @desc 该action对StringSelecto选择的内容按配置进行替换操作。 27 | */ 28 | public class StringReplaceAction extends StringSelectorAction { 29 | 30 | private String searchString; 31 | private String replacement; 32 | 33 | public StringReplaceAction(String searchString, String replacement) { 34 | super(); 35 | this.searchString = searchString; 36 | this.replacement = replacement; 37 | } 38 | 39 | public String getSearchString() { 40 | return searchString; 41 | } 42 | 43 | public void setSearchString(String searchString) { 44 | this.searchString = searchString; 45 | } 46 | 47 | public String getReplacement() { 48 | return replacement; 49 | } 50 | 51 | public void setReplacement(String replacement) { 52 | this.replacement = replacement; 53 | } 54 | 55 | /** 56 | * 根据配置的查找字符和替换字符对抽取内容进行替换操作 57 | */ 58 | @Override 59 | public String doAction(String content) { 60 | return StringUtils.replace(content, searchString, replacement); 61 | } 62 | 63 | public static void main(String[] args) { 64 | String string = "@#$%$FGDFGFGHS#@$$Y"; 65 | StringReplaceAction action = new StringReplaceAction("#", ","); 66 | 67 | 
System.out.println(action.doAction(string)); 68 | } 69 | 70 | } 71 | -------------------------------------------------------------------------------- /src/main/java/org/sbs/goodcrawler/extractor/selector/action/string/StringSuffixAction.java: -------------------------------------------------------------------------------- 1 | /** 2 | * ######################## SHENBAISE'S WORK ########################## 3 | * Licensed to the Apache Software Foundation (ASF) under one or more 4 | * contributor license agreements. See the NOTICE file distributed with 5 | * this work for additional information regarding copyright ownership. 6 | * The ASF licenses this file to You under the Apache License, Version 2.0 7 | * (the "License"); you may not use this file except in compliance with 8 | * the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 
17 | */ 18 | package org.sbs.goodcrawler.extractor.selector.action.string; 19 | 20 | import org.apache.commons.lang3.StringUtils; 21 | import org.sbs.goodcrawler.extractor.selector.action.StringSelectorAction; 22 | 23 | /** 24 | * @author whiteme 25 | * @date 2013年10月29日 26 | * @desc 追加字符串 27 | */ 28 | public class StringSuffixAction extends StringSelectorAction { 29 | 30 | /** 31 | * 追加字符 32 | */ 33 | String suffix = ""; 34 | 35 | 36 | public StringSuffixAction(String suffix){ 37 | this.suffix = suffix; 38 | } 39 | 40 | @Override 41 | public String doAction(String content) { 42 | if(StringUtils.isNotBlank(content)){ 43 | return content + this.suffix; 44 | } 45 | return ""; 46 | } 47 | } 48 | -------------------------------------------------------------------------------- /src/main/java/org/sbs/goodcrawler/extractor/selector/action/string/package-info.java: -------------------------------------------------------------------------------- 1 | /** 2 | * ######################## SHENBAISE'S WORK ########################## 3 | * Licensed to the Apache Software Foundation (ASF) under one or more 4 | * contributor license agreements. See the NOTICE file distributed with 5 | * this work for additional information regarding copyright ownership. 6 | * The ASF licenses this file to You under the Apache License, Version 2.0 7 | * (the "License"); you may not use this file except in compliance with 8 | * the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 
17 | */ 18 | /** 19 | * @author whiteme 20 | * @date 2013年10月11日 21 | * @desc 22 | */ 23 | package org.sbs.goodcrawler.extractor.selector.action.string; -------------------------------------------------------------------------------- /src/main/java/org/sbs/goodcrawler/extractor/selector/exception/DownLoadException.java: -------------------------------------------------------------------------------- 1 | /** 2 | * ######################## SHENBAISE'S WORK ########################## 3 | * Licensed to the Apache Software Foundation (ASF) under one or more 4 | * contributor license agreements. See the NOTICE file distributed with 5 | * this work for additional information regarding copyright ownership. 6 | * The ASF licenses this file to You under the Apache License, Version 2.0 7 | * (the "License"); you may not use this file except in compliance with 8 | * the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 
/**
 * Checked exception describing a file download failure.
 *
 * @author whiteme
 * @date 2013-10-20
 */
public class DownLoadException extends Exception {

    private static final long serialVersionUID = 6548227413938390848L;

    /**
     * @param message description of the failure
     */
    public DownLoadException(String message) {
        super(message);
    }

    /**
     * @param message description of the failure
     * @param cause   the underlying error
     */
    public DownLoadException(String message, Throwable cause) {
        super(message, cause);
    }
}
17 | */ 18 | package org.sbs.goodcrawler.extractor.selector.exception; 19 | 20 | /** 21 | * @author whiteme 22 | * @date 2013年10月13日 23 | * @desc 数值区间表达式异常 24 | */ 25 | public class IntegerBetweenExpressionException extends Exception implements SelectorConfigException{ 26 | private static final long serialVersionUID = 1L; 27 | 28 | public IntegerBetweenExpressionException() { 29 | super(); 30 | } 31 | 32 | public IntegerBetweenExpressionException(String arg0, Throwable arg1) { 33 | super(arg0, arg1); 34 | } 35 | 36 | public IntegerBetweenExpressionException(String arg0) { 37 | super(arg0); 38 | } 39 | 40 | public IntegerBetweenExpressionException(Throwable arg0) { 41 | super(arg0); 42 | } 43 | 44 | } 45 | -------------------------------------------------------------------------------- /src/main/java/org/sbs/goodcrawler/extractor/selector/exception/SelectorConfigException.java: -------------------------------------------------------------------------------- 1 | /** 2 | * ######################## SHENBAISE'S WORK ########################## 3 | * Licensed to the Apache Software Foundation (ASF) under one or more 4 | * contributor license agreements. See the NOTICE file distributed with 5 | * this work for additional information regarding copyright ownership. 6 | * The ASF licenses this file to You under the Apache License, Version 2.0 7 | * (the "License"); you may not use this file except in compliance with 8 | * the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 
/**
 * Marker interface for exceptions caused by a selector configuration problem.
 * It declares no members.
 *
 * @author whiteme
 * @date 2013-10-13
 */
public interface SelectorConfigException {
}
/**
 * Simple binary test expression of the form {@code left op right}.
 * <p>
 * {@code =} and {@code !=} compare the operands as strings; {@code >},
 * {@code >=}, {@code <} and {@code <=} convert both operands to {@code int}
 * before comparing.
 *
 * @author whiteme
 * @date 2013-10-16
 */
public class SimpleExpression {
    String left;
    String right;
    String op;

    /**
     * @param left  left operand
     * @param right right operand
     * @param op    one of {@code =}, {@code !=}, {@code >}, {@code >=}, {@code <}, {@code <=}
     */
    public SimpleExpression(String left, String right, String op) {
        super();
        this.left = left;
        this.right = right;
        this.op = op;
    }

    /**
     * Evaluates the expression.
     *
     * @return the result of the comparison
     * @throws NumberFormatException    if an ordering operator is used and an operand is
     *                                  {@code null} or not a valid integer
     * @throws IllegalArgumentException if the operator is not supported
     * @throws Exception                kept in the signature so existing callers still compile
     */
    public boolean test() throws Exception {
        if ("=".equals(op)) {
            return sameText(left, right);
        } else if (">".equals(op)) {
            return toInt(left) > toInt(right);
        } else if ("!=".equals(op)) {
            return !sameText(left, right);
        } else if (">=".equals(op)) {
            return toInt(left) >= toInt(right);
        } else if ("<".equals(op)) {
            return toInt(left) < toInt(right);
        } else if ("<=".equals(op)) {
            return toInt(left) <= toInt(right);
        }
        // Unchecked and specific instead of a raw Exception; still caught by catch (Exception).
        throw new IllegalArgumentException("无效的表达式:" + op);
    }

    /** Null-safe string equality: two nulls are equal, null never equals a non-null value. */
    private static boolean sameText(String a, String b) {
        return a == null ? b == null : a.equals(b);
    }

    /** Parses an operand as an int, ignoring surrounding whitespace. */
    private static int toInt(String operand) {
        if (operand == null) {
            throw new NumberFormatException("null");
        }
        return Integer.parseInt(operand.trim());
    }
}
17 | */ 18 | package org.sbs.goodcrawler.extractor.selector.expression; 19 | 20 | /** 21 | * @author whiteme 22 | * @date 2013年10月16日 23 | * @desc 简单表达式的扩展 24 | */ 25 | public class SimpleExpressionExtent implements GrExpression{ 26 | 27 | SimpleExpression leftExpression ; 28 | SimpleExpression rightExpression; 29 | String logic; 30 | 31 | public SimpleExpressionExtent(SimpleExpression leftExpression, 32 | SimpleExpression rightExpression, String logic) { 33 | super(); 34 | this.leftExpression = leftExpression; 35 | this.rightExpression = rightExpression; 36 | this.logic = logic; 37 | } 38 | 39 | 40 | public boolean test() { 41 | try { 42 | if("and".equals(logic.toLowerCase())){ 43 | return this.leftExpression.test() && this.rightExpression.test(); 44 | }else { 45 | return this.leftExpression.test() || this.rightExpression.test(); 46 | } 47 | } catch (Exception e) { 48 | e.printStackTrace(); 49 | } 50 | return false; 51 | } 52 | 53 | } 54 | -------------------------------------------------------------------------------- /src/main/java/org/sbs/goodcrawler/extractor/selector/package-info.java: -------------------------------------------------------------------------------- 1 | /** 2 | * ######################## SHENBAISE'S WORK ########################## 3 | * Licensed to the Apache Software Foundation (ASF) under one or more 4 | * contributor license agreements. See the NOTICE file distributed with 5 | * this work for additional information regarding copyright ownership. 6 | * The ASF licenses this file to You under the Apache License, Version 2.0 7 | * (the "License"); you may not use this file except in compliance with 8 | * the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | */ 18 | /** 19 | * @author whiteme 20 | * @date 2013年10月11日 21 | * @desc 22 | */ 23 | package org.sbs.goodcrawler.extractor.selector; -------------------------------------------------------------------------------- /src/main/java/org/sbs/goodcrawler/extractor/template/ExtractTemplate.java: -------------------------------------------------------------------------------- 1 | /** 2 | * ######################## SHENBAISE'S WORK ########################## 3 | * Licensed to the Apache Software Foundation (ASF) under one or more 4 | * contributor license agreements. See the NOTICE file distributed with 5 | * this work for additional information regarding copyright ownership. 6 | * The ASF licenses this file to You under the Apache License, Version 2.0 7 | * (the "License"); you may not use this file except in compliance with 8 | * the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 
/**
 * Placeholder type for extraction templates; it currently declares no members.
 *
 * @author whiteme
 * @date 2013-10-13
 */
public class ExtractTemplate {
}
/**
 * Outcome of fetching a page.
 *
 * @author shenbaise(shenbaise@outlook.com)
 * @date 2013-6-30
 */
public enum FetchStatus {
    success,
    failed,
    ignored
}
15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | */ 18 | package org.sbs.goodcrawler.fetcher; 19 | 20 | import org.sbs.goodcrawler.jobconf.FetchConfig; 21 | import org.sbs.goodcrawler.page.PageFetchResult; 22 | import org.sbs.url.WebURL; 23 | 24 | /** 25 | * @author whiteme 26 | * @date 2013年7月29日 27 | * @desc 28 | */ 29 | public class FetcherInstance { 30 | private static FetchConfig conf = new FetchConfig(); 31 | { 32 | conf.setAgent("ipad"); 33 | conf.setSocketTimeoutMilliseconds(15000); 34 | conf.setConnectionTimeout(5000); 35 | conf.setMaxTotalConnections(10); 36 | conf.setHttps(true); 37 | } 38 | 39 | private static PageFetcher fetcher = null; 40 | 41 | public static PageFetcher getFetcher(){ 42 | if(null==fetcher){ 43 | fetcher = new PageFetcher(conf); 44 | } 45 | return fetcher; 46 | } 47 | 48 | public static void main(String[] args) { 49 | getFetcher(); 50 | WebURL webUrl = new WebURL(); 51 | webUrl.setAnchor("www.wasu.cn/Play/show/id/1220535"); 52 | PageFetchResult r = fetcher.fetchHeader(webUrl); 53 | System.out.println(r.getEntity().toString()); 54 | } 55 | } 56 | -------------------------------------------------------------------------------- /src/main/java/org/sbs/goodcrawler/fetcher/FetcherType.java: -------------------------------------------------------------------------------- 1 | /** 2 | * ######################## SHENBAISE'S WORK ########################## 3 | * Licensed to the Apache Software Foundation (ASF) under one or more 4 | * contributor license agreements. See the NOTICE file distributed with 5 | * this work for additional information regarding copyright ownership. 6 | * The ASF licenses this file to You under the Apache License, Version 2.0 7 | * (the "License"); you may not use this file except in compliance with 8 | * the License. 
/**
 * Kinds of fetcher; only the default one is declared.
 *
 * @author whiteme
 * @date 2013-08-03
 */
public enum FetcherType {
    defaultFetcher
}
/**
 * Thread that repeatedly asks a pooled connection manager to close expired
 * and idle connections until {@link #shutdown()} is called or the thread is
 * interrupted.
 */
public class IdleConnectionMonitorThread extends Thread {

    /** Connection manager whose pool is cleaned up. */
    private final PoolingClientConnectionManager connMgr;
    /** Set by {@link #shutdown()}; checked at the top of every loop iteration. */
    private volatile boolean shutdown;

    /**
     * Creates the thread under the name "Connection Manager".
     *
     * @param connMgr the connection manager to clean up
     */
    public IdleConnectionMonitorThread(PoolingClientConnectionManager connMgr) {
        super("Connection Manager");
        this.connMgr = connMgr;
    }

    /**
     * Loops until shut down: waits up to 5000 ms on this object's monitor, then
     * closes expired and idle connections and prints the number of available
     * connections. An interrupt ends the loop.
     */
    @Override
    public void run() {
        try {
            while (!shutdown) {
                synchronized (this) {
                    // Wakes early when shutdown() calls notifyAll().
                    wait(5000);
                    // Close expired connections
                    connMgr.closeExpiredConnections();
                    // Optionally, close connections
                    // that have been idle longer than 30 sec
                    connMgr.closeIdleConnections(30, TimeUnit.SECONDS);
                    System.out.println("#getAvailable="+connMgr.getTotalStats().getAvailable());
                }
            }
        } catch (InterruptedException ex) {
            // terminate
        }
    }

    /** Requests termination and wakes the thread if it is currently waiting. */
    public void shutdown() {
        shutdown = true;
        synchronized (this) {
            notifyAll();
        }
    }

}
You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | */ 18 | /** 19 | * @author shenbaise(shenbaise@outlook.com) 20 | * @date 2013-6-30 21 | */ 22 | package org.sbs.goodcrawler.fetcher; -------------------------------------------------------------------------------- /src/main/java/org/sbs/goodcrawler/jobconf/JobConfig.java: -------------------------------------------------------------------------------- 1 | /** 2 | * ######################## SHENBAISE'S WORK ########################## 3 | * Licensed to the Apache Software Foundation (ASF) under one or more 4 | * contributor license agreements. See the NOTICE file distributed with 5 | * this work for additional information regarding copyright ownership. 6 | * The ASF licenses this file to You under the Apache License, Version 2.0 7 | * (the "License"); you may not use this file except in compliance with 8 | * the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 
17 | */ 18 | package org.sbs.goodcrawler.jobconf; 19 | 20 | import java.io.File; 21 | import java.io.IOException; 22 | 23 | import org.apache.commons.logging.Log; 24 | import org.apache.commons.logging.LogFactory; 25 | import org.jsoup.Jsoup; 26 | import org.jsoup.nodes.Document; 27 | import org.sbs.goodcrawler.conf.Configuration; 28 | 29 | /** 30 | * @author whiteme 31 | * @date 2013年10月13日 32 | * @desc 33 | */ 34 | public class JobConfig extends Configuration { 35 | 36 | Log log = LogFactory.getLog(JobConfig.class); 37 | /** 38 | * job运行一定时长后自动结束 39 | */ 40 | private int jobTime; 41 | /** 42 | * job抓取一定数量url后自动结束 43 | */ 44 | private int urlNum; 45 | 46 | private FetchConfig fetchConfig; 47 | 48 | private ExtractConfig extractConfig; 49 | 50 | private StoreConfig storeConfig; 51 | 52 | private Document confDoc; 53 | 54 | public void loadConfig(String configFiles){ 55 | try { 56 | this.confDoc = Jsoup.parse(new File("conf/youku_conf.xml"), "utf-8"); 57 | super.jobName = confDoc.select("job name").text(); 58 | super.indexName = confDoc.select("job").attr("indexName"); 59 | this.jobTime = Integer.parseInt(confDoc.select("jobtime").text()); 60 | this.urlNum = Integer.parseInt(confDoc.select("urlNum").text()); 61 | // this.fetchConfig = FetchConfig 62 | // this.extractConfig = ExtractConfig.load(confDoc) 63 | // this.storeConfig = StoreConfig.load(confDoc) 64 | // TODO load fetch 、extract、store config 65 | } catch (IOException e) { 66 | log.fatal(e.getMessage()); 67 | } 68 | } 69 | 70 | @Override 71 | public String toString() { 72 | // TODO Auto-generated method stub 73 | return null; 74 | } 75 | 76 | } 77 | -------------------------------------------------------------------------------- /src/main/java/org/sbs/goodcrawler/jobconf/package-info.java: -------------------------------------------------------------------------------- 1 | /** 2 | * ######################## SHENBAISE'S WORK ########################## 3 | * Licensed to the Apache Software Foundation (ASF) 
under one or more 4 | * contributor license agreements. See the NOTICE file distributed with 5 | * this work for additional information regarding copyright ownership. 6 | * The ASF licenses this file to You under the Apache License, Version 2.0 7 | * (the "License"); you may not use this file except in compliance with 8 | * the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | */ 18 | /** 19 | * @author whiteme 20 | * @date 2013年10月13日 21 | * @desc 22 | */ 23 | package org.sbs.goodcrawler.jobconf; -------------------------------------------------------------------------------- /src/main/java/org/sbs/goodcrawler/page/BinaryParseData.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
/**
 * Mutable holder pairing an extracted link target with its anchor text.
 */
public class ExtractedUrlAnchorPair {

    /** The link target. */
    private String href;
    /** The anchor associated with the link. */
    private String anchor;

    public String getHref() {
        return this.href;
    }

    public void setHref(String href) {
        this.href = href;
    }

    public String getAnchor() {
        return this.anchor;
    }

    public void setAnchor(String anchor) {
        this.anchor = anchor;
    }

}
You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.sbs.goodcrawler.page; 19 | 20 | 21 | 22 | public class HtmlParseData implements ParseData { 23 | 24 | private String html; 25 | private String text; 26 | private String title; 27 | 28 | public String getHtml() { 29 | return html; 30 | } 31 | 32 | public void setHtml(String html) { 33 | this.html = html; 34 | } 35 | 36 | public String getText() { 37 | return text; 38 | } 39 | 40 | public void setText(String text) { 41 | this.text = text; 42 | } 43 | 44 | public String getTitle() { 45 | return title; 46 | } 47 | 48 | public void setTitle(String title) { 49 | this.title = title; 50 | } 51 | 52 | @Override 53 | public String toString() { 54 | return text; 55 | } 56 | 57 | } 58 | -------------------------------------------------------------------------------- /src/main/java/org/sbs/goodcrawler/page/ParseData.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. 
/**
 * Common type of the parse results in this package.
 *
 * <p>The interface declares nothing beyond {@link Object#toString()}; it
 * redeclares that method so the string form is part of the visible
 * contract of every parse result.
 */
public interface ParseData {

    /**
     * @return the string form of this parse result
     */
    @Override
    String toString();
}
16 | */ 17 | 18 | package org.sbs.goodcrawler.page; 19 | 20 | import org.apache.log4j.Logger; 21 | import org.sbs.util.Util; 22 | 23 | 24 | /** 25 | * @author Yasser Ganjisaffar 26 | */ 27 | public class Parser { 28 | 29 | protected static final Logger logger = Logger.getLogger(Parser.class.getName()); 30 | private boolean isFetchBinaryContent; 31 | public Parser(Boolean isFetchBinaryContent) { 32 | this.isFetchBinaryContent = isFetchBinaryContent; 33 | } 34 | 35 | public boolean parse(Page page, String contextURL) { 36 | 37 | if (Util.hasBinaryContent(page.getContentType())) { 38 | if (!this.isFetchBinaryContent) { 39 | return false; 40 | } 41 | 42 | page.setParseData(BinaryParseData.getInstance()); 43 | return true; 44 | 45 | } else if (Util.hasPlainTextContent(page.getContentType())) { 46 | try { 47 | TextParseData parseData = new TextParseData(); 48 | if (page.getContentCharset() == null) { 49 | parseData.setTextContent(new String(page.getContentData())); 50 | } else { 51 | parseData.setTextContent(new String(page.getContentData(), page.getContentCharset())); 52 | } 53 | page.setParseData(parseData); 54 | return true; 55 | } catch (Exception e) { 56 | logger.error(e.getMessage() + ", while parsing: " + page.getWebURL().getURL()); 57 | } 58 | return false; 59 | } 60 | return true; 61 | } 62 | 63 | } 64 | -------------------------------------------------------------------------------- /src/main/java/org/sbs/goodcrawler/page/TextParseData.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. 
You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.sbs.goodcrawler.page; 19 | 20 | public class TextParseData implements ParseData { 21 | 22 | private String textContent; 23 | 24 | public String getTextContent() { 25 | return textContent; 26 | } 27 | 28 | public void setTextContent(String textContent) { 29 | this.textContent = textContent; 30 | } 31 | 32 | @Override 33 | public String toString() { 34 | return textContent; 35 | } 36 | 37 | } 38 | -------------------------------------------------------------------------------- /src/main/java/org/sbs/goodcrawler/page/package-info.java: -------------------------------------------------------------------------------- 1 | /** 2 | * ########################## GoodCrawler ############################ 3 | * Licensed to the Apache Software Foundation (ASF) under one or more 4 | * contributor license agreements. See the NOTICE file distributed with 5 | * this work for additional information regarding copyright ownership. 6 | * The ASF licenses this file to You under the Apache License, Version 2.0 7 | * (the "License"); you may not use this file except in compliance with 8 | * the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | */ 18 | /** 19 | * @author shenbaise(shenbaise@outlook.com) 20 | * @date 2013-6-30 21 | */ 22 | package org.sbs.goodcrawler.page; -------------------------------------------------------------------------------- /src/main/java/org/sbs/goodcrawler/plugin/IndexScanner.java: -------------------------------------------------------------------------------- 1 | /** 2 | * ########################## GoodCrawler ############################ 3 | * Licensed to the Apache Software Foundation (ASF) under one or more 4 | * contributor license agreements. See the NOTICE file distributed with 5 | * this work for additional information regarding copyright ownership. 6 | * The ASF licenses this file to You under the Apache License, Version 2.0 7 | * (the "License"); you may not use this file except in compliance with 8 | * the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 
17 | */ 18 | package org.sbs.goodcrawler.plugin; 19 | 20 | import org.elasticsearch.action.search.SearchResponse; 21 | import org.elasticsearch.action.search.SearchType; 22 | import org.elasticsearch.client.Client; 23 | import org.elasticsearch.common.unit.TimeValue; 24 | import org.elasticsearch.index.query.QueryBuilders; 25 | import org.elasticsearch.search.SearchHit; 26 | 27 | /** 28 | * 29 | * @author shenbaise 30 | * @date 2014年3月8日 31 | * desc: 遍历索引 32 | */ 33 | public abstract class IndexScanner { 34 | /** 35 | * 处理索引文档 36 | * @param hit 37 | */ 38 | public abstract void process(SearchHit hit); 39 | 40 | /** 41 | * 扫描整个索引,每个文档需要process进行加工和进一步操作,例如重建索引等。 42 | * @param index 43 | * @param size 44 | * @param keepAlive 45 | */ 46 | public void scanIndex(String index,int size,int keepAlive){ 47 | Client client = EsClient.getClient(); 48 | SearchResponse scrollResp = client.prepareSearch(index) 49 | .setSearchType(SearchType.SCAN) 50 | .setScroll(new TimeValue(keepAlive)) 51 | .setQuery(QueryBuilders.matchAllQuery()) 52 | .setSize(size).execute().actionGet(); 53 | //100 hits per shard will be returned for each scroll 54 | //Scroll until no hits are returned 55 | while (true) { 56 | scrollResp = client.prepareSearchScroll(scrollResp.getScrollId()).setScroll(new TimeValue(keepAlive)).execute().actionGet(); 57 | for (SearchHit hit : scrollResp.getHits()) { 58 | process(hit); 59 | } 60 | //Break condition: No hits are returned 61 | if (scrollResp.getHits().getHits().length == 0) { 62 | break; 63 | } 64 | } 65 | } 66 | } 67 | -------------------------------------------------------------------------------- /src/main/java/org/sbs/goodcrawler/plugin/ReIndex.java: -------------------------------------------------------------------------------- 1 | /** 2 | * ######################## SHENBAISE'S WORK ########################## 3 | * Licensed to the Apache Software Foundation (ASF) under one or more 4 | * contributor license agreements. 
See the NOTICE file distributed with 5 | * this work for additional information regarding copyright ownership. 6 | * The ASF licenses this file to You under the Apache License, Version 2.0 7 | * (the "License"); you may not use this file except in compliance with 8 | * the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | */ 18 | package org.sbs.goodcrawler.plugin; 19 | 20 | import java.util.Map; 21 | 22 | import org.elasticsearch.search.SearchHit; 23 | import org.sbs.util.MD5Utils; 24 | 25 | /** 26 | * @author Administrator 27 | * @date 2014年3月8日 28 | * desc: 整理文档,重建索引 29 | */ 30 | public class ReIndex extends IndexScanner { 31 | 32 | private String index; 33 | private String type; 34 | 35 | public ReIndex(String index, String type) { 36 | super(); 37 | this.index = index; 38 | this.type = type; 39 | } 40 | 41 | /* (non-Javadoc) 42 | * @see org.sbs.goodcrawler.plugin.IndexScanner#process(org.elasticsearch.search.SearchHit) 43 | */ 44 | @Override 45 | public void process(SearchHit hit) { 46 | Map m = hit.getSource(); 47 | hit.getIndex(); 48 | System.out.println(m); 49 | // TODO 处理文档,进一步加工。 50 | // EsClient.index(index, type, MD5Utils.createMD5((String) m.get("title")), m); 51 | EsClient.delete(index, type, hit.getId()); 52 | } 53 | 54 | 55 | public void deleteOneByOne(String index,String type,String id){ 56 | EsClient.delete(index, type, id); 57 | } 58 | 59 | public static void main(String[] args) { 60 | ReIndex reIndex = new ReIndex("movie", "0"); 61 | reIndex.scanIndex("movie", 100, 60000); 62 | } 63 | } 64 | 
-------------------------------------------------------------------------------- /src/main/java/org/sbs/goodcrawler/plugin/classloader/PluginClassLoader.java: -------------------------------------------------------------------------------- 1 | package org.sbs.goodcrawler.plugin.classloader; 2 | 3 | import java.io.File; 4 | 5 | /** 6 | * @author shenbaise(shenbaise1001@126.com) 7 | * @desc 程序加载器。 8 | */ 9 | public class PluginClassLoader { 10 | // 插件目录 11 | private static String pluginDir = "plugin"; 12 | // Class loader 13 | private static CommonClassLoader classLoader = null; 14 | 15 | /** 16 | * 初始化 17 | * @throws Exception 18 | */ 19 | public static void init() throws Exception{ 20 | if(classLoader==null){ 21 | classLoader = new CommonClassLoader(Thread.currentThread().getContextClassLoader()); 22 | File fPluginDir = new File(pluginDir); 23 | if(!fPluginDir.exists()){ 24 | fPluginDir.mkdir(); 25 | } 26 | File[] fs = fPluginDir.listFiles(); 27 | classLoader.addEntries(fs); 28 | Thread.currentThread().setContextClassLoader(classLoader); 29 | }else { 30 | throw new Exception("类加载器已经初始化,不可重复初始化!"); 31 | } 32 | } 33 | 34 | public static synchronized void addPlugin(File f){ 35 | classLoader.addEntry(f); 36 | } 37 | 38 | public static synchronized void removePlugin(File f){ 39 | classLoader.removeEntry(f); 40 | } 41 | 42 | /** 43 | * 加载类 44 | * @param className 45 | * @return 46 | */ 47 | public static Class loadClass(String className){ 48 | try { 49 | return classLoader.loadClass(className, true); 50 | } catch (ClassNotFoundException e) { 51 | e.printStackTrace(); 52 | } 53 | return null; 54 | } 55 | 56 | } -------------------------------------------------------------------------------- /src/main/java/org/sbs/goodcrawler/plugin/extract/package-info.java: -------------------------------------------------------------------------------- 1 | /** 2 | * ########################## GoodCrawler ############################ 3 | * Licensed to the Apache Software Foundation (ASF) 
under one or more 4 | * contributor license agreements. See the NOTICE file distributed with 5 | * this work for additional information regarding copyright ownership. 6 | * The ASF licenses this file to You under the Apache License, Version 2.0 7 | * (the "License"); you may not use this file except in compliance with 8 | * the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | */ 18 | /** 19 | * @author shenbaise(shenbaise@outlook.com) 20 | * @date 2013-7-7 21 | */ 22 | package org.sbs.goodcrawler.plugin.extract; -------------------------------------------------------------------------------- /src/main/java/org/sbs/goodcrawler/plugin/package-info.java: -------------------------------------------------------------------------------- 1 | /** 2 | * ########################## GoodCrawler ############################ 3 | * Licensed to the Apache Software Foundation (ASF) under one or more 4 | * contributor license agreements. See the NOTICE file distributed with 5 | * this work for additional information regarding copyright ownership. 6 | * The ASF licenses this file to You under the Apache License, Version 2.0 7 | * (the "License"); you may not use this file except in compliance with 8 | * the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
import java.util.List;
import java.util.TreeMap;

/**
 * Resources of a movie; expected to be updated fairly frequently.
 *
 * <p>NOTE(review): in this listing the map declarations read
 * {@code TreeMap> download}, i.e. their type arguments were lost. They are
 * reconstructed here as {@code TreeMap<String, List<Source>>}; the key type
 * and the element type are assumptions and should be confirmed against the
 * original file.
 */
public class MovieSource {

    /**
     * Primary key; the movie title or the pinyin of the title.
     */
    private String id;
    /**
     * Download resources.
     */
    private TreeMap<String, List<Source>> download;
    /**
     * Resources for watching online.
     */
    private TreeMap<String, List<Source>> online;

    /** Creates an instance with all fields {@code null}. */
    public MovieSource() {
    }

    /**
     * @param id       primary key
     * @param download download resources; stored as given, not copied
     * @param online   online resources; stored as given, not copied
     */
    public MovieSource(String id, TreeMap<String, List<Source>> download,
            TreeMap<String, List<Source>> online) {
        super();
        this.id = id;
        this.download = download;
        this.online = online;
    }

    /** @return the primary key, or {@code null} if unset */
    public String getId() {
        return id;
    }

    /** @param id the primary key */
    public void setId(String id) {
        this.id = id;
    }

    /** @return the download map itself (not a copy), or {@code null} if unset */
    public TreeMap<String, List<Source>> getDownload() {
        return download;
    }

    /** @param download the download map; stored as given */
    public void setDownload(TreeMap<String, List<Source>> download) {
        this.download = download;
    }

    /** @return the online map itself (not a copy), or {@code null} if unset */
    public TreeMap<String, List<Source>> getOnline() {
        return online;
    }

    /** @param online the online map; stored as given */
    public void setOnline(TreeMap<String, List<Source>> online) {
        this.online = online;
    }

    /** Prints a greeting; does not use the class otherwise. */
    public static void main(String[] args) {
        System.out.println("Hello World!");
    }

    /**
     * A single resource: a link plus a validity score.
     *
     * <p>NOTE(review): this is a non-static inner class although it uses
     * nothing from the enclosing instance. It is left that way so existing
     * {@code outer.new Source(...)} call sites keep compiling; consider
     * making it {@code static} once callers are checked.
     */
    class Source {
        /**
         * Link to the resource.
         */
        private String url;
        /**
         * Validity score. Intended rules per the original author's note: a
         * report that the link is dead subtracts 1, a report that it works
         * adds 2 (dead links are reported often, working ones rarely get
         * feedback), and a source whose score falls below -20 is deleted.
         * None of these rules is enforced in this class.
         */
        private int value;

        /** Creates a source with a {@code null} url and a score of 0. */
        public Source() {
        }

        /**
         * @param url   link to the resource
         * @param value initial validity score
         */
        public Source(String url, int value) {
            super();
            this.url = url;
            this.value = value;
        }

        /** @return the link, or {@code null} if unset */
        public String getUrl() {
            return url;
        }

        /** @param url the link */
        public void setUrl(String url) {
            this.url = url;
        }

        /** @return the validity score */
        public int getValue() {
            return value;
        }

        /** @param value the validity score */
        public void setValue(int value) {
            this.value = value;
        }

    }
}
17 | */ 18 | package org.sbs.goodcrawler.plugin.storage; 19 | 20 | /** 21 | * @author whiteme 22 | * @date 2013年7月28日 23 | * @desc 创建索引及mapping 24 | */ 25 | public class Prepare { 26 | 27 | } 28 | -------------------------------------------------------------------------------- /src/main/java/org/sbs/goodcrawler/plugin/storage/p/IESStoragePlugin.java: -------------------------------------------------------------------------------- 1 | /** 2 | * 3 | */ 4 | package org.sbs.goodcrawler.plugin.storage.p; 5 | 6 | import org.sbs.goodcrawler.jobconf.StoreConfig; 7 | import org.sbs.goodcrawler.page.ExtractedPage; 8 | 9 | /** 10 | * @author shenbaise(shenbaise1001@126.com) 11 | * es存储的扩展接口 12 | */ 13 | public abstract class IESStoragePlugin { 14 | protected StoreConfig config; 15 | 16 | public StoreConfig getConfig() { 17 | return config; 18 | } 19 | 20 | public void setConfig(StoreConfig config) { 21 | this.config = config; 22 | } 23 | 24 | public abstract ExtractedPage process(ExtractedPage page); 25 | } 26 | -------------------------------------------------------------------------------- /src/main/java/org/sbs/goodcrawler/plugin/storage/package-info.java: -------------------------------------------------------------------------------- 1 | /** 2 | * ########################## GoodCrawler ############################ 3 | * Licensed to the Apache Software Foundation (ASF) under one or more 4 | * contributor license agreements. See the NOTICE file distributed with 5 | * this work for additional information regarding copyright ownership. 6 | * The ASF licenses this file to You under the Apache License, Version 2.0 7 | * (the "License"); you may not use this file except in compliance with 8 | * the License. 
You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | */ 18 | /** 19 | * @author shenbaise(shenbaise@outlook.com) 20 | * @date 2013-7-7 21 | */ 22 | package org.sbs.goodcrawler.plugin.storage; -------------------------------------------------------------------------------- /src/main/java/org/sbs/goodcrawler/schedule/RecrawFetherWorkor.java: -------------------------------------------------------------------------------- 1 | /** 2 | * ########################## GoodCrawler ############################ 3 | * Licensed to the Apache Software Foundation (ASF) under one or more 4 | * contributor license agreements. See the NOTICE file distributed with 5 | * this work for additional information regarding copyright ownership. 6 | * The ASF licenses this file to You under the Apache License, Version 2.0 7 | * (the "License"); you may not use this file except in compliance with 8 | * the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 
17 | */ 18 | package org.sbs.goodcrawler.schedule; 19 | 20 | import org.sbs.goodcrawler.exception.QueueException; 21 | import org.sbs.goodcrawler.fetcher.FetchWorker; 22 | import org.sbs.goodcrawler.fetcher.PageFetcher; 23 | import org.sbs.goodcrawler.jobconf.FetchConfig; 24 | import org.sbs.pendingqueue.PendRecraw; 25 | import org.sbs.pendingqueue.PendingManager; 26 | import org.sbs.url.WebURL; 27 | 28 | public class RecrawFetherWorkor extends FetchWorker { 29 | 30 | private PendRecraw pendRecraw = null; 31 | public RecrawFetherWorkor(FetchConfig conf, PageFetcher fetcher) { 32 | super(conf, fetcher); 33 | pendRecraw = PendingManager.getUrlsToRecraw(conf.jobName); 34 | } 35 | 36 | @Override 37 | public void run() { 38 | WebURL url ; 39 | try { 40 | while(!isStop()){ 41 | while(null!=(url=pendRecraw.getElementT())){ 42 | fetchPageWhitoutExtractUrl(url); 43 | // 确保当前任务完成后跳出 44 | if(isStop()) 45 | break; 46 | } 47 | } 48 | } catch (QueueException e) { 49 | e.printStackTrace(); 50 | } 51 | } 52 | 53 | @Override 54 | public void onSuccessed() { 55 | pendRecraw.processedSuccess(); 56 | } 57 | 58 | @Override 59 | public void onFailed(WebURL url) { 60 | pendingPages.processedFailure(); 61 | } 62 | 63 | @Override 64 | public void onIgnored(WebURL url) { 65 | pendingPages.processedIgnored(); 66 | } 67 | 68 | } 69 | -------------------------------------------------------------------------------- /src/main/java/org/sbs/goodcrawler/storage/Storage.java: -------------------------------------------------------------------------------- 1 | /** 2 | * ########################## GoodCrawler ############################ 3 | * Licensed to the Apache Software Foundation (ASF) under one or more 4 | * contributor license agreements. See the NOTICE file distributed with 5 | * this work for additional information regarding copyright ownership. 
6 | * The ASF licenses this file to You under the Apache License, Version 2.0 7 | * (the "License"); you may not use this file except in compliance with 8 | * the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | */ 18 | package org.sbs.goodcrawler.storage; 19 | 20 | import org.sbs.goodcrawler.page.ExtractedPage; 21 | /** 22 | * @author shenbaise(shenbaise@outlook.com) 23 | * @date 2013-6-29 24 | * 爬虫的存储接口 25 | */ 26 | public abstract class Storage { 27 | 28 | public Storage(){ 29 | } 30 | /** 31 | * @param object 32 | * @return 33 | * @desc 存储前 34 | */ 35 | public abstract StoreResult beforeStore(); 36 | /** 37 | * @param page 38 | * @return 39 | * @desc 存储时 40 | */ 41 | public abstract StoreResult onStore(ExtractedPage page); 42 | /** 43 | * @param page 44 | * @return 45 | * @desc 存储后 46 | */ 47 | public abstract StoreResult afterStore(ExtractedPage page); 48 | } 49 | -------------------------------------------------------------------------------- /src/main/java/org/sbs/goodcrawler/storage/StorageType.java: -------------------------------------------------------------------------------- 1 | /** 2 | * ########################## GoodCrawler ############################ 3 | * Licensed to the Apache Software Foundation (ASF) under one or more 4 | * contributor license agreements. See the NOTICE file distributed with 5 | * this work for additional information regarding copyright ownership. 6 | * The ASF licenses this file to You under the Apache License, Version 2.0 7 | * (the "License"); you may not use this file except in compliance with 8 | * the License. 
/**
 * Names of the storage back ends known to the crawler.
 *
 * <p>The declaration order is kept as is, since ordinals and
 * {@code values()} order follow it.
 *
 * @author shenbaise(shenbaise@outlook.com)
 * @date 2013-6-29
 */
public enum StorageType {
    LocalFile,
    ElasticSearch,
    Mongodb,
    Hbase,
    Mysql
}
/**
 * Result of a storage step: a {@link Status} plus a message.
 *
 * <p>Both fields are public and mutable, and both are {@code null} on an
 * instance made with the no-argument constructor.
 *
 * @author shenbaise(shenbaise@outlook.com)
 * @date 2013-6-29
 */
public class StoreResult {

    /** Possible results of a storage step. */
    public enum Status {
        success,
        failed,
        ignored
    }

    /** Status of the step. */
    public Status status;
    /** Message accompanying the status. */
    public String messge;

    /** Creates a result with no status and no message. */
    public StoreResult() {
    }

    /**
     * @param status status of the step
     * @param messge message accompanying the status
     */
    public StoreResult(Status status, String messge) {
        this.status = status;
        this.messge = messge;
    }

    /** @return the status, or {@code null} if unset */
    public Status getStatus() {
        return this.status;
    }

    /** @param status new status */
    public void setStatus(Status status) {
        this.status = status;
    }

    /** @return the message, or {@code null} if unset */
    public String getMessge() {
        return this.messge;
    }

    /** @param messge new message */
    public void setMessge(String messge) {
        this.messge = messge;
    }
}
17 | */ 18 | /** 19 | * @author shenbaise(shenbaise@outlook.com) 20 | * @date 2013-6-29 21 | */ 22 | package org.sbs.goodcrawler.storage; -------------------------------------------------------------------------------- /src/main/java/org/sbs/jetty/JettyFactory.java: -------------------------------------------------------------------------------- 1 | package org.sbs.jetty; 2 | 3 | import org.eclipse.jetty.server.Connector; 4 | import org.eclipse.jetty.server.Server; 5 | import org.eclipse.jetty.server.nio.SelectChannelConnector; 6 | import org.eclipse.jetty.webapp.WebAppClassLoader; 7 | import org.eclipse.jetty.webapp.WebAppContext; 8 | 9 | public class JettyFactory { 10 | 11 | private static final String DEFAULT_WEBAPP_PATH = "src/main/webapp"; 12 | private static final String WINDOWS_WEBDEFAULT_PATH = "webdefault-windows.xml"; 13 | 14 | /** 15 | * 创建用于开发运行调试的Jetty Server, 以src/main/webapp为Web应用目录. 16 | */ 17 | public static Server createServerInSource(int port, String contextPath) { 18 | Server server = new Server(); 19 | // 设置在JVM退出时关闭Jetty的钩子。 20 | server.setStopAtShutdown(true); 21 | 22 | SelectChannelConnector connector = new SelectChannelConnector(); 23 | connector.setPort(port); 24 | // 解决Windows下重复启动Jetty居然不报告端口冲突的问题. 25 | connector.setReuseAddress(false); 26 | server.setConnectors(new Connector[] { connector }); 27 | 28 | WebAppContext webContext = new WebAppContext(DEFAULT_WEBAPP_PATH, contextPath); 29 | webContext.setDescriptor("src/main/webapp/WEB-INF/web.xml"); 30 | // 修改webdefault.xml,解决Windows下Jetty Lock住静态文件的问题. 
31 | webContext.setDefaultsDescriptor(WINDOWS_WEBDEFAULT_PATH); 32 | webContext.setResourceBase(DEFAULT_WEBAPP_PATH); 33 | webContext.setClassLoader(Thread.currentThread().getContextClassLoader()); 34 | server.setHandler(webContext); 35 | System.out.println(webContext.getContextPath()); 36 | System.out.println(webContext.getDescriptor()); 37 | System.out.println(webContext.getResourceBase()); 38 | System.out.println(webContext.getBaseResource()); 39 | return server; 40 | } 41 | 42 | /** 43 | * 快速重新启动application,重载target/classes与target/test-classes. 44 | */ 45 | public static void reloadContext(Server server) throws Exception { 46 | WebAppContext context = (WebAppContext) server.getHandler(); 47 | System.out.println("[INFO] Application reloading"); 48 | context.stop(); 49 | WebAppClassLoader classLoader = new WebAppClassLoader(context); 50 | classLoader.addClassPath("target/classes"); 51 | context.setClassLoader(classLoader); 52 | context.start(); 53 | System.out.println("[INFO] Application reloaded"); 54 | } 55 | } -------------------------------------------------------------------------------- /src/main/java/org/sbs/jetty/StartServer.java: -------------------------------------------------------------------------------- 1 | package org.sbs.jetty; 2 | 3 | import org.apache.commons.logging.Log; 4 | import org.apache.commons.logging.LogFactory; 5 | import org.eclipse.jetty.server.Server; 6 | import org.sbs.goodcrawler.bootstrap.BootStrap; 7 | import org.sbs.goodcrawler.exception.ConfigurationException; 8 | 9 | public class StartServer { 10 | 11 | public static final int PORT = 8080; 12 | public static final String CONTEXT = "/gc"; 13 | private Log log = LogFactory.getLog(StartServer.class); 14 | 15 | public static void main(String[] args) throws Exception { 16 | 17 | } 18 | /** 19 | * 启动爬虫 20 | */ 21 | public void startGC(){ 22 | try { 23 | BootStrap.start(); 24 | } catch (ConfigurationException e) { 25 | e.printStackTrace(); 26 | System.exit(-1); 27 | } 28 | } 29 | 
/** 30 | * 启动jetty服务 31 | * @param port 32 | * @param context 33 | */ 34 | public void startJetty(int port,String context){ 35 | final Server server = JettyFactory.createServerInSource(PORT, CONTEXT); 36 | try { 37 | server.stop(); 38 | server.start(); 39 | log.info("Server running at http://localhost:" + PORT + CONTEXT); 40 | new Thread(new Runnable() { 41 | @Override 42 | public void run() { 43 | // 等待用户输入回车重载应用. 44 | log.info("Hit Enter to reload the application quickly"); 45 | try { 46 | while (true) { 47 | char c = (char) System.in.read(); 48 | if (c == '\n') { 49 | JettyFactory.reloadContext(server); 50 | } 51 | } 52 | } catch (Exception e) { 53 | e.printStackTrace(); 54 | } 55 | } 56 | }, "reload jetty server").start(); 57 | 58 | } catch (Exception e) { 59 | e.printStackTrace(); 60 | System.exit(-1); 61 | } 62 | } 63 | } -------------------------------------------------------------------------------- /src/main/java/org/sbs/pendingqueue/PendRecraw.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | package org.sbs.pendingqueue; 18 | 19 | import org.sbs.url.WebURL; 20 | 21 | 22 | /** 23 | * @author shenbaise(shenbaise@outlook.com) 24 | * 定时更新的url--更新中的电视剧等 25 | */ 26 | public class PendRecraw extends AbsPendingQueue { 27 | private static final long serialVersionUID = -2733220512896685281L; 28 | 29 | protected PendRecraw(String jobName) { 30 | super(jobName); 31 | } 32 | 33 | public static void main(String[] args) { 34 | System.out.println(new PendRecraw("hello").pendingStatus()); 35 | } 36 | } 37 | -------------------------------------------------------------------------------- /src/main/java/org/sbs/pendingqueue/PendingPages.java: -------------------------------------------------------------------------------- 1 | /** 2 | * ########################## GoodCrawler ############################ 3 | * Licensed to the Apache Software Foundation (ASF) under one or more 4 | * contributor license agreements. See the NOTICE file distributed with 5 | * this work for additional information regarding copyright ownership. 6 | * The ASF licenses this file to You under the Apache License, Version 2.0 7 | * (the "License"); you may not use this file except in compliance with 8 | * the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 
17 | */ 18 | package org.sbs.pendingqueue; 19 | 20 | import org.sbs.goodcrawler.page.Page; 21 | 22 | /** 23 | * @author shenbaise(shenbaise@outlook.com) 24 | * @date 2013-6-29 等待处理的页面 25 | */ 26 | public class PendingPages extends AbsPendingQueue { 27 | private static final long serialVersionUID = -5671808882701246813L; 28 | protected PendingPages(String jobName) { 29 | super(jobName); 30 | } 31 | } 32 | -------------------------------------------------------------------------------- /src/main/java/org/sbs/pendingqueue/PendingStore.java: -------------------------------------------------------------------------------- 1 | /** 2 | * ########################## GoodCrawler ############################ 3 | * Licensed to the Apache Software Foundation (ASF) under one or more 4 | * contributor license agreements. See the NOTICE file distributed with 5 | * this work for additional information regarding copyright ownership. 6 | * The ASF licenses this file to You under the Apache License, Version 2.0 7 | * (the "License"); you may not use this file except in compliance with 8 | * the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 
17 | */ 18 | package org.sbs.pendingqueue; 19 | 20 | import org.sbs.goodcrawler.page.ExtractedPage; 21 | 22 | /** 23 | * @author shenbaise(shenbaise@outlook.com) 24 | * @date 2013-6-30 25 | */ 26 | @SuppressWarnings("rawtypes") 27 | public class PendingStore extends AbsPendingQueue { 28 | 29 | private static final long serialVersionUID = 7211446103736928404L; 30 | 31 | protected PendingStore(String jobName) { 32 | super(jobName); 33 | } 34 | } 35 | -------------------------------------------------------------------------------- /src/main/java/org/sbs/pendingqueue/PendingUrls.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | package org.sbs.pendingqueue; 18 | 19 | import org.sbs.url.WebURL; 20 | 21 | 22 | /** 23 | * @author shenbaise(shenbaise@outlook.com) 24 | * @date 2013-6-29 25 | * @desc 待处理的Urls队列 26 | */ 27 | public class PendingUrls extends AbsPendingQueue { 28 | private static final long serialVersionUID = -2733220512896685281L; 29 | 30 | protected PendingUrls(String jobName) { 31 | super(jobName); 32 | } 33 | 34 | public static void main(String[] args) { 35 | System.out.println(new PendingUrls("hello").pendingStatus()); 36 | } 37 | } 38 | -------------------------------------------------------------------------------- /src/main/java/org/sbs/pendingqueue/package-info.java: -------------------------------------------------------------------------------- 1 | /** 2 | * ########################## GoodCrawler ############################ 3 | * Licensed to the Apache Software Foundation (ASF) under one or more 4 | * contributor license agreements. See the NOTICE file distributed with 5 | * this work for additional information regarding copyright ownership. 6 | * The ASF licenses this file to You under the Apache License, Version 2.0 7 | * (the "License"); you may not use this file except in compliance with 8 | * the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 
17 | */ 18 | package org.sbs.pendingqueue; -------------------------------------------------------------------------------- /src/main/java/org/sbs/robotstxt/HostDirectives.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.sbs.robotstxt; 19 | 20 | /** 21 | * @author Yasser Ganjisaffar 22 | */ 23 | public class HostDirectives { 24 | 25 | // If we fetched the directives for this host more than 26 | // 24 hours, we have to re-fetch it. 
27 | private static final long EXPIRATION_DELAY = 24 * 60 * 1000L; 28 | 29 | private RuleSet disallows = new RuleSet(); 30 | private RuleSet allows = new RuleSet(); 31 | 32 | private long timeFetched; 33 | private long timeLastAccessed; 34 | 35 | public HostDirectives() { 36 | timeFetched = System.currentTimeMillis(); 37 | } 38 | 39 | public boolean needsRefetch() { 40 | return (System.currentTimeMillis() - timeFetched > EXPIRATION_DELAY); 41 | } 42 | 43 | public boolean allows(String path) { 44 | timeLastAccessed = System.currentTimeMillis(); 45 | return !disallows.containsPrefixOf(path) || allows.containsPrefixOf(path); 46 | } 47 | 48 | public void addDisallow(String path) { 49 | disallows.add(path); 50 | } 51 | 52 | public void addAllow(String path) { 53 | allows.add(path); 54 | } 55 | 56 | public long getLastAccessTime() { 57 | return timeLastAccessed; 58 | } 59 | } -------------------------------------------------------------------------------- /src/main/java/org/sbs/robotstxt/RobotstxtConfig.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
/**
 * Settings that control how robots.txt is handled.
 */
public class RobotstxtConfig {

    /**
     * Whether the crawler obeys the robots.txt protocol
     * (see http://www.robotstxt.org/). Defaults to {@code true}.
     */
    private boolean enabled = true;

    /**
     * User-agent name used to find out whether a server has rules that apply
     * specifically to this agent. Defaults to "crawler4j".
     */
    private String userAgentName = "crawler4j";

    /**
     * Maximum number of hosts whose robots.txt is cached. Defaults to 500.
     */
    private int cacheSize = 500;

    /** @return whether robots.txt handling is switched on */
    public boolean isEnabled() {
        return this.enabled;
    }

    /** @param enabled {@code true} to switch robots.txt handling on */
    public void setEnabled(boolean enabled) {
        this.enabled = enabled;
    }

    /** @return the configured user-agent name */
    public String getUserAgentName() {
        return this.userAgentName;
    }

    /** @param userAgentName user-agent name to store */
    public void setUserAgentName(String userAgentName) {
        this.userAgentName = userAgentName;
    }

    /** @return the configured cache size */
    public int getCacheSize() {
        return this.cacheSize;
    }

    /** @param cacheSize cache size to store */
    public void setCacheSize(int cacheSize) {
        this.cacheSize = cacheSize;
    }
}
/**
 * Sorted set of path prefixes that never stores a string which already has one
 * of its own prefixes in the set.
 *
 * The type arguments are spelled out: with a raw {@code TreeSet} the
 * {@code add(String)} override and the {@code startsWith(sub.last())} calls do
 * not compile.
 */
public class RuleSet extends TreeSet<String> {

    private static final long serialVersionUID = 1L;

    /**
     * Adds a prefix unless a prefix of it is already present; entries that the
     * new prefix makes redundant are removed.
     *
     * @param str prefix to add
     * @return {@code false} if a stored entry is already a prefix of
     *         {@code str} or {@code str} itself is present, {@code true} otherwise
     */
    @Override
    public boolean add(String str) {
        SortedSet<String> sub = headSet(str);
        if (!sub.isEmpty() && str.startsWith(sub.last())) {
            // no need to add; prefix is already present
            return false;
        }
        boolean retVal = super.add(str);
        // str + "\0" is the smallest string sorting after str itself, so this
        // view starts at the first candidate that str could be a prefix of.
        sub = tailSet(str + "\0");
        while (!sub.isEmpty() && sub.first().startsWith(str)) {
            // remove redundant entries
            sub.remove(sub.first());
        }
        return retVal;
    }

    /**
     * Tells whether the set holds {@code s} itself or a prefix of {@code s}.
     *
     * @param s string to test
     * @return {@code true} if some stored entry is a prefix of (or equal to) {@code s}
     */
    public boolean containsPrefixOf(String s) {
        SortedSet<String> sub = headSet(s);
        // because redundant prefixes have been eliminated,
        // only a test against last item in headSet is necessary
        if (!sub.isEmpty() && s.startsWith(sub.last())) {
            return true; // prefix substring exists
        }
        // might still exist exactly (headSet does not contain boundary)
        return contains(s);
    }
}
/**
 * Singleton holding the entries read from the class-path resource
 * {@code tld-names.txt}.
 */
public class TLDList {

    private static final String TLD_NAMES_FILE_NAME = "tld-names.txt";

    private static final TLDList instance = new TLDList();

    private final Set<String> tldSet = new HashSet<>();

    /**
     * Loads every non-empty line that does not start with {@code //}.
     * A missing resource terminates the JVM with exit code -1; a failure while
     * reading is printed and leaves the set with whatever was read so far.
     */
    private TLDList() {
        InputStream stream = this.getClass().getClassLoader().getResourceAsStream(TLD_NAMES_FILE_NAME);
        if (stream == null) {
            System.err.println("Couldn't find " + TLD_NAMES_FILE_NAME);
            System.exit(-1);
        }

        // try-with-resources closes the reader (and the stream) even when reading fails.
        // Decode explicitly instead of relying on the platform default charset.
        try (BufferedReader reader = new BufferedReader(
                new InputStreamReader(stream, java.nio.charset.StandardCharsets.UTF_8))) {
            String line;
            while ((line = reader.readLine()) != null) {
                line = line.trim();
                if (line.isEmpty() || line.startsWith("//")) {
                    continue;
                }
                tldSet.add(line);
            }
        } catch (Exception e) {
            // Deliberately best-effort, as before: report and keep the partial set.
            e.printStackTrace();
        }
    }

    /** @return the shared instance */
    public static TLDList getInstance() {
        return instance;
    }

    /**
     * @param str candidate entry
     * @return {@code true} if {@code str} is one of the loaded lines
     */
    public boolean contains(String str) {
        return tldSet.contains(str);
    }

}
15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | */ 18 | package org.sbs.url; 19 | 20 | import java.util.List; 21 | import java.util.regex.Pattern; 22 | 23 | import com.google.common.collect.Lists; 24 | 25 | /** 26 | * @author shenbaise(shenbaise@outlook.com) 27 | * @date 2013-6-30 28 | * Url正则匹配器 29 | */ 30 | public class UlrFilters { 31 | 32 | List patterns = Lists.newArrayList(); 33 | 34 | public void init(){ 35 | // Pattern pattern = Pattern.compile(regex); 36 | } 37 | 38 | 39 | 40 | /** 41 | * @param args 42 | * @desc 43 | */ 44 | public static void main(String[] args) { 45 | // TODO Auto-generated method stub 46 | 47 | } 48 | 49 | } 50 | -------------------------------------------------------------------------------- /src/main/java/org/sbs/url/UrlSignatureSet.java: -------------------------------------------------------------------------------- 1 | /** 2 | * ########################## GoodCrawler ############################ 3 | * Licensed to the Apache Software Foundation (ASF) under one or more 4 | * contributor license agreements. See the NOTICE file distributed with 5 | * this work for additional information regarding copyright ownership. 6 | * The ASF licenses this file to You under the Apache License, Version 2.0 7 | * (the "License"); you may not use this file except in compliance with 8 | * the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 
/**
 * Process-wide set of URL signatures (md5 or simhash values).
 * Deprecated: de-duplication is done with a bloom filter instead.
 *
 * @author shenbaise(shenbaise@outlook.com)
 * @date 2013-6-29
 * @deprecated superseded by the bloom-filter based duplicate check
 */
@Deprecated
public class UrlSignatureSet {

    private static final Set<String> signatureSet = new HashSet<>(1024*1024*10);

    /**
     * Remembers a signature.
     *
     * @param b signature to store
     */
    public static void add(String b){
        signatureSet.add(b);
    }

    /**
     * Tells whether a signature has been stored before.
     *
     * @param b signature to look up
     * @return {@code true} if {@code b} was previously passed to {@link #add(String)}
     */
    public static boolean duplicate(String b){
        // Membership test. The former signatureSet.equals(b) compared the set
        // itself with a String and therefore always returned false.
        return signatureSet.contains(b);
    }
}
/**
 * Helpers for detecting Chinese (CJK) characters in text.
 */
public class CharUtil {

    /** One or more characters in the range U+4E00..U+9FBF. */
    private static final Pattern CJK_RANGE = Pattern.compile("[\\u4E00-\\u9FBF]+");

    /**
     * An assigned character of the CJK Unified Ideographs block. The
     * intersection operator only works inside a character class; written
     * outside the brackets, "&&" is matched literally and ordinary text never matches.
     */
    private static final Pattern CJK_BLOCK = Pattern.compile("[\\p{InCJKUnifiedIdeographs}&&\\P{Cn}]");

    /**
     * Demo: prints the verdict of every check for a few sample strings.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        String[] strArr = new String[] { "www.micmiu.com",
                "!@#$%^&*()_+{}[]|\"'?/:;<>,.", "!¥……()——:;“”‘’《》,。?、", "不要啊",
                "やめて", "韩佳人", "한가인" };
        for (String str : strArr) {
            System.out.println("===========> 测试字符串:" + str);
            System.out.println("正则判断:" + isChineseByREG(str) + " -- "
                    + isChineseByName(str));
            System.out.println("Unicode判断结果 :" + isChinese(str));
            System.out.println("详细判断列表:");
            for (char c : str.toCharArray()) {
                System.out.println(c + " --> " + (isChinese(c) ? "是" : "否"));
            }
        }

    }

    /**
     * Classifies one character by its Unicode block. CJK ideograph blocks count,
     * and so do the CJK symbol, full-width form and general punctuation blocks.
     */
    private static boolean isChinese(char c) {
        Character.UnicodeBlock ub = Character.UnicodeBlock.of(c);
        return ub == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS
                || ub == Character.UnicodeBlock.CJK_COMPATIBILITY_IDEOGRAPHS
                || ub == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A
                || ub == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS_EXTENSION_B
                || ub == Character.UnicodeBlock.CJK_SYMBOLS_AND_PUNCTUATION
                || ub == Character.UnicodeBlock.HALFWIDTH_AND_FULLWIDTH_FORMS
                || ub == Character.UnicodeBlock.GENERAL_PUNCTUATION;
    }

    /**
     * Tells whether the string contains at least one Chinese character or
     * CJK/full-width/general punctuation mark.
     *
     * @param strName text to inspect; {@code null} yields {@code false}, in line
     *                with the other checks in this class
     * @return {@code true} if any character passes the Unicode-block test
     */
    public static boolean isChinese(String strName) {
        if (strName == null) {
            return false;
        }
        for (int i = 0; i < strName.length(); i++) {
            if (isChinese(strName.charAt(i))) {
                return true;
            }
        }
        return false;
    }

    /**
     * Range-based check; covers only part of the CJK characters
     * (U+4E00..U+9FBF).
     *
     * @param str text to inspect; {@code null} yields {@code false}
     * @return {@code true} if the text contains a character from that range
     */
    public static boolean isChineseByREG(String str) {
        if (str == null) {
            return false;
        }
        return CJK_RANGE.matcher(str.trim()).find();
    }

    /**
     * Block-name based check; covers only the CJK Unified Ideographs block.
     *
     * @param str text to inspect; {@code null} yields {@code false}
     * @return {@code true} if the text contains an assigned character of that block
     */
    public static boolean isChineseByName(String str) {
        if (str == null) {
            return false;
        }
        // \p{...} selects a property, \P{...} its complement; \p{Cn} is
        // "unassigned", so \P{Cn} keeps only assigned code points.
        return CJK_BLOCK.matcher(str.trim()).find();
    }
}
/**
 * Contract for checking whether a URL has already been handled.
 *
 * @author shenbaise(shenbaise@outlook.com)
 * @date 2013-6-29
 */
public interface CheckIfUniqueUrl {

    /**
     * @param url URL to check
     * @return {@code true} if the implementation regards {@code url} as already seen
     */
    boolean isDuplicate(String url);
}
You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | */ 18 | package org.sbs.util; 19 | 20 | /** 21 | * @author shenbaise(shenbaise@outlook.com) 22 | * @date 2013-6-29 23 | */ 24 | public class CheckIfUniqueUrlByBloomfilter implements CheckIfUniqueUrl { 25 | 26 | /** 27 | * BloomFilter实例 28 | */ 29 | private static BloomFilter bloomFilter = new BloomFilter(0.001, 1024*1024); 30 | /** 31 | * CheckIfUniqueUrlByBloomfilter单例 32 | */ 33 | private CheckIfUniqueUrlByBloomfilter instance = null; 34 | 35 | private CheckIfUniqueUrlByBloomfilter(){}; 36 | 37 | /** 38 | * @desc 返回单例 39 | */ 40 | public CheckIfUniqueUrlByBloomfilter getInstance(){ 41 | if(instance==null){ 42 | instance = new CheckIfUniqueUrlByBloomfilter(); 43 | } 44 | return instance; 45 | } 46 | 47 | /* (non-Javadoc) 48 | * @see org.sbs.goodcrawler.urlmanager.CheckIfUniqueUrl#isDuplicate(java.lang.String) 49 | */ 50 | @Override 51 | public boolean isDuplicate(String url) { 52 | boolean b = bloomFilter.contains(url); 53 | if(!b) 54 | bloomFilter.add(url); 55 | return b; 56 | } 57 | 58 | } 59 | -------------------------------------------------------------------------------- /src/main/java/org/sbs/util/CheckIfUniqueUrlByMd5.java: -------------------------------------------------------------------------------- 1 | /** 2 | * ########################## GoodCrawler ############################ 3 | * Licensed to the Apache Software Foundation (ASF) under one or more 4 | * contributor license agreements. See the NOTICE file distributed with 5 | * this work for additional information regarding copyright ownership. 
6 | * The ASF licenses this file to You under the Apache License, Version 2.0 7 | * (the "License"); you may not use this file except in compliance with 8 | * the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | */ 18 | package org.sbs.util; 19 | 20 | import org.sbs.url.UrlSignatureSet; 21 | 22 | /** 23 | * @author shenbaise(shenbaise@outlook.com) 24 | * @date 2013-6-29 25 | * 通过检测Url的md5信息比较Url是否重复 26 | */ 27 | @Deprecated 28 | public class CheckIfUniqueUrlByMd5 implements CheckIfUniqueUrl{ 29 | 30 | @Override 31 | public boolean isDuplicate(String url) { 32 | return UrlSignatureSet.duplicate(url); 33 | } 34 | } 35 | -------------------------------------------------------------------------------- /src/main/java/org/sbs/util/IO.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
/**
 * Small file-system helpers.
 *
 * @author Yasser Ganjisaffar
 */
public class IO {

    /**
     * Deletes a folder together with everything below it.
     *
     * @param folder folder to delete
     * @return {@code true} only if the contents and the folder itself were deleted
     */
    public static boolean deleteFolder(File folder) {
        return deleteFolderContents(folder) && folder.delete();
    }

    /**
     * Deletes everything inside a folder but keeps the folder itself.
     * Stops at the first entry that cannot be deleted.
     *
     * @param folder folder to empty
     * @return {@code true} if nothing is left to delete; {@code false} if an
     *         entry could not be deleted or the directory could not be listed
     */
    public static boolean deleteFolderContents(File folder) {
        System.out.println("Deleting content of: " + folder.getAbsolutePath());
        File[] files = folder.listFiles();
        if (files == null) {
            // listFiles() returns null for a non-directory or on an I/O error.
            // A non-directory has no contents to delete; an unreadable directory is a failure.
            return !folder.isDirectory();
        }
        for (File file : files) {
            boolean deleted = file.isFile() ? file.delete() : deleteFolder(file);
            if (!deleted) {
                return false;
            }
        }
        return true;
    }

    /**
     * Appends bytes to a file, creating the file if necessary. Failures are
     * printed, not thrown.
     *
     * @param bytes       data to append
     * @param destination path of the target file
     */
    public static void writeBytesToFile(byte[] bytes, String destination) {
        // try-with-resources closes stream and channel even if the write fails.
        try (FileOutputStream out = new FileOutputStream(destination, true);
                FileChannel fc = out.getChannel()) {
            ByteBuffer buffer = ByteBuffer.wrap(bytes);
            // A single write() call may consume only part of the buffer.
            while (buffer.hasRemaining()) {
                fc.write(buffer);
            }
        } catch (Exception e) {
            // Deliberately best-effort, as before.
            e.printStackTrace();
        }
    }
}
You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | */ 18 | package org.sbs.util; 19 | 20 | import java.util.HashMap; 21 | import java.util.List; 22 | import java.util.Map; 23 | 24 | import net.sf.json.JSONObject; 25 | 26 | /** 27 | * @author shenbaise(shenbaise@outlook.com) 28 | * @date 2013-7-4 29 | * json工具 30 | */ 31 | public class JsonUtil { 32 | 33 | /** 34 | * 构造函数 35 | */ 36 | public JsonUtil() { 37 | } 38 | 39 | public static JSONObject generate(List list) { 40 | Map map = new HashMap(); 41 | map.put("totalProperty", list.size()); 42 | map.put("root", list); 43 | return JSONObject.fromObject(map); 44 | } 45 | 46 | public static JSONObject javabean2json(Object object) { 47 | Map map = new HashMap(); 48 | map.put("success", true); 49 | map.put("data", object); 50 | return JSONObject.fromObject(map); 51 | } 52 | 53 | public static JSONObject objectcollect2json(List list, String total) { 54 | Map map = new HashMap(); 55 | map.put("totalProperty", total); 56 | map.put("root", list); 57 | return JSONObject.fromObject(map); 58 | } 59 | 60 | /** 61 | * @param args 62 | * @desc 63 | */ 64 | public static void main(String[] args) { 65 | 66 | } 67 | 68 | } 69 | -------------------------------------------------------------------------------- /src/main/java/org/sbs/util/Simhash.java: -------------------------------------------------------------------------------- 1 | package org.sbs.util; 2 | 3 | import java.util.List; 4 | import java.util.Set; 5 | 6 | public class Simhash { 7 | 8 | private IWordSeg wordSeg; 9 | 10 | public Simhash(IWordSeg wordSeg) { 11 | this.wordSeg = wordSeg; 
12 | } 13 | 14 | public int hammingDistance(int hash1, int hash2) { 15 | int i = hash1 ^ hash2; 16 | i = i - ((i >>> 1) & 0x55555555); 17 | i = (i & 0x33333333) + ((i >>> 2) & 0x33333333); 18 | i = (i + (i >>> 4)) & 0x0f0f0f0f; 19 | i = i + (i >>> 8); 20 | i = i + (i >>> 16); 21 | return i & 0x3f; 22 | } 23 | 24 | public int hammingDistance(long hash1, long hash2) { 25 | long i = hash1 ^ hash2; 26 | i = i - ((i >>> 1) & 0x5555555555555555L); 27 | i = (i & 0x3333333333333333L) + ((i >>> 2) & 0x3333333333333333L); 28 | i = (i + (i >>> 4)) & 0x0f0f0f0f0f0f0f0fL; 29 | i = i + (i >>> 8); 30 | i = i + (i >>> 16); 31 | i = i + (i >>> 32); 32 | return (int) i & 0x7f; 33 | } 34 | 35 | public long simhash64(String doc) { 36 | int bitLen = 64; 37 | int[] bits = new int[bitLen]; 38 | List tokens = wordSeg.tokens(doc); 39 | for (String t : tokens) { 40 | long v = MurmurHash.hash64(t); 41 | for (int i = bitLen; i >= 1; --i) { 42 | if (((v >> (bitLen - i)) & 1) == 1) 43 | ++bits[i - 1]; 44 | else 45 | --bits[i - 1]; 46 | } 47 | } 48 | long hash = 0x0000000000000000; 49 | long one = 0x0000000000000001; 50 | for (int i = bitLen; i >= 1; --i) { 51 | if (bits[i - 1] > 1) { 52 | hash |= one; 53 | } 54 | one = one << 1; 55 | } 56 | return hash; 57 | } 58 | 59 | public long simhash32(String doc) { 60 | int bitLen = 32; 61 | int[] bits = new int[bitLen]; 62 | List tokens = wordSeg.tokens(doc); 63 | for (String t : tokens) { 64 | int v = MurmurHash.hash32(t); 65 | for (int i = bitLen; i >= 1; --i) { 66 | if (((v >> (bitLen - i)) & 1) == 1) 67 | ++bits[i - 1]; 68 | else 69 | --bits[i - 1]; 70 | } 71 | } 72 | int hash = 0x00000000; 73 | int one = 0x00000001; 74 | for (int i = bitLen; i >= 1; --i) { 75 | if (bits[i - 1] > 1) { 76 | hash |= one; 77 | } 78 | one = one << 1; 79 | } 80 | return hash; 81 | } 82 | 83 | public interface IWordSeg { 84 | public List tokens(String doc); 85 | public List tokens(String doc, Set stopWords); 86 | } 87 | } 
-------------------------------------------------------------------------------- /src/main/java/org/sbs/util/StringHelper.java: -------------------------------------------------------------------------------- 1 | /** 2 | * ######################## SHENBAISE'S WORK ########################## 3 | * Licensed to the Apache Software Foundation (ASF) under one or more 4 | * contributor license agreements. See the NOTICE file distributed with 5 | * this work for additional information regarding copyright ownership. 6 | * The ASF licenses this file to You under the Apache License, Version 2.0 7 | * (the "License"); you may not use this file except in compliance with 8 | * the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 
17 | */ 18 | package org.sbs.util; 19 | 20 | import java.io.UnsupportedEncodingException; 21 | import java.net.URLDecoder; 22 | 23 | import org.apache.commons.lang3.StringUtils; 24 | 25 | import com.google.common.base.CharMatcher; 26 | 27 | /** 28 | * @author whiteme 29 | * @date 2013年7月25日 30 | * @desc 字符处理 31 | */ 32 | public class StringHelper { 33 | /** 34 | * @param isoString 35 | * @return 36 | */ 37 | public static String isoToUtf8(String isoString){ 38 | if(StringUtils.isBlank(isoString)) 39 | return ""; 40 | try { 41 | return new String(isoString.getBytes("iso8859-1"),"utf-8"); 42 | } catch (UnsupportedEncodingException e) { 43 | e.printStackTrace(); 44 | } 45 | return ""; 46 | } 47 | /** 48 | * @param urlString 49 | * @return 50 | */ 51 | public static String urlDecodeString(String urlString){ 52 | if(StringUtils.isBlank(urlString)) 53 | return ""; 54 | try { 55 | return URLDecoder.decode(urlString, "utf-8"); 56 | } catch (UnsupportedEncodingException e) { 57 | e.printStackTrace(); 58 | } 59 | return ""; 60 | } 61 | /** 62 | * 删除特殊字符、字母、数字、空格 63 | * @param s 64 | * @return 65 | */ 66 | public static String removeAB12Blank(String s){ 67 | return CharMatcher.anyOf("abcdefghijklmnopqrstuvwxyz;&ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789.~!@#$%^&*()-+= 》《> submit(Callable call){ 53 | return pool.submit(call); 54 | } 55 | } 56 | -------------------------------------------------------------------------------- /src/main/java/org/sbs/util/image/ImageResizePool.java: -------------------------------------------------------------------------------- 1 | /** 2 | * ######################## SHENBAISE'S WORK ########################## 3 | * Licensed to the Apache Software Foundation (ASF) under one or more 4 | * contributor license agreements. See the NOTICE file distributed with 5 | * this work for additional information regarding copyright ownership. 
/**
 * A simple process-wide thread pool used to submit image-compression tasks.
 * Obtain the shared instance through {@link #getInstance()}.
 *
 * @author whiteme
 * @date 2013-10-22
 */
public class ImageResizePool {
    /**
     * The underlying executor (fixed pool of 10 threads).
     */
    public ExecutorService pool;

    private static ImageResizePool instance;

    private ImageResizePool() {
        pool = Executors.newFixedThreadPool(10);
    }

    /**
     * Returns the shared pool, creating it on first use.
     * Synchronized so that concurrent first calls cannot each create (and leak)
     * a separate 10-thread executor.
     *
     * @return the single shared instance
     */
    public static synchronized ImageResizePool getInstance() {
        if (instance == null) {
            instance = new ImageResizePool();
        }
        return instance;
    }

    /**
     * Submits a task to the pool.
     *
     * @param call the task to run
     * @param <T>  the task's result type
     * @return a future for the task's result
     */
    public <T> Future<T> submit(Callable<T> call) {
        return pool.submit(call);
    }
}
| */ 12 | @WebListener 13 | public class ContextListener extends ServletContextEvent implements ServletContextListener { 14 | 15 | private static final long serialVersionUID = -2217410422446016104L; 16 | 17 | /** 18 | * @see ServletContextEvent#ServletContextEvent(ServletContext) 19 | */ 20 | public ContextListener(ServletContext source) { 21 | super(source); 22 | // TODO Auto-generated constructor stub 23 | } 24 | 25 | /** 26 | * @see ServletContextListener#contextInitialized(ServletContextEvent) 27 | */ 28 | public void contextInitialized(ServletContextEvent sce) { 29 | // TODO Auto-generated method stub 30 | } 31 | 32 | /** 33 | * @see ServletContextListener#contextDestroyed(ServletContextEvent) 34 | */ 35 | public void contextDestroyed(ServletContextEvent sce) { 36 | // TODO Auto-generated method stub 37 | } 38 | 39 | } 40 | -------------------------------------------------------------------------------- /src/main/java/org/sbs/web/CrawlerManager.java: -------------------------------------------------------------------------------- 1 | package org.sbs.web; 2 | 3 | import java.io.IOException; 4 | 5 | import javax.servlet.ServletException; 6 | import javax.servlet.annotation.WebServlet; 7 | import javax.servlet.http.HttpServlet; 8 | import javax.servlet.http.HttpServletRequest; 9 | import javax.servlet.http.HttpServletResponse; 10 | 11 | /** 12 | * Servlet implementation class CrawlerManager 13 | */ 14 | @WebServlet("/crawler") 15 | public class CrawlerManager extends HttpServlet { 16 | private static final long serialVersionUID = 1L; 17 | 18 | /** 19 | * @see HttpServlet#HttpServlet() 20 | */ 21 | public CrawlerManager() { 22 | super(); 23 | // TODO Auto-generated constructor stub 24 | } 25 | 26 | /** 27 | * @see HttpServlet#doGet(HttpServletRequest request, HttpServletResponse response) 28 | */ 29 | protected void doGet(HttpServletRequest request, HttpServletResponse response) throws ServletException, IOException { 30 | // TODO Auto-generated method stub 31 | 
System.out.println("do get "); 32 | 33 | System.out.println(CrawlerManager.class.getResource("job_conf.xml").getFile()); 34 | //BootStrap.start(CrawlerManager.class.getResource("job_conf.xml").getFile()); 35 | } 36 | 37 | /** 38 | * @see HttpServlet#doPost(HttpServletRequest request, HttpServletResponse response) 39 | */ 40 | protected void doPost(HttpServletRequest request, HttpServletResponse response) throws ServletException, IOException { 41 | // TODO Auto-generated method stub 42 | System.out.println("do post"); 43 | } 44 | 45 | } 46 | -------------------------------------------------------------------------------- /src/main/java/org/sbs/web/GoodServlet.java: -------------------------------------------------------------------------------- 1 | package org.sbs.web; 2 | 3 | import java.io.IOException; 4 | import javax.servlet.ServletException; 5 | import javax.servlet.annotation.WebServlet; 6 | import javax.servlet.http.HttpServlet; 7 | import javax.servlet.http.HttpServletRequest; 8 | import javax.servlet.http.HttpServletResponse; 9 | 10 | /** 11 | * Servlet implementation class GoodServlet 12 | */ 13 | @WebServlet("/gcrawler") 14 | public class GoodServlet extends HttpServlet { 15 | private static final long serialVersionUID = 1L; 16 | 17 | /** 18 | * @see HttpServlet#HttpServlet() 19 | */ 20 | public GoodServlet() { 21 | super(); 22 | // TODO Auto-generated constructor stub 23 | } 24 | 25 | /** 26 | * @see HttpServlet#doGet(HttpServletRequest request, HttpServletResponse response) 27 | */ 28 | protected void doGet(HttpServletRequest request, HttpServletResponse response) throws ServletException, IOException { 29 | // TODO Auto-generated method stub 30 | } 31 | 32 | /** 33 | * @see HttpServlet#doPost(HttpServletRequest request, HttpServletResponse response) 34 | */ 35 | protected void doPost(HttpServletRequest request, HttpServletResponse response) throws ServletException, IOException { 36 | // TODO Auto-generated method stub 37 | } 38 | 39 | } 40 | 
-------------------------------------------------------------------------------- /src/main/java/org/sbs/web/Start.java: -------------------------------------------------------------------------------- 1 | package org.sbs.web; 2 | 3 | import java.io.IOException; 4 | 5 | import javax.servlet.ServletException; 6 | import javax.servlet.annotation.WebServlet; 7 | import javax.servlet.http.HttpServlet; 8 | import javax.servlet.http.HttpServletRequest; 9 | import javax.servlet.http.HttpServletResponse; 10 | 11 | import org.sbs.goodcrawler.bootstrap.BootStrap; 12 | import org.sbs.goodcrawler.bootstrap.CrawlerStatus; 13 | import org.sbs.goodcrawler.exception.ConfigurationException; 14 | 15 | /** 16 | * Servlet implementation class Start 17 | */ 18 | @WebServlet("/start") 19 | public class Start extends HttpServlet { 20 | private static final long serialVersionUID = 1L; 21 | 22 | /** 23 | * @see HttpServlet#HttpServlet() 24 | */ 25 | public Start() { 26 | super(); 27 | // TODO Auto-generated constructor stub 28 | } 29 | 30 | /** 31 | * @see HttpServlet#doGet(HttpServletRequest request, HttpServletResponse response) 32 | */ 33 | protected void doGet(HttpServletRequest request, HttpServletResponse response) throws ServletException, IOException { 34 | System.out.println("start"); 35 | if(!CrawlerStatus.running){ 36 | try { 37 | BootStrap.start(); 38 | request.setAttribute("start", "程序正在运行中。。。"); 39 | request.setAttribute("jobs", BootStrap.getJobsNames()); 40 | request.setAttribute("status", CrawlerStatus.getStatus()); 41 | request.getRequestDispatcher("index.jsp").forward(request, response); 42 | } catch (ConfigurationException e) { 43 | e.printStackTrace(); 44 | request.setAttribute("status", e.getMessage()); 45 | request.getRequestDispatcher("index.jsp").forward(request, response); 46 | } 47 | } 48 | } 49 | 50 | } 51 | -------------------------------------------------------------------------------- /src/main/java/org/sbs/web/Status.java: 
-------------------------------------------------------------------------------- 1 | package org.sbs.web; 2 | 3 | import java.io.IOException; 4 | 5 | import javax.servlet.ServletException; 6 | import javax.servlet.annotation.WebServlet; 7 | import javax.servlet.http.HttpServlet; 8 | import javax.servlet.http.HttpServletRequest; 9 | import javax.servlet.http.HttpServletResponse; 10 | 11 | import org.sbs.goodcrawler.bootstrap.BootStrap; 12 | import org.sbs.goodcrawler.bootstrap.CrawlerStatus; 13 | 14 | /** 15 | * Servlet implementation class Status 16 | */ 17 | @WebServlet("/status") 18 | public class Status extends HttpServlet { 19 | private static final long serialVersionUID = 1L; 20 | 21 | /** 22 | * @see HttpServlet#HttpServlet() 23 | */ 24 | public Status() { 25 | super(); 26 | // TODO Auto-generated constructor stub 27 | } 28 | 29 | /** 30 | * @see HttpServlet#doGet(HttpServletRequest request, HttpServletResponse response) 31 | */ 32 | protected void doGet(HttpServletRequest request, HttpServletResponse response) throws ServletException, IOException { 33 | System.out.println("Status"); 34 | 35 | if(CrawlerStatus.running){ 36 | request.setAttribute("start", "程序正在运行中。。。"); 37 | request.setAttribute("jobs", BootStrap.getJobsNames()); 38 | request.setAttribute("status", CrawlerStatus.getStatus()); 39 | }else { 40 | request.setAttribute("stop", "程序停止运行。。。"); 41 | } 42 | // response.sendRedirect("index.jsp"); 43 | request.getRequestDispatcher("index.jsp").forward(request, response); 44 | } 45 | 46 | 47 | } 48 | -------------------------------------------------------------------------------- /src/main/java/org/sbs/web/Stop.java: -------------------------------------------------------------------------------- 1 | package org.sbs.web; 2 | 3 | import java.io.IOException; 4 | 5 | import javax.servlet.ServletException; 6 | import javax.servlet.annotation.WebServlet; 7 | import javax.servlet.http.HttpServlet; 8 | import javax.servlet.http.HttpServletRequest; 9 | import 
javax.servlet.http.HttpServletResponse; 10 | 11 | import org.apache.commons.lang3.StringUtils; 12 | import org.sbs.goodcrawler.bootstrap.BootStrap; 13 | import org.sbs.goodcrawler.bootstrap.CrawlerStatus; 14 | 15 | /** 16 | * Servlet implementation class Stop 17 | */ 18 | @WebServlet("/stop") 19 | public class Stop extends HttpServlet { 20 | private static final long serialVersionUID = 1L; 21 | 22 | /** 23 | * @see HttpServlet#HttpServlet() 24 | */ 25 | public Stop() { 26 | super(); 27 | // TODO Auto-generated constructor stub 28 | } 29 | 30 | /** 31 | * @see HttpServlet#doGet(HttpServletRequest request, HttpServletResponse response) 32 | */ 33 | protected void doGet(HttpServletRequest request, HttpServletResponse response) throws ServletException, IOException { 34 | System.out.println("stop"); 35 | String jobId = request.getParameter("jobId"); 36 | if(StringUtils.isNotBlank(jobId)){ 37 | BootStrap.stop(jobId); 38 | } 39 | if(CrawlerStatus.running){ 40 | BootStrap.stopAll(); 41 | } 42 | request.setAttribute("stop", "程序停止运行。。。"); 43 | request.getRequestDispatcher("index.jsp").forward(request, response); 44 | } 45 | 46 | } 47 | -------------------------------------------------------------------------------- /src/main/java/org/sbs/web/package-info.java: -------------------------------------------------------------------------------- 1 | /** 2 | * ######################## SHENBAISE'S WORK ########################## 3 | * Licensed to the Apache Software Foundation (ASF) under one or more 4 | * contributor license agreements. See the NOTICE file distributed with 5 | * this work for additional information regarding copyright ownership. 6 | * The ASF licenses this file to You under the Apache License, Version 2.0 7 | * (the "License"); you may not use this file except in compliance with 8 | * the License. 
You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | */ 18 | /** 19 | * @author whiteme 20 | * @date 2013年7月29日 21 | * @desc 22 | */ 23 | package org.sbs.web; -------------------------------------------------------------------------------- /src/main/resources/conf.properties: -------------------------------------------------------------------------------- 1 | #\u5F85\u5904\u7406URL\u961F\u5217\u5927\u5C0F 2 | pending.urls.queue.size=500000 3 | #\u5F85\u5904\u7406\u7684\u9875\u9762\u961F\u5217\u5927\u5C0F 4 | pending.pages.queue.size=100000 5 | #\u5904\u7406\u5931\u8D25\u7684\u9875\u9762\u961F\u5217\u5927\u5C0F 6 | failed.pages.queue.size=1000 7 | #\u89E3\u6790\u5931\u8D25\u9875\u9762\u5907\u4EFD\u8DEF\u5F84 8 | pending.store.pages.queue.size=2000 9 | failed.pages.backup.path=./failed-pages/ 10 | #\u662F\u5426\u5FFD\u7565\u9519\u8BEF\u7684\u6216\u8005\u89E3\u6790\u5931\u8D25\u7684\u9875\u9762 11 | ignore.failed.pages=false 12 | status.save.path=./status 13 | -------------------------------------------------------------------------------- /src/main/resources/default_mapping.json: -------------------------------------------------------------------------------- 1 | { 2 | "template": "logstash-*", 3 | "settings" : { 4 | "number_of_shards" : 1, 5 | "number_of_replicas" : 0, 6 | "index" : { 7 | "query" : { "default_field" : "@message" }, 8 | "store" : { "compress" : { "stored" : true, "tv": true } } 9 | } 10 | }, 11 | "mappings": { 12 | "_default_": { 13 | "_all": { "enabled": false }, 14 | "_source": { "compress": true }, 15 | "dynamic_templates": [ 16 | { 17 | "string_template" : { 
18 | "match" : "*", 19 | "mapping": { "type": "string", "index": "not_analyzed" }, 20 | "match_mapping_type" : "string" 21 | } 22 | } 23 | ], 24 | "properties" : { 25 | "@fields": { "type": "object", "dynamic": true, "path": "full" }, 26 | "@message" : { "type" : "string", "index" : "analyzed" }, 27 | "@source" : { "type" : "string", "index" : "not_analyzed" }, 28 | "@source_host" : { "type" : "string", "index" : "not_analyzed" }, 29 | "@source_path" : { "type" : "string", "index" : "not_analyzed" }, 30 | "@tags": { "type": "string", "index" : "not_analyzed" }, 31 | "@timestamp" : { "type" : "date", "index" : "not_analyzed" }, 32 | "@type" : { "type" : "string", "index" : "not_analyzed" } 33 | } 34 | } 35 | } 36 | } -------------------------------------------------------------------------------- /src/main/resources/logback.xml: -------------------------------------------------------------------------------- 1 | 2 | 4 | true 5 | 6 | 7 | logs/gc-log-%d{yyyy-MM-dd}.%i.log 8 | 9 | 1 10 | 3 11 | 10 12 | 14 | 50MB 15 | 16 | 17 | 18 | %date %level [%thread] %logger.%class{0}#%method [%file:%line] %msg%n 19 | 20 | 21 | 22 | 23 | %date %level [%thread] %logger.%class{0}#%method [%file:%line] %msg%n 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | -------------------------------------------------------------------------------- /src/main/resources/mapping.json: -------------------------------------------------------------------------------- 1 | { 2 | "0": { 3 | "_all": { 4 | "enabled": true 5 | }, 6 | "index_analyzer": "ik", 7 | "search_analyzer": "ik", 8 | "_timestamp": { 9 | "enabled": true, 10 | "format": "YYYY-MM-dd" 11 | }, 12 | "dynamic_templates": [ 13 | { 14 | "string_template": { 15 | "match": "*", 16 | "mapping": { 17 | "type": "string", 18 | "index": "not_analyzed" 19 | }, 20 | "match_mapping_type": "string" 21 | } 22 | } 23 | ], 24 | "properties": { 25 | "title": { 26 | "type": "string", 27 | "include_in_all": true, 28 | "index": "analyzed" 29 | }, 30 | "year": { 
31 | "type": "date", 32 | "include_in_all": true, 33 | "index": "not_analyzed" 34 | }, 35 | "actors": { 36 | "type": "string", 37 | "include_in_all": true, 38 | "index": "analyzed" 39 | }, 40 | "director": { 41 | "type": "string", 42 | "include_in_all": true, 43 | "index": "analyzed" 44 | }, 45 | "summary": { 46 | "type": "string", 47 | "include_in_all": false, 48 | "index": "analyzed" 49 | }, 50 | "type": { 51 | "type": "string", 52 | "include_in_all": true, 53 | "index": "not_analyzed" 54 | }, 55 | "category": { 56 | "type": "string", 57 | "include_in_all": true, 58 | "index": "not_analyzed" 59 | } 60 | } 61 | } 62 | } -------------------------------------------------------------------------------- /src/main/webapp/META-INF/MANIFEST.MF: -------------------------------------------------------------------------------- 1 | Manifest-Version: 1.0 2 | Class-Path: 3 | 4 | -------------------------------------------------------------------------------- /src/main/webapp/WEB-INF/web.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | goodcrawler 4 | 5 | index.html 6 | index.htm 7 | index.jsp 8 | default.html 9 | default.htm 10 | default.jsp 11 | 12 | 13 | 14 | crawler 15 | crawler 16 | org.sbs.web.CrawlerManager 17 | 18 | 19 | crawler 20 | /crawler 21 | 22 | 23 | 24 | start 25 | start 26 | org.sbs.web.Start 27 | 28 | 29 | start 30 | /start 31 | 32 | 33 | 34 | stop 35 | stop 36 | org.sbs.web.Stop 37 | 38 | 39 | stop 40 | /stop 41 | 42 | 43 | 44 | 45 | status 46 | status 47 | org.sbs.web.Status 48 | 49 | 50 | status 51 | /status 52 | 53 | -------------------------------------------------------------------------------- /src/main/webapp/index.jsp: -------------------------------------------------------------------------------- 1 | <%@ page language="java" contentType="text/html; charset=UTF-8" 2 | pageEncoding="UTF-8"%> 3 | 4 | 5 | 6 | 7 | Goodcrawler Controller Panel 8 | 9 | 10 |

11 |

12 |

13 | 14 | 15 | 16 | 19 | 22 | 23 | 24 | 27 | 30 | 31 | 32 | 35 | 38 | 39 | 40 | 43 | 46 | 47 | 48 |
17 | 启动
18 |
20 | <%=request.getAttribute("start") %> 21 |
25 | 停止
26 |
28 | <%=request.getAttribute("stop") %> 29 |
33 | 任务: 34 | 36 | <%=request.getAttribute("jobs") %> 37 |
41 | 运行状态(刷新)
:
42 |
44 | <%=request.getAttribute("status") %> 45 |
49 |

50 | 51 | -------------------------------------------------------------------------------- /src/test/java/org/sbs/AppTest.java: -------------------------------------------------------------------------------- 1 | package org.sbs; 2 | 3 | import junit.framework.Test; 4 | import junit.framework.TestCase; 5 | import junit.framework.TestSuite; 6 | 7 | /** 8 | * Unit test for simple App. 9 | */ 10 | public class AppTest 11 | extends TestCase 12 | { 13 | /** 14 | * Create the test case 15 | * 16 | * @param testName name of the test case 17 | */ 18 | public AppTest( String testName ) 19 | { 20 | super( testName ); 21 | } 22 | 23 | /** 24 | * @return the suite of tests being tested 25 | */ 26 | public static Test suite() 27 | { 28 | return new TestSuite( AppTest.class ); 29 | } 30 | 31 | /** 32 | * Rigourous Test :-) 33 | */ 34 | public void testApp() 35 | { 36 | assertTrue( true ); 37 | } 38 | } 39 | -------------------------------------------------------------------------------- /src/test/java/org/sbs/ListLinks.java: -------------------------------------------------------------------------------- 1 | package org.sbs; 2 | 3 | import java.io.IOException; 4 | 5 | import org.jsoup.Jsoup; 6 | import org.jsoup.nodes.Document; 7 | import org.jsoup.nodes.Element; 8 | import org.jsoup.safety.Whitelist; 9 | import org.jsoup.select.Elements; 10 | 11 | public class ListLinks { 12 | public static void main(String[] args) throws IOException { 13 | String url = "http://www.66e.cc/bd/20110611/7de42f3f410b41b99f55854553be25e1.htm"; 14 | 15 | Document doc = Jsoup.connect(url).get(); 16 | 17 | Elements elements = doc.select("p"); 18 | for(Element e:elements){ 19 | // System.out.println(e.text()); 20 | String[] ss = e.toString().split("
/**
 * Scratch program: starts a thread that watches standard input and terminates the
 * JVM when the command {@code quit} (case-insensitive) is entered.
 */
public class T {

    /**
     * @param args unused
     */
    public static void main(String[] args) {

        new Thread(new Runnable() {

            @Override
            public void run() {
                byte[] b = new byte[128];
                while (true) {
                    try {
                        int n = System.in.read(b);
                        if (n < 0) {
                            // End of input: nothing more can arrive, so stop instead of spinning.
                            return;
                        }
                        // Decode only the bytes just read; the rest of the buffer may still
                        // hold data from an earlier, longer line.
                        String input = new String(b, 0, n);
                        input = input.replace("\n", "").replace("\r", "").trim();
                        if (input.equalsIgnoreCase("quit")) {
                            System.exit(0);
                        }
                    } catch (IOException e) {
                        e.printStackTrace();
                        // A broken stdin would fail on every read; give up rather than loop.
                        return;
                    }
                }
            }
        }).start();
    }

}
--------------------------------------------------------------------------------
package org.sbs.extract;

import java.io.File;
import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.Map;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.junit.Test;
import org.sbs.goodcrawler.bootstrap.foreman.FetchForeman;
import org.sbs.goodcrawler.exception.ConfigurationException;
import org.sbs.goodcrawler.exception.ExtractException;
import org.sbs.goodcrawler.jobconf.ExtractConfig;
import org.sbs.goodcrawler.jobconf.FetchConfig;

/**
 * Manual smoke tests that run the extraction job described in
 * {@code conf/wasu_conf.xml} against live pages on www.wasu.cn.
 *
 * <p>Every test needs network access; results are printed to stdout by
 * {@link Tester#test(String, String)}.
 */
public class TestWasu {

    private static final String confFile = "conf/wasu_conf.xml";
    // TV series: "Xin Guimi Shidai"
    private static final String tv = "http://www.wasu.cn/Tele/index/id/1146253";
    // http://v.youku.com/v_show/id_XNTc4NzczNDM2.html
    // Movie: "Naked Weapon"
    private static final String movie = "http://www.wasu.cn/Play/show/id/216708";
    // http://v.youku.com/v_show/id_XNjY4ODk3MjI4.html
    // Variety show: "Chang Chu Ai Huo Hua", episode of 2014-02-20
    private static final String zy = "http://www.wasu.cn/Play/show/id/2194006";
    // Animation: "SpongeBob SquarePants"
    private static final String dm = "http://www.wasu.cn/Tele/index/id/351623";

    private static final String zwdajs = "http://www.wasu.cn/Tele/index/id/1007994";

    /**
     * Extracts a TV-series page. Note that this uses a hard-coded page id that
     * differs from the {@code tv} constant above.
     */
    @Test
    public void tv() {
        new Tester().test(confFile, "http://www.wasu.cn/Tele/index/id/2169706");
    }

    /** Extracts the movie page. */
    @Test
    public void movie() {
        new Tester().test(confFile, movie);
    }

    /** Extracts the variety-show page. */
    @Test
    public void zy() {
        new Tester().test(confFile, zy);
    }

    /** Extracts the animation page. */
    @Test
    public void dm() {
        new Tester().test(confFile, dm);
    }

    /**
     * Scratch test for trying CSS selectors against a live play page.
     *
     * @throws IOException if the page cannot be downloaded; propagated so the
     *         test fails instead of silently passing
     */
    @Test
    public void t() throws IOException {
        // Second argument is the jsoup fetch timeout in milliseconds.
        Document elements = Jsoup.parse(new URL("http://www.wasu.cn/Play/show/id/2339162"), 100000);
        // Category selectors kept for reference:
        // System.out.println(elements.select(".play_information_t .r .one a"));      // title
        // System.out.println(elements.select(".play_information_t .r .two a "));     // area
        // System.out.println(elements.select(".play_information_t .r .three a"));    // type
        // System.out.println(elements.select(".play_information_t .r .four a"));     // director
        // System.out.println(elements.select(".play_information_t .r .five .r a"));  // actor
        // System.out.println(elements.select(".play_information_b .one b"));

        System.out.println(elements.select(".play_seat a").get(1));
    }
}

--------------------------------------------------------------------------------
/src/test/java/org/sbs/extract/TestYouku.java:
--------------------------------------------------------------------------------
package org.sbs.extract;

import java.io.File;
import java.io.IOException;
import java.net.URL;
import java.util.Map;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.junit.Test;
import org.sbs.goodcrawler.bootstrap.foreman.FetchForeman;
import org.sbs.goodcrawler.exception.ConfigurationException;
import org.sbs.goodcrawler.exception.ExtractException;
import org.sbs.goodcrawler.jobconf.ExtractConfig;
import org.sbs.goodcrawler.jobconf.FetchConfig;

/**
 * Manual smoke tests that run the extraction job described in
 * {@code conf/youku_conf.xml} against live pages on youku.com.
 *
 * <p>Every test needs network access; results are printed to stdout by
 * {@link Tester#test(String, String)}.
 */
public class TestYouku {

    private static final String confFile = "conf/youku_conf.xml";
    // TV series: "The Patriot Yue Fei" (2013)
    private static final String tv = "http://www.youku.com/show_page/id_zd4edea60e0d011df97c0.html";
    // http://v.youku.com/v_show/id_XNTc4NzczNDM2.html
    // Movie: "God Bless America" (2012)
    private static final String movie = "http://www.youku.com/show_page/id_z0fca04b0bceb11e0bf93.html";
    // http://v.youku.com/v_show/id_XNjY4ODk3MjI4.html
    // Variety show: "Happy Camp"
    private static final String zy = "http://www.youku.com/show_page/id_zd18a7caa2d4311e29498.html";
    // Animation: "Detective Conan"
    private static final String dm = "http://www.youku.com/show_page/id_zcc003400962411de83b1.html";

    /** Extracts the TV-series show page. */
    @Test
    public void tv() {
        new Tester().test(confFile, tv);
    }

    /** Extracts the movie show page. */
    @Test
    public void movie() {
        new Tester().test(confFile, movie);
    }

    /** Extracts the variety-show page. */
    @Test
    public void zy() {
        new Tester().test(confFile, zy);
    }

    /**
     * Extracts an animation play page. Note that this uses its own config file
     * and a hard-coded URL rather than {@code confFile} and the {@code dm}
     * constant above.
     */
    @Test
    public void dm() {
        new Tester().test("conf/test_youku_dm.xml", "http://v.youku.com/v_show/id_XMzk1NjM1MjAw.html");
    }
}

--------------------------------------------------------------------------------
/src/test/java/org/sbs/extract/Tester.java:
--------------------------------------------------------------------------------
package org.sbs.extract;

import java.io.File;
import java.io.IOException;
import java.net.URL;
import java.util.Map;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.sbs.goodcrawler.bootstrap.foreman.FetchForeman;
import org.sbs.goodcrawler.exception.ConfigurationException;
import org.sbs.goodcrawler.exception.ExtractException;
import org.sbs.goodcrawler.jobconf.ExtractConfig;
import org.sbs.goodcrawler.jobconf.FetchConfig;

/**
 * Shared driver for the site-specific extraction smoke tests.
 */
public class Tester {

    /**
     * Loads a job configuration, starts a {@link FetchForeman} with its fetch
     * settings, downloads {@code url} and prints the extraction result.
     *
     * @param conFile path of the job configuration file, parsed with jsoup as UTF-8
     * @param url     page to download and run the extraction against
     * @throws RuntimeException wrapping the underlying {@link IOException},
     *         {@link ConfigurationException} or {@link ExtractException}, so
     *         that a failing extraction fails the calling test instead of only
     *         printing a stack trace
     */
    public void test(String conFile, String url) {
        ExtractConfig extractConfig = new ExtractConfig();
        FetchConfig fetchConfig = new FetchConfig();
        try {
            Document document = Jsoup.parse(new File(conFile), "utf-8");
            System.out.println(extractConfig.loadConfig(document).toString());
            FetchForeman fetchForeman = new FetchForeman();
            fetchForeman.start(fetchConfig.loadConfig(document));
            // Second argument of Jsoup.parse is the fetch timeout in milliseconds.
            Map<?, ?> r = extractConfig
                    .getContentSeprator(Jsoup.parse(new URL(url), 10000), url);
            System.out.println(r);
        } catch (IOException e) {
            throw failure(conFile, url, e);
        } catch (ConfigurationException e) {
            throw failure(conFile, url, e);
        } catch (ExtractException e) {
            throw failure(conFile, url, e);
        }
    }

    /** Builds the unchecked exception used to surface a failed run, keeping the cause. */
    private static RuntimeException failure(String conFile, String url, Exception cause) {
        return new RuntimeException(
                "Extraction test failed for url=" + url + " using config=" + conFile, cause);
    }

}

-------------------------------------------------------------------------------- /src/test/java/org/sbs/htmlunit/element/GcElementTest.java: -------------------------------------------------------------------------------- 1 | /** 2 | * @工程 goodcrawler 3 | * @文件 GcElementTest.java 4 | * @时间 2013年12月19日 下午4:58:46 5 | * @作者 shenbaise(shenbaise1001@126.com) 6 | * @描述 7 | */ 8 | package org.sbs.htmlunit.element; 9 | 10 | import org.junit.Test; 11 | import org.sbs.goodcrawler.extractor.GCElement; 12 | import org.sbs.goodcrawler.extractor.selector.AbstractElementCssSelector; 13 | import org.sbs.goodcrawler.extractor.selector.StringElementCssSelector; 14 | 15 | /** 16 | * @author shenbaise(shenbaise1001@126.com) 17 | * @desc 18 | */ 19 | public class GcElementTest { 20 | 21 | @Test 22 | public void insTest(){ 23 | StringElementCssSelector secsCssSelector = new StringElementCssSelector("", "", "", true, 0,""); 24 | System.out.println(secsCssSelector instanceof GCElement); 25 | assert secsCssSelector instanceof AbstractElementCssSelector; 26 | assert secsCssSelector instanceof GCElement; 27 | // assert secsCssSelector instanceof AbstractHtmlElement; 28 | } 29 | } 30 | -------------------------------------------------------------------------------- /src/test/java/org/sbs/htmlunit/element/package-info.java: -------------------------------------------------------------------------------- 1 | /** 2 | * @工程 goodcrawler 3 | * @文件 package-info.java 4 | * @时间 2013年12月19日 下午4:57:57 5 | * @作者 shenbaise(shenbaise1001@126.com) 6 | * @描述 7 | */ 8 | /** 9 | * @author shenbaise(shenbaise1001@126.com) 10 | * @desc 11 | */ 12 | package org.sbs.htmlunit.element; -------------------------------------------------------------------------------- /src/test/java/org/sbs/htmlunit/package-info.java: -------------------------------------------------------------------------------- 1 | /** 2 | * ########################## GoodCrawler ############################ 3 | * Licensed to the Apache Software 
Foundation (ASF) under one or more 4 | * contributor license agreements. See the NOTICE file distributed with 5 | * this work for additional information regarding copyright ownership. 6 | * The ASF licenses this file to You under the Apache License, Version 2.0 7 | * (the "License"); you may not use this file except in compliance with 8 | * the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | */ 18 | package org.sbs.htmlunit; -------------------------------------------------------------------------------- /start.bat: -------------------------------------------------------------------------------- 1 | echo off 2 | @set LOCALCLASSPATH=./target/classes/ 3 | @for %%i in (".\WebContent\WEB-INF\lib\*.jar") do call "setpath.bat" %%i 4 | 5 | 6 | set CLASSPATH=%LOCALCLASSPATH%;%CLASSPATH% 7 | echo on 8 | #java -Dxport="%1" -Xmx512m com.jetty.MyServer %2 %3 %4 9 | java -Xms512m -Xmx512m -Dxport=8080 -Xrunjdwp:transport=dt_socket,server=y,suspend=n,address=8787 org.sbs.jetty.StartServer --------------------------------------------------------------------------------