├── bin ├── buildapp.cmd ├── buildfengchao.cmd ├── buildapp.sh └── buildfengchao.sh ├── fengchao ├── uumai-common │ ├── crawler-xsoup │ │ ├── src │ │ │ └── main │ │ │ │ └── java │ │ │ │ ├── us │ │ │ │ └── codecraft │ │ │ │ │ └── xsoup │ │ │ │ │ ├── XElement.java │ │ │ │ │ ├── XElements.java │ │ │ │ │ ├── XPathEvaluator.java │ │ │ │ │ ├── Xsoup.java │ │ │ │ │ ├── DefaultXElement.java │ │ │ │ │ ├── XEvaluators.java │ │ │ │ │ ├── CombiningXElements.java │ │ │ │ │ ├── CombingXPathEvaluator.java │ │ │ │ │ ├── DefaultXPathEvaluator.java │ │ │ │ │ ├── DefaultXElements.java │ │ │ │ │ └── CombiningEvaluator.java │ │ │ │ └── com │ │ │ │ └── uumai │ │ │ │ ├── crawler │ │ │ │ └── selector │ │ │ │ │ ├── RegexResult.java │ │ │ │ │ ├── Selector.java │ │ │ │ │ ├── ElementSelector.java │ │ │ │ │ ├── ReplaceSelector.java │ │ │ │ │ ├── OrSelector.java │ │ │ │ │ ├── Selectors.java │ │ │ │ │ ├── BaseElementSelector.java │ │ │ │ │ ├── XpathSelector.java │ │ │ │ │ ├── Json.java │ │ │ │ │ ├── JsonPathSelector.java │ │ │ │ │ ├── AndSelector.java │ │ │ │ │ ├── PlainText.java │ │ │ │ │ ├── RegexSelector.java │ │ │ │ │ ├── CssSelector.java │ │ │ │ │ ├── SmartContentSelector.java │ │ │ │ │ └── AbstractSelectable.java │ │ │ │ └── crawler2 │ │ │ │ └── util │ │ │ │ └── htmlparse │ │ │ │ └── JsoupUtil.java │ │ └── pom.xml │ ├── crawler-util │ │ ├── src │ │ │ └── main │ │ │ │ ├── java │ │ │ │ └── com │ │ │ │ │ └── uumai │ │ │ │ │ └── crawer │ │ │ │ │ ├── constant │ │ │ │ │ └── UumaiConstant.java │ │ │ │ │ └── util │ │ │ │ │ ├── math │ │ │ │ │ └── Rondom.java │ │ │ │ │ ├── io │ │ │ │ │ ├── SerializeUtil.java │ │ │ │ │ └── HadoopSerializeUtil.java │ │ │ │ │ ├── UumaiTime.java │ │ │ │ │ ├── shell │ │ │ │ │ └── Shell.java │ │ │ │ │ ├── license │ │ │ │ │ ├── LicenseInfo.java │ │ │ │ │ └── LicenseValidateHelper.java │ │ │ │ │ ├── Java8Time.java │ │ │ │ │ ├── audio │ │ │ │ │ └── Player.java │ │ │ │ │ └── CookieUtil.java │ │ │ │ └── resources │ │ │ │ └── log4j.properties │ │ └── pom.xml │ ├── mongodb-client │ │ ├── src │ │ │ └── main │ │ │ │ ├── resources │ │ │ │ └── log4j.properties │ │ │ │ └── java │ │ │ │ └── com │ │ │ │ └── uumai │ │ │ │ ├── model │ │ │ │ └── BaseMongoPOJO.java │ │ │ │ ├── dao │ │ │ │ ├── BaseDao.java │ │ │ │ └── helper │ │ │ │ │ ├── DB2JsonHelper.java │ │ │ │ │ └── Json2DBHelper.java │ │ │ │ └── crawer │ │ │ │ └── util │ │ │ │ └── MongoUtil.java │ │ └── pom.xml │ ├── zookeeper-client │ │ ├── src │ │ │ └── main │ │ │ │ ├── resources │ │ │ │ └── log4j.properties │ │ │ │ └── java │ │ │ │ └── com │ │ │ │ └── uumai │ │ │ │ ├── zookeeper │ │ │ │ └── curator │ │ │ │ │ └── CuratorDao.java │ │ │ │ └── zookeeperclient │ │ │ │ ├── TestThread.java │ │ │ │ └── uitl │ │ │ │ └── ZookeeperFactory.java │ │ └── pom.xml │ ├── activemq-client │ │ └── pom.xml │ ├── kafka-client │ │ └── pom.xml │ ├── redis-client │ │ └── pom.xml │ └── pom.xml ├── uumai-core │ ├── crawler-quartz │ │ ├── src │ │ │ └── main │ │ │ │ ├── resources │ │ │ │ └── log4j.properties │ │ │ │ └── java │ │ │ │ └── com │ │ │ │ └── uumai │ │ │ │ └── crawer │ │ │ │ └── quartz │ │ │ │ ├── util │ │ │ │ ├── QuartzQueryItem.java │ │ │ │ └── UumaiExportExcel.java │ │ │ │ ├── result │ │ │ │ ├── QuartzResult.java │ │ │ │ ├── QuartzResultItem.java │ │ │ │ └── QuartzXpathItem.java │ │ │ │ ├── localdebug │ │ │ │ ├── QuartzLocalDebugAppMaster.java │ │ │ │ └── QuartzLocalDebugCrawlerWorker.java │ │ │ │ └── download │ │ │ │ └── selenium │ │ │ │ ├── SeleniumActions.java │ │ │ │ └── SeleniumActionBot.java │ │ └── pom.xml │ ├── crawler-core │ │ ├── src │ │ │ └── main │ │ │ │ ├── 
java │ │ │ │ └── com │ │ │ │ │ └── uumai │ │ │ │ │ └── crawer2 │ │ │ │ │ ├── download │ │ │ │ │ ├── selenium │ │ │ │ │ │ ├── test │ │ │ │ │ │ │ ├── TestSeleniumScriptBase.java │ │ │ │ │ │ │ ├── TestCustomerScriptDownload.java │ │ │ │ │ │ │ ├── TestHtmlUnitDriver.java │ │ │ │ │ │ │ ├── TestSelenium.java │ │ │ │ │ │ │ └── TestPhantomJSDriver.java │ │ │ │ │ │ ├── SeleniumScriptBase.java │ │ │ │ │ │ ├── UumaiSeleniumWebDriver.java │ │ │ │ │ │ └── WebDriverFactory.java │ │ │ │ │ ├── mediadownload │ │ │ │ │ │ └── MediaHttpDownload.java │ │ │ │ │ ├── emptymock │ │ │ │ │ │ └── EmptyMockDownload.java │ │ │ │ │ ├── Download.java │ │ │ │ │ ├── filedownload │ │ │ │ │ │ ├── FileDownload.java │ │ │ │ │ │ └── TestFileHttpDownload.java │ │ │ │ │ ├── CrawlerProxy.java │ │ │ │ │ ├── httpclient │ │ │ │ │ │ ├── IdleConnectionMonitorThread.java │ │ │ │ │ │ └── HttpConnectionManager.java │ │ │ │ │ ├── DownloadFactory.java │ │ │ │ │ ├── shelldownload │ │ │ │ │ │ └── ShellDownload.java │ │ │ │ │ └── remoteshelldownload │ │ │ │ │ │ └── RemoveShellDownload.java │ │ │ │ │ ├── localdebug │ │ │ │ │ ├── LocalDebugAppMaster.java │ │ │ │ │ └── LocalDebugCrawlerWorker.java │ │ │ │ │ ├── CrawlerWorker.java │ │ │ │ │ └── CookieManager │ │ │ │ │ ├── CrawlerCookie.java │ │ │ │ │ └── httpdownload │ │ │ │ │ └── HttpDownloadCookitHelper.java │ │ │ │ └── resources │ │ │ │ └── log4j.xml │ │ └── pom.xml │ └── pom.xml ├── uumai-distribute-sys │ ├── uumai-storm │ │ ├── pom.xml │ │ └── topology-core │ │ │ └── pom.xml │ ├── pom.xml │ ├── uumai-mapreduce │ │ └── pom.xml │ └── uumai-yarn │ │ └── pom.xml ├── pom.xml └── uumai-multiserver │ ├── pom.xml │ ├── crawler-multi-core │ └── pom.xml │ ├── uumai-rpcserver │ └── pom.xml │ ├── crawler-multi-quartz │ └── pom.xml │ ├── java-bridge │ └── pom.xml │ └── uumai-thrift │ └── pom.xml ├── crawler-website └── crawler-example │ ├── src │ └── main │ │ ├── java │ │ └── com │ │ │ └── uumai │ │ │ └── crawer │ │ │ └── quartz │ │ │ ├── core │ │ │ ├── test │ │ │ │ ├── JsonTest.java │ │ │ │ └── TestApp.java │ │ │ └── cookies │ │ │ │ └── CookieConstant.java │ │ │ ├── gupiao │ │ │ ├── baidu │ │ │ │ ├── BaiduSearch.java │ │ │ │ └── BaiduNewsSearch.java │ │ │ ├── qq │ │ │ │ ├── Category.java │ │ │ │ └── SockIntro.java │ │ │ └── sinastock │ │ │ │ └── SinaJSStockTakser.java │ │ │ ├── jiudian │ │ │ └── ctrip │ │ │ │ ├── CityListTasker.java │ │ │ │ └── JiudianTasker.java │ │ │ ├── jipiao │ │ │ ├── chunqiu │ │ │ │ └── ChunqiuTiejia.java │ │ │ └── ctrip │ │ │ │ ├── JipiaoListTasker.java │ │ │ │ ├── JipiaoInternListTasker.java │ │ │ │ └── JipiaoTasker.java │ │ │ └── jobs │ │ │ ├── lagou │ │ │ ├── SearchJobTasker.java │ │ │ ├── JobTasker.java │ │ │ └── GongsiTasker.java │ │ │ └── fiveonejob │ │ │ ├── JobDetailTasker.java │ │ │ └── FiveonejobTasker.java │ │ └── resources │ │ └── log4j.xml │ ├── pom.xml │ ├── deploy │ └── resources │ │ └── amazon.cookie.txt │ └── pom-withdependencies.xml ├── .gitignore └── README.md /bin/buildapp.cmd: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /bin/buildfengchao.cmd: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /fengchao/uumai-common/crawler-xsoup/src/main/java/us/codecraft/xsoup/XElement.java: -------------------------------------------------------------------------------- 1 | package us.codecraft.xsoup; 2 | 3 | import org.jsoup.nodes.Element; 
4 | 5 | /** 6 | * @author code4crafter@gmail.com 7 | */ 8 | public interface XElement { 9 | 10 | String get(); 11 | 12 | Element getElement(); 13 | } 14 | -------------------------------------------------------------------------------- /fengchao/uumai-common/crawler-util/src/main/java/com/uumai/crawer/constant/UumaiConstant.java: -------------------------------------------------------------------------------- 1 | package com.uumai.crawer.constant; 2 | 3 | /** 4 | * Created by kanxg on 14-12-17. 5 | */ 6 | public class UumaiConstant { 7 | public static String[] goagent=new String[]{"127.0.0.1","8088"}; 8 | 9 | 10 | 11 | 12 | } 13 | -------------------------------------------------------------------------------- /fengchao/uumai-common/crawler-xsoup/src/main/java/us/codecraft/xsoup/XElements.java: -------------------------------------------------------------------------------- 1 | package us.codecraft.xsoup; 2 | 3 | import org.jsoup.select.Elements; 4 | 5 | import java.util.List; 6 | 7 | /** 8 | * @author code4crafter@gmail.com 9 | */ 10 | public interface XElements { 11 | 12 | String get(); 13 | 14 | List list(); 15 | 16 | Elements getElements(); 17 | } 18 | -------------------------------------------------------------------------------- /fengchao/uumai-common/crawler-util/src/main/resources/log4j.properties: -------------------------------------------------------------------------------- 1 | # Set root logger level to DEBUG and its only appender to A1. 2 | log4j.rootLogger=ERROR, A1 3 | 4 | # configure A1 to spit out data in console 5 | log4j.appender.A1=org.apache.log4j.ConsoleAppender 6 | log4j.appender.A1.layout=org.apache.log4j.PatternLayout 7 | log4j.appender.A1.layout.ConversionPattern=%d [%t] %-5p %c - %m%n -------------------------------------------------------------------------------- /fengchao/uumai-core/crawler-quartz/src/main/resources/log4j.properties: -------------------------------------------------------------------------------- 1 | # Set root logger level to DEBUG and its only appender to A1. 2 | log4j.rootLogger=ERROR, A1 3 | 4 | # configure A1 to spit out data in console 5 | log4j.appender.A1=org.apache.log4j.ConsoleAppender 6 | log4j.appender.A1.layout=org.apache.log4j.PatternLayout 7 | log4j.appender.A1.layout.ConversionPattern=%d [%t] %-5p %c - %m%n -------------------------------------------------------------------------------- /fengchao/uumai-common/mongodb-client/src/main/resources/log4j.properties: -------------------------------------------------------------------------------- 1 | # Set root logger level to DEBUG and its only appender to A1. 2 | log4j.rootLogger=ERROR, A1 3 | 4 | # configure A1 to spit out data in console 5 | log4j.appender.A1=org.apache.log4j.ConsoleAppender 6 | log4j.appender.A1.layout=org.apache.log4j.PatternLayout 7 | log4j.appender.A1.layout.ConversionPattern=%d [%t] %-5p %c - %m%n -------------------------------------------------------------------------------- /fengchao/uumai-common/zookeeper-client/src/main/resources/log4j.properties: -------------------------------------------------------------------------------- 1 | # Set root logger level to DEBUG and its only appender to A1. 
2 | log4j.rootLogger=ERROR, A1 3 | 4 | # configure A1 to spit out data in console 5 | log4j.appender.A1=org.apache.log4j.ConsoleAppender 6 | log4j.appender.A1.layout=org.apache.log4j.PatternLayout 7 | log4j.appender.A1.layout.ConversionPattern=%d [%t] %-5p %c - %m%n -------------------------------------------------------------------------------- /fengchao/uumai-common/crawler-xsoup/src/main/java/us/codecraft/xsoup/XPathEvaluator.java: -------------------------------------------------------------------------------- 1 | package us.codecraft.xsoup; 2 | 3 | import org.jsoup.nodes.Element; 4 | 5 | /** 6 | * @author code4crafter@gmail.com 7 | */ 8 | public interface XPathEvaluator { 9 | 10 | XElements evaluate(Element element); 11 | 12 | XElements evaluate(String html); 13 | 14 | boolean hasAttribute(); 15 | 16 | } 17 | -------------------------------------------------------------------------------- /fengchao/uumai-core/crawler-core/src/main/java/com/uumai/crawer2/download/selenium/test/TestSeleniumScriptBase.java: -------------------------------------------------------------------------------- 1 | package com.uumai.crawer2.download.selenium.test; 2 | 3 | import com.uumai.crawer2.download.selenium.SeleniumScriptBase; 4 | 5 | /** 6 | * Created by rock on 12/9/15. 7 | */ 8 | public class TestSeleniumScriptBase extends SeleniumScriptBase { 9 | 10 | public void doaction(){ 11 | driver.get("http://item.jd.com/1644261435.html"); 12 | } 13 | } 14 | -------------------------------------------------------------------------------- /fengchao/uumai-core/crawler-core/src/main/java/com/uumai/crawer2/download/mediadownload/MediaHttpDownload.java: -------------------------------------------------------------------------------- 1 | package com.uumai.crawer2.download.mediadownload; 2 | 3 | import com.uumai.crawer2.CrawlerResult; 4 | import com.uumai.crawer2.CrawlerTasker; 5 | import com.uumai.crawer2.download.Download; 6 | 7 | /** 8 | * Created by rock on 7/4/16. 
9 | */ 10 | public class MediaHttpDownload implements Download { 11 | @Override 12 | public CrawlerResult download(CrawlerTasker tasker) throws Exception { 13 | return null; 14 | } 15 | } 16 | -------------------------------------------------------------------------------- /crawler-website/crawler-example/src/main/java/com/uumai/crawer/quartz/core/test/JsonTest.java: -------------------------------------------------------------------------------- 1 | package com.uumai.crawer.quartz.core.test; 2 | 3 | import java.io.File; 4 | import java.io.IOException; 5 | 6 | import com.jayway.jsonpath.JsonPath; 7 | 8 | public class JsonTest { 9 | 10 | public static void main(String[] args) throws IOException { 11 | File text=new File("/tmp/jd.json"); 12 | 13 | Object value=JsonPath.read(text, "jingdong_service_promotion_getcode_responce.queryjs_result.resultCode"); 14 | 15 | System.out.println(value); 16 | 17 | } 18 | 19 | } 20 | -------------------------------------------------------------------------------- /fengchao/uumai-common/mongodb-client/src/main/java/com/uumai/model/BaseMongoPOJO.java: -------------------------------------------------------------------------------- 1 | package com.uumai.model; 2 | 3 | import java.util.Date; 4 | 5 | public class BaseMongoPOJO 6 | { 7 | public String version; 8 | public Date createtime; 9 | 10 | public Date getCreatetime() 11 | { 12 | return this.createtime; 13 | } 14 | 15 | public void setCreatetime(Date createtime) { 16 | this.createtime = createtime; 17 | } 18 | 19 | public String getVersion() { 20 | return this.version; 21 | } 22 | 23 | public void setVersion(String version) { 24 | this.version = version; 25 | } 26 | } -------------------------------------------------------------------------------- /fengchao/uumai-core/crawler-quartz/src/main/java/com/uumai/crawer/quartz/util/QuartzQueryItem.java: -------------------------------------------------------------------------------- 1 | package com.uumai.crawer.quartz.util; 2 | 3 | public class QuartzQueryItem { 4 | private String name; 5 | private Object value; 6 | public String getName() { 7 | return name; 8 | } 9 | public QuartzQueryItem(String name,Object value){ 10 | this.name=name; 11 | this.value=value; 12 | 13 | } 14 | public void setName(String name) { 15 | this.name = name; 16 | } 17 | public Object getValue() { 18 | return value; 19 | } 20 | public void setValue(String value) { 21 | this.value = value; 22 | } 23 | } 24 | -------------------------------------------------------------------------------- /fengchao/uumai-core/crawler-core/src/main/java/com/uumai/crawer2/download/selenium/SeleniumScriptBase.java: -------------------------------------------------------------------------------- 1 | package com.uumai.crawer2.download.selenium; 2 | 3 | import java.io.Serializable; 4 | 5 | /** 6 | * Created by rock on 12/9/15. 
7 | */ 8 | public class SeleniumScriptBase implements Serializable { 9 | 10 | protected UumaiSeleniumWebDriver driver; 11 | 12 | public void doaction(){ 13 | 14 | } 15 | 16 | public UumaiSeleniumWebDriver getDriver() { 17 | return driver; 18 | } 19 | 20 | public void setDriver(UumaiSeleniumWebDriver driver) { 21 | this.driver = driver; 22 | } 23 | } 24 | -------------------------------------------------------------------------------- /fengchao/uumai-core/crawler-core/src/main/java/com/uumai/crawer2/download/emptymock/EmptyMockDownload.java: -------------------------------------------------------------------------------- 1 | package com.uumai.crawer2.download.emptymock; 2 | 3 | import com.uumai.crawer2.CrawlerResult; 4 | import com.uumai.crawer2.CrawlerTasker; 5 | import com.uumai.crawer2.download.Download; 6 | 7 | /** 8 | * Created by rock on 9/2/15. 9 | */ 10 | public class EmptyMockDownload implements Download { 11 | @Override 12 | public CrawlerResult download(CrawlerTasker tasker) throws Exception { 13 | CrawlerResult result=new CrawlerResult(); 14 | result.setRawText(tasker.getUrl()); 15 | result.setReturncode(200); 16 | return result; 17 | } 18 | } 19 | -------------------------------------------------------------------------------- /fengchao/uumai-core/crawler-core/src/main/java/com/uumai/crawer2/download/Download.java: -------------------------------------------------------------------------------- 1 | package com.uumai.crawer2.download; 2 | 3 | import com.uumai.crawer2.CrawlerResult; 4 | import com.uumai.crawer2.CrawlerTasker; 5 | 6 | /** 7 | * Created by kanxg on 14-12-22. 8 | */ 9 | public interface Download { 10 | 11 | 12 | public enum DownloadType{ 13 | java_download, httpclient_download, 14 | firefox_download, chrome_download, htmlunit_download,phantomjs_download, 15 | emptymockdown, 16 | openscript_download, 17 | file_download, 18 | shell_download 19 | } 20 | public CrawlerResult download(CrawlerTasker tasker) throws Exception ; 21 | 22 | } 23 | -------------------------------------------------------------------------------- /fengchao/uumai-common/crawler-xsoup/src/main/java/us/codecraft/xsoup/Xsoup.java: -------------------------------------------------------------------------------- 1 | package us.codecraft.xsoup; 2 | 3 | import org.jsoup.Jsoup; 4 | import org.jsoup.nodes.Element; 5 | 6 | /** 7 | * @author code4crafter@gmail.com 8 | */ 9 | public class Xsoup { 10 | 11 | public static XElements select(Element element, String xpathStr) { 12 | return XPathParser.parse(xpathStr).evaluate(element); 13 | } 14 | 15 | public static XElements select(String html, String xpathStr) { 16 | return XPathParser.parse(xpathStr).evaluate(Jsoup.parse(html)); 17 | } 18 | 19 | public static XPathEvaluator compile(String xpathStr) { 20 | return XPathParser.parse(xpathStr); 21 | } 22 | 23 | } 24 | -------------------------------------------------------------------------------- /fengchao/uumai-common/crawler-xsoup/src/main/java/com/uumai/crawler/selector/RegexResult.java: -------------------------------------------------------------------------------- 1 | package com.uumai.crawler.selector; 2 | 3 | /** 4 | * Object contains regex results.
5 | * Intended to be extended for multi-group results.
6 | * 7 | * @author
8 | * @since 0.1.0 9 | */ 10 | class RegexResult { 11 | 12 | private String[] groups; 13 | 14 | public static final RegexResult EMPTY_RESULT = new RegexResult(); 15 | 16 | public RegexResult() { 17 | 18 | } 19 | 20 | public RegexResult(String[] groups) { 21 | this.groups = groups; 22 | } 23 | 24 | public String get(int groupId) { 25 | if (groups == null) { 26 | return null; 27 | } 28 | return groups[groupId]; 29 | } 30 | 31 | } 32 | -------------------------------------------------------------------------------- /fengchao/uumai-common/crawler-xsoup/src/main/java/com/uumai/crawler/selector/Selector.java: -------------------------------------------------------------------------------- 1 | package com.uumai.crawler.selector; 2 | 3 | import java.util.List; 4 | 5 | /** 6 | * Selector(extractor) for text.
7 | * 8 | * @author
9 | * @since 0.1.0 10 | */ 11 | public interface Selector { 12 | 13 | /** 14 | * Extract single result in text.
15 | * If there is more than one result, only the first will be chosen. 16 | * 17 | * @param text 18 | * @return result 19 | */ 20 | public String select(String text); 21 | 22 | /** 23 | * Extract all results in text.
24 | * 25 | * @param text 26 | * @return results 27 | */ 28 | public List selectList(String text); 29 | 30 | } 31 | -------------------------------------------------------------------------------- /fengchao/uumai-core/crawler-quartz/src/main/java/com/uumai/crawer/quartz/result/QuartzResult.java: -------------------------------------------------------------------------------- 1 | package com.uumai.crawer.quartz.result; 2 | 3 | import java.util.ArrayList; 4 | import java.util.List; 5 | 6 | public class QuartzResult { 7 | 8 | private List itemlist=new ArrayList(); 9 | 10 | public List getItemlist() { 11 | return itemlist; 12 | } 13 | 14 | public void setItemlist(List itemlist) { 15 | this.itemlist = itemlist; 16 | } 17 | 18 | 19 | public QuartzResultItem getItem(String name){ 20 | for(QuartzResultItem item:itemlist){ 21 | if(item.getName().equals(name)) 22 | return item; 23 | } 24 | return new QuartzResultItem(name,null); 25 | } 26 | 27 | } 28 | -------------------------------------------------------------------------------- /fengchao/uumai-core/crawler-quartz/src/main/java/com/uumai/crawer/quartz/localdebug/QuartzLocalDebugAppMaster.java: -------------------------------------------------------------------------------- 1 | package com.uumai.crawer.quartz.localdebug; 2 | 3 | import com.uumai.crawer2.CrawlerTasker; 4 | import com.uumai.crawer2.localdebug.LocalDebugAppMaster; 5 | 6 | public class QuartzLocalDebugAppMaster extends LocalDebugAppMaster{ 7 | 8 | 9 | public void putDistributeTask(CrawlerTasker crawlerTasker) throws Exception { 10 | QuartzLocalDebugCrawlerWorker localDebugCrawlerWorker= new QuartzLocalDebugCrawlerWorker(crawlerTasker); 11 | localDebugCrawlerWorker.download(); 12 | localDebugCrawlerWorker.pipeline(); 13 | } 14 | 15 | // @Override 16 | // public void dobusiness() throws Exception { 17 | // super.dobusiness(); 18 | // } 19 | 20 | } 21 | -------------------------------------------------------------------------------- /fengchao/uumai-core/crawler-core/src/main/resources/log4j.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | -------------------------------------------------------------------------------- /fengchao/uumai-core/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | com.uumai 5 | uumai-fengchao-parent 6 | 1.0 7 | 8 | 9 | 10 | 4.0.0 11 | 12 | com.uumai 13 | uumai-core 14 | 1.0 15 | pom 16 | 17 | 18 | 19 | crawler-core 20 | crawler-quartz 21 | 22 | 23 | 24 | 25 | 26 | 27 | -------------------------------------------------------------------------------- /fengchao/uumai-common/crawler-xsoup/src/main/java/com/uumai/crawler/selector/ElementSelector.java: -------------------------------------------------------------------------------- 1 | package com.uumai.crawler.selector; 2 | 3 | import org.jsoup.nodes.Element; 4 | 5 | import java.util.List; 6 | 7 | /** 8 | * Selector(extractor) for html elements.
9 | * 10 | * @author
11 | * @since 0.3.0 12 | */ 13 | public interface ElementSelector { 14 | 15 | /** 16 | * Extract single result in text.
17 | * If there is more than one result, only the first will be chosen. 18 | * 19 | * @param element 20 | * @return result 21 | */ 22 | public String select(Element element); 23 | 24 | /** 25 | * Extract all results in text.
26 | * 27 | * @param element 28 | * @return results 29 | */ 30 | public List selectList(Element element); 31 | 32 | } 33 | -------------------------------------------------------------------------------- /fengchao/uumai-distribute-sys/uumai-storm/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | com.uumai 5 | uumai-distribute-sys 6 | 1.0 7 | 8 | 9 | 10 | com.uumai 11 | 1.0 12 | 4.0.0 13 | pom 14 | 15 | uumai-storm 16 | uumai-storm 17 | 18 | 19 | 20 | topology-core 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | -------------------------------------------------------------------------------- /fengchao/uumai-core/crawler-core/src/main/java/com/uumai/crawer2/download/filedownload/FileDownload.java: -------------------------------------------------------------------------------- 1 | package com.uumai.crawer2.download.filedownload; 2 | 3 | import com.uumai.crawer.util.filesystem.UumaiFileUtil; 4 | import com.uumai.crawer2.CrawlerResult; 5 | import com.uumai.crawer2.CrawlerTasker; 6 | import com.uumai.crawer2.download.Download; 7 | 8 | /** 9 | * Created by rock on 11/16/15. 10 | */ 11 | public class FileDownload implements Download { 12 | @Override 13 | public CrawlerResult download(CrawlerTasker tasker) throws Exception { 14 | UumaiFileUtil uumaiFileUtil=new UumaiFileUtil(); 15 | 16 | CrawlerResult result=new CrawlerResult(); 17 | result.setRawText(uumaiFileUtil.readfromcache(tasker.getSavefilename())); 18 | result.setReturncode(200); 19 | 20 | return result; 21 | // return null; 22 | } 23 | } 24 | -------------------------------------------------------------------------------- /bin/buildapp.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #source ./uumai_config.sh 3 | 4 | 5 | echo "start to build..." 6 | 7 | cd ./crawler-website/crawler-$1 8 | mvn clean install 9 | 10 | cd ../.. 11 | 12 | cp -r ./crawler-website/crawler-$1/target/crawler-*-1.0.jar ./build/fengchao/libs/ 13 | #cp -r ./crawler-website/crawler-$1/libs/* ./build/fengchao/libs/ 14 | 15 | echo "build finished!" 16 | 17 | 18 | :< /dev/null 2>&1 << eeooff 30 | cd ${arr[1]} 31 | ./sbin/syncjar.sh ./libs/crawler-$1-1.0.jar libs/ 32 | exit 33 | eeooff 34 | 35 | done < ./conf/masters 36 | BLOCK 37 | 38 | echo "finish deploy!" 
-------------------------------------------------------------------------------- /fengchao/uumai-distribute-sys/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | com.uumai 5 | uumai-fengchao-parent 6 | 1.0 7 | 8 | 9 | 10 | 4.0.0 11 | 12 | com.uumai 13 | uumai-distribute-sys 14 | 1.0 15 | pom 16 | 17 | 18 | uumai-mapreduce 19 | uumai-storm 20 | uumai-yarn 21 | 22 | 23 | 24 | 25 | 26 | 27 | -------------------------------------------------------------------------------- /fengchao/uumai-common/activemq-client/pom.xml: -------------------------------------------------------------------------------- 1 | 3 | 4 | com.uumai 5 | uumai-common 6 | 1.0 7 | 8 | 9 | 4.0.0 10 | 11 | com.uumai 12 | uumai-activemq-client 13 | 1.0 14 | jar 15 | 16 | 17 | 18 | 19 | 20 | 21 | com.uumai 22 | uumai-crawler-util 23 | 1.0 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | -------------------------------------------------------------------------------- /fengchao/uumai-common/kafka-client/pom.xml: -------------------------------------------------------------------------------- 1 | 3 | 4 | com.uumai 5 | uumai-common 6 | 1.0 7 | 8 | 9 | 4.0.0 10 | 11 | com.uumai 12 | uumai-kafka-client 13 | 1.0 14 | jar 15 | 16 | 17 | 18 | 19 | 20 | 21 | com.uumai 22 | uumai-crawler-util 23 | 1.0 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | -------------------------------------------------------------------------------- /fengchao/uumai-core/crawler-quartz/src/main/java/com/uumai/crawer/quartz/result/QuartzResultItem.java: -------------------------------------------------------------------------------- 1 | package com.uumai.crawer.quartz.result; 2 | 3 | import java.io.Serializable; 4 | 5 | public class QuartzResultItem implements Serializable { 6 | /** 7 | * 8 | */ 9 | private static final long serialVersionUID = 1L; 10 | private String name; 11 | private Object value; 12 | 13 | public QuartzResultItem(){ 14 | 15 | } 16 | 17 | public QuartzResultItem(String name,String value){ 18 | this.name=name; 19 | this.value=value; 20 | 21 | } 22 | 23 | public String getName() { 24 | return name; 25 | } 26 | public void setName(String name) { 27 | this.name = name; 28 | } 29 | public Object getValue() { 30 | if(value==null){ 31 | return ""; 32 | } 33 | return value; 34 | } 35 | public void setValue(Object value) { 36 | this.value = value; 37 | } 38 | 39 | 40 | } 41 | -------------------------------------------------------------------------------- /fengchao/uumai-common/mongodb-client/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 4 | 5 | com.uumai 6 | uumai-common 7 | 1.0 8 | 9 | 10 | 4.0.0 11 | 12 | com.uumai 13 | uumai-mongo-client 14 | jar 15 | 1.0 16 | 17 | 18 | 19 | 20 | com.uumai 21 | uumai-crawler-util 22 | 1.0 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | -------------------------------------------------------------------------------- /fengchao/uumai-core/crawler-quartz/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | com.uumai 5 | uumai-core 6 | 1.0 7 | 8 | 9 | 10 | 4.0.0 11 | 12 | 13 | com.uumai 14 | uumai-crawler-quartz 15 | 1.0 16 | 17 | 18 | 19 | 20 | 21 | com.uumai 22 | uumai-crawler-core 23 | 1.0 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | -------------------------------------------------------------------------------- /fengchao/uumai-common/zookeeper-client/pom.xml: 
-------------------------------------------------------------------------------- 1 | 2 | 4 | 5 | com.uumai 6 | uumai-common 7 | 1.0 8 | 9 | 10 | 11 | 4.0.0 12 | 13 | 14 | 15 | com.uumai 16 | uumai-zookeeper-client 17 | jar 18 | 1.0 19 | 20 | 21 | 22 | 23 | com.uumai 24 | common-libs 25 | 1.0 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | -------------------------------------------------------------------------------- /crawler-website/crawler-example/src/main/resources/log4j.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | -------------------------------------------------------------------------------- /fengchao/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | com.uumai 5 | 1.0 6 | 4.0.0 7 | pom 8 | 9 | UTF-8 10 | UTF-8 11 | 12 | 13 | uumai-fengchao-parent 14 | 15 | 16 | 17 | 18 | uumai-common 19 | 20 | uumai-core 21 | 22 | uumai-multiserver 23 | 24 | uumai-distribute-sys 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | -------------------------------------------------------------------------------- /fengchao/uumai-common/crawler-xsoup/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 4 | 5 | 6 | 7 | com.uumai 8 | uumai-common 9 | 1.0 10 | 11 | 12 | 13 | 4.0.0 14 | 15 | 16 | 17 | com.uumai 18 | uumai-crawler-xsoup 19 | 1.0 20 | jar 21 | 22 | 23 | 24 | 25 | com.uumai 26 | common-libs 27 | 1.0 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | -------------------------------------------------------------------------------- /fengchao/uumai-multiserver/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | com.uumai 5 | uumai-fengchao-parent 6 | 1.0 7 | 8 | 9 | 10 | 4.0.0 11 | 12 | com.uumai 13 | uumai-multiserver 14 | 1.0 15 | pom 16 | 17 | 18 | 19 | crawler-multi-core 20 | crawler-multi-quartz 21 | 22 | 23 | uumai-rpcserver 24 | uumai-thrift 25 | java-bridge 26 | 27 | 28 | 29 | 30 | 31 | 32 | -------------------------------------------------------------------------------- /fengchao/uumai-multiserver/crawler-multi-core/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | com.uumai 5 | uumai-multiserver 6 | 1.0 7 | 8 | 9 | 10 | 4.0.0 11 | 12 | com.uumai 13 | uumai-crawler-multi-core 14 | jar 15 | 1.0 16 | 17 | 18 | 19 | 20 | 21 | com.uumai 22 | uumai-crawler-core 23 | 1.0 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | 2 | #忽略全部*.class字节码文件 3 | *.class 4 | 5 | #针对移动端开发忽略.mtj.tmp/目录下所有文件 6 | .mtj.tmp/ 7 | 8 | #忽略各类打包文件 9 | #此处列举*.jar/*.war/*.ear/*.zip四类打包文件 10 | *.jar 11 | *.war 12 | *.ear 13 | *.zip 14 | 15 | #增加忽略两类压缩文件 16 | *.tar.gz 17 | *.tar 18 | 19 | #忽略target/目录下所有文件 20 | target/ 21 | #忽略subDir/target/目录下所有文件 22 | #因为maven工程有parent和children之分 23 | **/target/ 24 | 25 | #忽略virtual machine crash logs文件 26 | hs_err_pid* 27 | 28 | #忽略日志文件 29 | #日志文件也可能是*.tar.gz或*.tar压缩处理过的 30 | *.log 31 | *.tar.gz.log 32 | *.tar.log 33 | 34 | #忽略掉临时文件 35 | *.bak 36 | 37 | #忽略eclipse项目描述文件 38 | *.classpath 39 | *.project 40 | *.prefs 41 | **/.settings/ 42 | 43 | #忽略svn文件 44 | #忽略.svn/目录下全部文件,但是不包括subDir/.svn/目录下文件 45 | .svn/ 46 | #忽略subDir目录下所有带.svn/目录的文件 47 | **/.svn/ 
48 | 49 | #igore idea project file 50 | *.iml 51 | *.ipr 52 | *.iws 53 | 54 | 55 | #uumai config 56 | 57 | #ignore example libs build 58 | fengchao/uumai-common/common-libs/libs 59 | yiqun/uumai-search/search-libs/libs 60 | 61 | #temp remove rpc-neety project ,neety demo code 62 | #shop_indexer/rpc-netty/src/main/java/io 63 | -------------------------------------------------------------------------------- /fengchao/uumai-common/redis-client/pom.xml: -------------------------------------------------------------------------------- 1 | 3 | 4 | com.uumai 5 | uumai-common 6 | 1.0 7 | 8 | 9 | 10 | 4.0.0 11 | 12 | com.uumai 13 | uumai-redis-client 14 | 1.0 15 | jar 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | com.uumai 24 | common-libs 25 | 1.0 26 | 27 | 28 | 29 | 30 | com.uumai 31 | uumai-crawler-util 32 | 1.0 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | -------------------------------------------------------------------------------- /fengchao/uumai-common/zookeeper-client/src/main/java/com/uumai/zookeeper/curator/CuratorDao.java: -------------------------------------------------------------------------------- 1 | package com.uumai.zookeeper.curator; 2 | 3 | import org.apache.curator.framework.CuratorFramework; 4 | import org.apache.curator.framework.CuratorFrameworkFactory; 5 | import org.apache.curator.retry.ExponentialBackoffRetry; 6 | 7 | /** 8 | * Created by rock on 2/8/16. 9 | */ 10 | public class CuratorDao { 11 | private CuratorFramework client=null; 12 | public void start(){ 13 | CuratorFrameworkFactory.Builder builder = CuratorFrameworkFactory.builder(); 14 | 15 | client = builder.connectString("192.168.11.56:2180") 16 | .sessionTimeoutMs(30000) 17 | .connectionTimeoutMs(30000) 18 | .canBeReadOnly(false) 19 | .retryPolicy(new ExponentialBackoffRetry(1000, Integer.MAX_VALUE)) 20 | .namespace("test") 21 | .defaultData(null) 22 | .build(); 23 | client.start(); 24 | } 25 | 26 | public void close(){ 27 | client.close(); 28 | } 29 | } 30 | -------------------------------------------------------------------------------- /fengchao/uumai-common/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | com.uumai 6 | uumai-fengchao-parent 7 | 1.0 8 | 9 | 10 | 4.0.0 11 | 12 | com.uumai 13 | uumai-common 14 | 1.0 15 | pom 16 | 17 | 18 | 19 | common-libs 20 | crawler-xsoup 21 | crawler-util 22 | 23 | zookeeper-client 24 | redis-client 25 | activemq-client 26 | kafka-client 27 | mongodb-client 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | -------------------------------------------------------------------------------- /fengchao/uumai-core/crawler-core/src/main/java/com/uumai/crawer2/download/filedownload/TestFileHttpDownload.java: -------------------------------------------------------------------------------- 1 | package com.uumai.crawer2.download.filedownload; 2 | 3 | import java.net.InetSocketAddress; 4 | import java.net.Proxy; 5 | 6 | /** 7 | * Created by rock on 5/11/16. 
8 | */ 9 | public class TestFileHttpDownload { 10 | 11 | public static void main(String[] a){ 12 | FileHttpDownload fileHttpDownload=new FileHttpDownload(); 13 | //set pool size , default is 10 14 | //set cookie , get from Openscript 15 | //set proxy if in office 16 | //FileHttpDownload fileHttpDownload=new FileHttpDownload(100,null,(new Proxy(Proxy.Type.HTTP, new InetSocketAddress("cn-proxy.jp.oracle.com", 80)))); 17 | 18 | //loop to send many taksers 19 | fileHttpDownload.sendtasker("http://www.linkedin.com/directory/country_listing/","/home/rock/uumai/a.html"); 20 | fileHttpDownload.sendtasker("https://sa.linkedin.com/","/home/rock/uumai/b.html"); 21 | //stop pool, will wait until all tasker finished 22 | fileHttpDownload.stoppool(); 23 | } 24 | } 25 | -------------------------------------------------------------------------------- /fengchao/uumai-common/crawler-util/src/main/java/com/uumai/crawer/util/math/Rondom.java: -------------------------------------------------------------------------------- 1 | package com.uumai.crawer.util.math; 2 | 3 | import java.util.Random; 4 | 5 | /** 6 | * Created by rock on 6/14/16. 7 | */ 8 | public class Rondom { 9 | 10 | public static String genRandomNum(int card_len){ 11 | //35是因为数组是从0开始的,26个字母+10个数字 12 | final int maxNum = 36; 13 | int i; //生成的随机数 14 | int count = 0; //生成的密码的长度 15 | char[] str = { '0', '1', '2', '3', '4', '5', '6', '7', '8', '9' }; 16 | StringBuffer pwd = new StringBuffer(""); 17 | Random r = new Random(); 18 | while(count < card_len){ 19 | //生成随机数,取绝对值,防止生成负数 20 | i = Math.abs(r.nextInt(maxNum)); //生成的数最大为36-1 21 | if (i >= 0 && i < str.length) { 22 | pwd.append(str[i]); 23 | count ++; 24 | } 25 | } 26 | return pwd.toString(); 27 | } 28 | public static void main(String[] args) { 29 | for(int l=0;l<100;l++){ 30 | System.out.println(genRandomNum(6));//生成6位的随机数 31 | } 32 | } 33 | } -------------------------------------------------------------------------------- /fengchao/uumai-multiserver/uumai-rpcserver/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | com.uumai 5 | uumai-multiserver 6 | 1.0 7 | 8 | 9 | 10 | 4.0.0 11 | 12 | com.uumai 13 | uumai-rpcserver 14 | jar 15 | 1.0 16 | 17 | 18 | 19 | com.uumai 20 | uumai-crawler-util 21 | 1.0 22 | 23 | 24 | 25 | com.uumai 26 | uumai-crawler-multi-core 27 | 1.0 28 | 29 | 30 | 31 | 32 | 33 | 34 | -------------------------------------------------------------------------------- /fengchao/uumai-common/crawler-xsoup/src/main/java/us/codecraft/xsoup/DefaultXElement.java: -------------------------------------------------------------------------------- 1 | package us.codecraft.xsoup; 2 | 3 | import org.jsoup.nodes.Element; 4 | 5 | /** 6 | * XPath result. 
7 | * 8 | * @author code4crafter@gmail.com 9 | */ 10 | public class DefaultXElement implements XElement { 11 | 12 | private Element element; 13 | 14 | private ElementOperator elementOperator; 15 | 16 | public DefaultXElement(Element element, ElementOperator elementOperator) { 17 | this.element = element; 18 | this.elementOperator = elementOperator; 19 | } 20 | 21 | @Override 22 | public String get(){ 23 | return get(elementOperator); 24 | } 25 | 26 | protected String get(ElementOperator elementOperator){ 27 | if (elementOperator == null) { 28 | return element.toString(); 29 | } else { 30 | return elementOperator.operate(element); 31 | } 32 | } 33 | 34 | public String toString() { 35 | return get(); 36 | } 37 | 38 | @Override 39 | public Element getElement() { 40 | return element; 41 | } 42 | 43 | } 44 | -------------------------------------------------------------------------------- /fengchao/uumai-core/crawler-quartz/src/main/java/com/uumai/crawer/quartz/download/selenium/SeleniumActions.java: -------------------------------------------------------------------------------- 1 | package com.uumai.crawer.quartz.download.selenium; 2 | 3 | 4 | import java.io.Serializable; 5 | 6 | /** 7 | * Created by rock on 12/8/15. 8 | */ 9 | public class SeleniumActions implements Serializable { 10 | private String command; 11 | private String target; 12 | private String value; 13 | 14 | public SeleniumActions(String command, String target, String value) { 15 | this.command = command; 16 | this.target = target; 17 | this.value = value; 18 | } 19 | 20 | public String getCommand() { 21 | return command; 22 | } 23 | 24 | public void setCommand(String command) { 25 | this.command = command; 26 | } 27 | 28 | public String getTarget() { 29 | return target; 30 | } 31 | 32 | public void setTarget(String target) { 33 | this.target = target; 34 | } 35 | 36 | public String getValue() { 37 | return value; 38 | } 39 | 40 | public void setValue(String value) { 41 | this.value = value; 42 | } 43 | } 44 | -------------------------------------------------------------------------------- /fengchao/uumai-multiserver/crawler-multi-quartz/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | com.uumai 5 | uumai-multiserver 6 | 1.0 7 | 8 | 9 | 10 | 4.0.0 11 | 12 | 13 | com.uumai 14 | uumai-crawler-multi-quartz 15 | 1.0 16 | 17 | 18 | 19 | 20 | 21 | com.uumai 22 | uumai-crawler-multi-core 23 | 1.0 24 | 25 | 26 | 27 | com.uumai 28 | uumai-crawler-quartz 29 | 1.0 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | -------------------------------------------------------------------------------- /crawler-website/crawler-example/src/main/java/com/uumai/crawer/quartz/core/test/TestApp.java: -------------------------------------------------------------------------------- 1 | package com.uumai.crawer.quartz.core.test; 2 | 3 | import com.uumai.crawer.quartz.QuartzCrawlerTasker; 4 | import com.uumai.crawer.quartz.localdebug.QuartzLocalDebugAppMaster; 5 | import com.uumai.crawer.util.UumaiTime; 6 | import com.uumai.crawer2.CrawlerTasker; 7 | import com.uumai.crawer2.download.Download.DownloadType; 8 | 9 | public class TestApp extends QuartzLocalDebugAppMaster{ // AbstractAppMaster { 10 | 11 | @Override 12 | public void dobusiness() throws Exception { 13 | createonetask("http://amazon.com"); 14 | } 15 | 16 | public void createonetask(String url) throws Exception{ 17 | QuartzCrawlerTasker tasker = new QuartzCrawlerTasker(); 18 | 19 | tasker.setDownloadType(DownloadType.java_download); 20 | 
tasker.setUrl(url); 21 | tasker.addResultItem("test","test"); 22 | 23 | // tasker.setStoreTableName("test"); 24 | putDistributeTask(tasker); 25 | } 26 | 27 | public static void main(String[] args) throws Exception{ 28 | 29 | TestApp master = new TestApp(); 30 | master.init(); 31 | 32 | 33 | master.start(); 34 | 35 | } 36 | 37 | } 38 | -------------------------------------------------------------------------------- /crawler-website/crawler-example/src/main/java/com/uumai/crawer/quartz/gupiao/baidu/BaiduSearch.java: -------------------------------------------------------------------------------- 1 | package com.uumai.crawer.quartz.gupiao.baidu; 2 | 3 | 4 | import java.util.List; 5 | 6 | import com.uumai.crawer.quartz.QuartzCrawlerTasker; 7 | import com.uumai.crawer.quartz.localdebug.QuartzLocalDebugAppMaster; 8 | import com.uumai.crawer.quartz.result.QuartzResult; 9 | import com.uumai.crawer.util.UumaiTime; 10 | 11 | 12 | public class BaiduSearch extends QuartzLocalDebugAppMaster { 13 | 14 | @Override 15 | public void dobusiness() throws Exception{ 16 | 17 | 18 | dotasker("SZ002006"); 19 | 20 | } 21 | 22 | 23 | 24 | private void dotasker( String searchtext) throws Exception { 25 | String url="http://www.baidu.com/s?wd="+searchtext; 26 | QuartzCrawlerTasker tasker=new QuartzCrawlerTasker(); 27 | tasker.setUrl(url); 28 | tasker.addResultItem("searchtext", searchtext); 29 | tasker.addXpath("result", "//div[@class='nums']/text()"); 30 | putDistributeTask(tasker); 31 | } 32 | 33 | 34 | public static void main(String[] args) throws Exception{ 35 | new BaiduSearch().init().start(); 36 | 37 | } 38 | } 39 | -------------------------------------------------------------------------------- /fengchao/uumai-common/crawler-util/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 4 | 5 | com.uumai 6 | uumai-common 7 | 1.0 8 | 9 | 10 | 4.0.0 11 | 12 | 13 | 14 | com.uumai 15 | uumai-crawler-util 16 | 1.0 17 | jar 18 | 19 | 20 | 21 | 22 | 23 | 24 | com.uumai 25 | uumai-crawler-xsoup 26 | 1.0 27 | 28 | 29 | 30 | 31 | com.uumai 32 | uumai-zookeeper-client 33 | 1.0 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | -------------------------------------------------------------------------------- /crawler-website/crawler-example/src/main/java/com/uumai/crawer/quartz/gupiao/baidu/BaiduNewsSearch.java: -------------------------------------------------------------------------------- 1 | package com.uumai.crawer.quartz.gupiao.baidu; 2 | 3 | import com.uumai.crawer.quartz.QuartzCrawlerTasker; 4 | import com.uumai.crawer.quartz.localdebug.QuartzLocalDebugAppMaster; 5 | import com.uumai.crawer.util.UumaiTime; 6 | 7 | public class BaiduNewsSearch extends QuartzLocalDebugAppMaster {// QuartzLocalDebugAppMaster{ 8 | 9 | @Override 10 | public void dobusiness() throws Exception{ 11 | 12 | dotasker("SZ300104"); 13 | 14 | 15 | } 16 | 17 | 18 | 19 | private void dotasker( String searchtext) throws Exception { 20 | String url="http://news.baidu.com/ns?cl=2&rn=20&tn=news&word="+searchtext; 21 | QuartzCrawlerTasker tasker=new QuartzCrawlerTasker(); 22 | tasker.setUrl(url); 23 | // tasker.setStoreTableName("baidu_news"); 24 | tasker.addResultItem("searchtext", searchtext); 25 | tasker.addXpath("result", "//span[@class='nums']/text()"); 26 | putDistributeTask(tasker); 27 | } 28 | 29 | 30 | public static void main(String[] args) throws Exception{ 31 | BaiduNewsSearch master=new BaiduNewsSearch(); 32 | master.init(); 33 | master.start(); 34 | 35 | } 36 | } 
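Editor's sketch (not a file from this repository): the example app masters above only show how taskers are created and submitted; the extracted values end up in the QuartzResult/QuartzResultItem classes listed earlier. How the framework hands a QuartzResult back to user code is not shown in this dump, so the helper below simply assumes a result object is already available, and the field names "searchtext" and "result" are the ones registered by BaiduSearch above.

    // Hypothetical helper: read one named field out of a crawl result.
    // QuartzResult.getItem(name) never returns null; for an unknown name it falls back
    // to an item whose getValue() yields an empty string.
    static String readField(com.uumai.crawer.quartz.result.QuartzResult result, String name) {
        return String.valueOf(result.getItem(name).getValue());
    }
    // e.g. readField(result, "searchtext") or readField(result, "result")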
-------------------------------------------------------------------------------- /fengchao/uumai-core/crawler-core/src/main/java/com/uumai/crawer2/localdebug/LocalDebugAppMaster.java: -------------------------------------------------------------------------------- 1 | package com.uumai.crawer2.localdebug; 2 | 3 | import com.uumai.crawer2.CrawlerTasker; 4 | import com.uumai.crawer2.CrawlerWorker; 5 | 6 | /** 7 | * Created by rock on 12/9/15. 8 | */ 9 | public class LocalDebugAppMaster extends Thread{ 10 | 11 | public LocalDebugAppMaster init() { 12 | return this; 13 | } 14 | 15 | @Override 16 | public void run() { 17 | 18 | try { 19 | dobusiness(); 20 | } catch (Exception e) { 21 | e.printStackTrace(); 22 | } 23 | 24 | } 25 | public void dobusiness() throws Exception { 26 | 27 | } 28 | 29 | protected void putDistributeTask(String host,CrawlerTasker crawlerTasker) throws Exception { 30 | this.putDistributeTask(crawlerTasker); 31 | } 32 | public void putDistributeTask(CrawlerTasker crawlerTasker) throws Exception { 33 | LocalDebugCrawlerWorker localDebugCrawlerWorker= new LocalDebugCrawlerWorker(crawlerTasker); 34 | localDebugCrawlerWorker.download(); 35 | localDebugCrawlerWorker.pipeline(); 36 | } 37 | protected void waittaskfinished() throws Exception { 38 | 39 | } 40 | 41 | 42 | 43 | 44 | } 45 | -------------------------------------------------------------------------------- /fengchao/uumai-common/crawler-xsoup/src/main/java/us/codecraft/xsoup/XEvaluators.java: -------------------------------------------------------------------------------- 1 | package us.codecraft.xsoup; 2 | 3 | import org.jsoup.nodes.Element; 4 | import org.jsoup.select.Elements; 5 | import org.jsoup.select.Evaluator; 6 | 7 | /** 8 | * Evaluators in Xsoup. 9 | * @author code4crafter@gmail.com 10 | */ 11 | public abstract class XEvaluators { 12 | 13 | public static class HasAnyAttribute extends Evaluator { 14 | 15 | @Override 16 | public boolean matches(Element root, Element element) { 17 | return element.attributes().size() > 0; 18 | } 19 | } 20 | 21 | public static class IsNthOfType extends Evaluator.CssNthEvaluator { 22 | public IsNthOfType(int a, int b) { 23 | super(a,b); 24 | } 25 | 26 | protected int calculatePosition(Element root, Element element) { 27 | int pos = 0; 28 | Elements family = element.parent().children(); 29 | for (int i = 0; i < family.size(); i++) { 30 | if (family.get(i).tag().equals(element.tag())) pos++; 31 | if (family.get(i) == element) break; 32 | } 33 | return pos; 34 | } 35 | 36 | @Override 37 | protected String getPseudoClass() { 38 | return "nth-of-type"; 39 | } 40 | } 41 | } 42 | -------------------------------------------------------------------------------- /bin/buildfengchao.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #source ./uumai_config.sh 3 | #echo $UUMAI_HOME 4 | 5 | #UUMAI_HOME=/home/rock/kanxg/Dropbox/mysourcecode/uumai/bitbucket 6 | #UUMAI_JAVA_OPTS=" -Xms20m -Xmx100m " 7 | 8 | echo "start to build core..." 9 | 10 | 11 | echo "clean old logs and libs..." 12 | 13 | rm -rf ./build/fengchao/libs/* 14 | rm -rf ./build/fengchao/logs/* 15 | 16 | 17 | echo "build uumai..." 18 | 19 | cd ./fengchao/ 20 | rm -rf ./uumai-common/common-libs/libs 21 | mvn clean install 22 | 23 | echo "build install package..." 24 | cd .. 
25 | cp -r ./fengchao/uumai-common/common-libs/libs/* ./build/fengchao/libs/ 26 | 27 | find ./fengchao/uumai-common -name 'uumai-*.jar' | xargs -i{} cp {} ./build/fengchao/libs/ 28 | find ./fengchao/uumai-core -name 'uumai-*.jar' | xargs -i{} cp {} ./build/fengchao/libs/ 29 | 30 | 31 | #echo "build uumai yarn" 32 | 33 | #mvn package -f ./shop_indexer/uumai-yarn/pom.xml 34 | 35 | #cp -r ./shop_indexer/uumai-yarn/target/uumai-yarn-1.0.jar ./build/libs/ 36 | 37 | 38 | :< 3 | 4 | com.uumai 5 | uumai-multiserver 6 | 1.0 7 | 8 | 9 | 10 | 4.0.0 11 | 12 | com.uumai 13 | uumai-java-bridge 14 | 1.0 15 | jar 16 | 17 | 18 | 19 | 26 | 27 | 28 | 29 | org.python 30 | jython 31 | 2.5.3 32 | 33 | 34 | 35 | 36 | org.codehaus.groovy 37 | groovy-all 38 | 2.3.7 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | -------------------------------------------------------------------------------- /fengchao/uumai-common/crawler-xsoup/src/main/java/com/uumai/crawler/selector/ReplaceSelector.java: -------------------------------------------------------------------------------- 1 | package com.uumai.crawler.selector; 2 | 3 | import java.util.List; 4 | import java.util.regex.Matcher; 5 | import java.util.regex.Pattern; 6 | import java.util.regex.PatternSyntaxException; 7 | 8 | /** 9 | * Replace selector.
10 | * 11 | * @author
12 | * @since 0.1.0 13 | */ 14 | public class ReplaceSelector implements Selector { 15 | 16 | private String regexStr; 17 | 18 | private String replacement; 19 | 20 | private Pattern regex; 21 | 22 | public ReplaceSelector(String regexStr, String replacement) { 23 | this.regexStr = regexStr; 24 | this.replacement = replacement; 25 | try { 26 | regex = Pattern.compile(regexStr); 27 | } catch (PatternSyntaxException e) { 28 | throw new IllegalArgumentException("invalid regex", e); 29 | } 30 | } 31 | 32 | @Override 33 | public String select(String text) { 34 | Matcher matcher = regex.matcher(text); 35 | return matcher.replaceAll(replacement); 36 | } 37 | 38 | @Override 39 | public List selectList(String text) { 40 | throw new UnsupportedOperationException(); 41 | } 42 | 43 | @Override 44 | public String toString() { 45 | return regexStr + "_" + replacement; 46 | } 47 | 48 | } 49 | -------------------------------------------------------------------------------- /fengchao/uumai-common/mongodb-client/src/main/java/com/uumai/dao/BaseDao.java: -------------------------------------------------------------------------------- 1 | package com.uumai.dao; 2 | 3 | import com.mongodb.DB; 4 | import com.mongodb.MongoClient; 5 | import com.mongodb.MongoClientURI; 6 | import com.uumai.crawer.util.UumaiProperties; 7 | //import org.mongodb.morphia.Datastore; 8 | //import org.mongodb.morphia.Morphia; 9 | 10 | @Deprecated 11 | public class BaseDao 12 | { 13 | private final MongoClient mongoClient; 14 | private DB db; 15 | // private Datastore ds; 16 | private final String dbname = "mydb"; 17 | 18 | // private final Morphia morphia = new Morphia(); 19 | 20 | protected BaseDao() { 21 | try { 22 | this.mongoClient = new MongoClient(new MongoClientURI(UumaiProperties.readconfig("MONGO_URI", "mongodb://localhost:27017"))); 23 | } 24 | catch (Exception e) 25 | { 26 | e.printStackTrace(); 27 | throw new RuntimeException(e); 28 | } 29 | } 30 | 31 | public DB getDb() 32 | { 33 | return this.db; 34 | } 35 | 36 | // public Datastore getDs() { 37 | // return this.ds; 38 | // } 39 | 40 | public MongoClient getMongoClient() { 41 | return this.mongoClient; 42 | } 43 | 44 | // public Morphia getMorphia() { 45 | // return this.morphia; 46 | // } 47 | 48 | public void setDb(DB db) { 49 | this.db = db; 50 | } 51 | 52 | // public void setDs(Datastore ds) { 53 | // this.ds = ds; 54 | // } 55 | 56 | public String getDbname() { 57 | return "mydb"; 58 | } 59 | } -------------------------------------------------------------------------------- /crawler-website/crawler-example/src/main/java/com/uumai/crawer/quartz/gupiao/qq/Category.java: -------------------------------------------------------------------------------- 1 | package com.uumai.crawer.quartz.gupiao.qq; 2 | 3 | import com.uumai.crawer.quartz.QuartzCrawlerTasker; 4 | import com.uumai.crawer.quartz.core.cookies.CookieConstant; 5 | import com.uumai.crawer.quartz.localdebug.QuartzLocalDebugAppMaster; 6 | import com.uumai.crawer.util.UumaiTime; 7 | 8 | public class Category extends QuartzLocalDebugAppMaster { 9 | 10 | @Override 11 | public void dobusiness() throws Exception { 12 | 13 | dotask("http://stockapp.finance.qq.com/mstats/?mod=all"); 14 | 15 | 16 | 17 | 18 | } 19 | 20 | private void dotask(String url) throws Exception{ 21 | QuartzCrawlerTasker tasker = new QuartzCrawlerTasker(); 22 | tasker.setUrl(url); 23 | // tasker.setStoreTableName("qq_stock_category"); 24 | 25 | for(int i=0;i<10;i++){ 26 | tasker.addXpath("category"+i, 
"//div[@id='alllist']/div[2]/div["+(i*2+1)+"]/allText()"); 27 | 28 | tasker.addXpath_all("name"+i+"_", "//div[@id='alllist']/div[2]/ul["+(i+1)+"]/li/a/text()"); 29 | tasker.addXpath_all("link"+i+"_", "//div[@id='alllist']/div[2]/ul["+(i+1)+"]/li/a/@href"); 30 | 31 | } 32 | 33 | 34 | 35 | 36 | putDistributeTask(tasker); 37 | } 38 | 39 | public static void main(String[] args) throws Exception{ 40 | Category master = new Category(); 41 | master.init(); 42 | master.start(); 43 | 44 | } 45 | 46 | } 47 | -------------------------------------------------------------------------------- /fengchao/uumai-core/crawler-core/src/main/java/com/uumai/crawer2/download/CrawlerProxy.java: -------------------------------------------------------------------------------- 1 | package com.uumai.crawer2.download; 2 | 3 | import org.apache.http.HttpHost; 4 | 5 | import java.io.Serializable; 6 | import java.net.InetSocketAddress; 7 | import java.net.Proxy; 8 | 9 | /** 10 | * Created by kanxg on 14-12-21. 11 | */ 12 | public class CrawlerProxy implements Serializable { 13 | 14 | private String ip; 15 | private int port; 16 | 17 | 18 | public CrawlerProxy(String ip, int port) { 19 | this.ip = ip; 20 | this.port = port; 21 | } 22 | 23 | public Proxy getproxy(){ 24 | return new Proxy(Proxy.Type.HTTP, new InetSocketAddress(ip, port)); 25 | } 26 | 27 | public HttpHost gethttpclientproxy(){ 28 | return new HttpHost(ip, port, "http"); 29 | } 30 | 31 | public String getProxyIpAndPortString(){ 32 | return ip+":"+port; 33 | } 34 | 35 | public String getIp() { 36 | return ip; 37 | } 38 | 39 | public void setIp(String ip) { 40 | this.ip = ip; 41 | } 42 | 43 | public int getPort() { 44 | return port; 45 | } 46 | 47 | public void setPort(int port) { 48 | this.port = port; 49 | } 50 | 51 | @Override 52 | public String toString() { 53 | return "CrawlerProxy{" + 54 | "ip='" + ip + '\'' + 55 | ", port=" + port + 56 | '}'; 57 | } 58 | } 59 | 60 | -------------------------------------------------------------------------------- /fengchao/uumai-common/crawler-xsoup/src/main/java/com/uumai/crawler/selector/OrSelector.java: -------------------------------------------------------------------------------- 1 | package com.uumai.crawler.selector; 2 | 3 | import java.util.ArrayList; 4 | import java.util.List; 5 | 6 | /** 7 | * All extractors will do extracting separately,
8 | * and the results of the extractors will be combined into the final result. 9 | * @author
10 | * @since 0.2.0 11 | */ 12 | public class OrSelector implements Selector { 13 | 14 | private List selectors = new ArrayList(); 15 | 16 | public OrSelector(Selector... selectors) { 17 | for (Selector selector : selectors) { 18 | this.selectors.add(selector); 19 | } 20 | } 21 | 22 | public OrSelector(List selectors) { 23 | this.selectors = selectors; 24 | } 25 | 26 | @Override 27 | public String select(String text) { 28 | for (Selector selector : selectors) { 29 | String result = selector.select(text); 30 | if (result != null) { 31 | return result; 32 | } 33 | } 34 | return null; 35 | } 36 | 37 | @Override 38 | public List selectList(String text) { 39 | List results = new ArrayList(); 40 | for (Selector selector : selectors) { 41 | List strings = selector.selectList(text); 42 | results.addAll(strings); 43 | } 44 | return results; 45 | } 46 | } 47 | -------------------------------------------------------------------------------- /fengchao/uumai-common/crawler-xsoup/src/main/java/com/uumai/crawler/selector/Selectors.java: -------------------------------------------------------------------------------- 1 | package com.uumai.crawler.selector; 2 | 3 | /** 4 | * Convenient methods for selectors.
5 | * 6 | * @author
7 | * @since 0.2.1 8 | */ 9 | public abstract class Selectors { 10 | 11 | public static RegexSelector regex(String expr) { 12 | return new RegexSelector(expr); 13 | } 14 | 15 | public static RegexSelector regex(String expr, int group) { 16 | return new RegexSelector(expr,group); 17 | } 18 | 19 | public static SmartContentSelector smartContent() { 20 | return new SmartContentSelector(); 21 | } 22 | 23 | public static CssSelector $(String expr) { 24 | return new CssSelector(expr); 25 | } 26 | 27 | public static CssSelector $(String expr, String attrName) { 28 | return new CssSelector(expr, attrName); 29 | } 30 | 31 | public static XpathSelector xpath(String expr) { 32 | return new XpathSelector(expr); 33 | } 34 | 35 | /** 36 | * @Deprecated 37 | * @see #xpath(String) 38 | */ 39 | public static XpathSelector xsoup(String expr) { 40 | return new XpathSelector(expr); 41 | } 42 | 43 | public static AndSelector and(Selector... selectors) { 44 | return new AndSelector(selectors); 45 | } 46 | 47 | public static OrSelector or(Selector... selectors) { 48 | return new OrSelector(selectors); 49 | } 50 | 51 | } -------------------------------------------------------------------------------- /fengchao/uumai-core/crawler-core/src/main/java/com/uumai/crawer2/download/httpclient/IdleConnectionMonitorThread.java: -------------------------------------------------------------------------------- 1 | package com.uumai.crawer2.download.httpclient; 2 | 3 | import org.apache.http.conn.HttpClientConnectionManager; 4 | 5 | import java.util.concurrent.TimeUnit; 6 | 7 | /** 8 | * Created by kanxg on 14-12-30. 9 | */ 10 | public class IdleConnectionMonitorThread extends Thread { 11 | private final HttpClientConnectionManager connMgr; 12 | private volatile boolean shutdown; 13 | 14 | public IdleConnectionMonitorThread(HttpClientConnectionManager connMgr) { 15 | super(); 16 | this.connMgr = connMgr; 17 | } 18 | 19 | @Override 20 | public void run() { 21 | try { 22 | while (!shutdown) { 23 | synchronized (this) { 24 | wait(5000); 25 | //System.out.println("clean"); 26 | // Close expired connections 27 | connMgr.closeExpiredConnections(); 28 | // Optionally, close connections 29 | // that have been idle longer than 30 sec 30 | connMgr.closeIdleConnections(5, TimeUnit.SECONDS); 31 | } 32 | } 33 | } catch (InterruptedException ex) { 34 | // terminate 35 | } 36 | } 37 | 38 | public void shutdown() { 39 | shutdown = true; 40 | synchronized (this) { 41 | notifyAll(); 42 | } 43 | } 44 | } 45 | -------------------------------------------------------------------------------- /fengchao/uumai-common/crawler-xsoup/src/main/java/com/uumai/crawler/selector/BaseElementSelector.java: -------------------------------------------------------------------------------- 1 | package com.uumai.crawler.selector; 2 | 3 | import org.jsoup.Jsoup; 4 | import org.jsoup.nodes.Element; 5 | 6 | import java.util.ArrayList; 7 | import java.util.List; 8 | 9 | /** 10 | * @author 11 | * @since 0.3.0 12 | */ 13 | public abstract class BaseElementSelector implements Selector, ElementSelector { 14 | 15 | @Override 16 | public String select(String text) { 17 | if (text != null) { 18 | return select(Jsoup.parse(text)); 19 | } 20 | return null; 21 | } 22 | 23 | @Override 24 | public List selectList(String text) { 25 | if (text != null) { 26 | return selectList(Jsoup.parse(text)); 27 | } else { 28 | return new ArrayList(); 29 | } 30 | } 31 | 32 | public Element selectElement(String text) { 33 | if (text != null) { 34 | return selectElement(Jsoup.parse(text)); 35 | } 
36 | return null; 37 | } 38 | 39 | public List selectElements(String text) { 40 | if (text != null) { 41 | return selectElements(Jsoup.parse(text)); 42 | } else { 43 | return new ArrayList(); 44 | } 45 | } 46 | 47 | public abstract Element selectElement(Element element); 48 | 49 | public abstract List selectElements(Element element); 50 | 51 | public abstract boolean hasAttribute(); 52 | 53 | } 54 | -------------------------------------------------------------------------------- /fengchao/uumai-common/crawler-util/src/main/java/com/uumai/crawer/util/io/SerializeUtil.java: -------------------------------------------------------------------------------- 1 | package com.uumai.crawer.util.io; 2 | 3 | import java.io.ByteArrayInputStream; 4 | import java.io.ByteArrayOutputStream; 5 | import java.io.ObjectInputStream; 6 | import java.io.ObjectOutputStream; 7 | 8 | /** 9 | * Created by rock on 4/14/15. 10 | */ 11 | public class SerializeUtil { 12 | public static String serialize(Object object) { 13 | ObjectOutputStream oos = null; 14 | ByteArrayOutputStream baos = null; 15 | try { 16 | //序列化 17 | baos = new ByteArrayOutputStream(); 18 | oos = new ObjectOutputStream(baos); 19 | oos.writeObject(object); 20 | //byte[] bytes = baos.toByteArray(); 21 | //return bytes; 22 | String msg1 = baos.toString("ISO-8859-1");//指定字符集将字节流解码成字符串,否则在订阅时,转换会有问题。 23 | // msg1 = URLEncoder.encode(msg1, "UTF-8") 24 | return msg1; 25 | } catch (Exception e) { 26 | e.printStackTrace(); 27 | } 28 | return null; 29 | } 30 | 31 | public static Object unserialize(String msg) { 32 | ByteArrayInputStream bais = null; 33 | try { 34 | //反序列化 35 | bais = new ByteArrayInputStream(msg.getBytes("ISO-8859-1")); 36 | ObjectInputStream ois = new ObjectInputStream(bais); 37 | return ois.readObject(); 38 | } catch (Exception e) { 39 | e.printStackTrace(); 40 | } 41 | return null; 42 | } 43 | } 44 | -------------------------------------------------------------------------------- /fengchao/uumai-common/crawler-xsoup/src/main/java/us/codecraft/xsoup/CombiningXElements.java: -------------------------------------------------------------------------------- 1 | package us.codecraft.xsoup; 2 | 3 | import org.jsoup.select.Elements; 4 | 5 | import java.util.ArrayList; 6 | import java.util.Arrays; 7 | import java.util.List; 8 | 9 | /** 10 | * @author code4crafter@gmail.com 11 | */ 12 | public class CombiningXElements implements XElements { 13 | 14 | private List elementsList; 15 | 16 | public CombiningXElements(List elementsList) { 17 | this.elementsList = elementsList; 18 | } 19 | 20 | public CombiningXElements(XElements... 
elementsList) { 21 | this.elementsList = Arrays.asList(elementsList); 22 | } 23 | 24 | @Override 25 | public String get() { 26 | for (XElements xElements : elementsList) { 27 | String result = xElements.get(); 28 | if (result != null) { 29 | return result; 30 | } 31 | } 32 | return null; 33 | } 34 | 35 | @Override 36 | public List list() { 37 | List results = new ArrayList(); 38 | for (XElements xElements : elementsList) { 39 | results.addAll(xElements.list()); 40 | } 41 | return results; 42 | } 43 | 44 | @Override 45 | public Elements getElements() { 46 | Elements elements = new Elements(); 47 | for (XElements xElements : elementsList) { 48 | elements.addAll(xElements.getElements()); 49 | } 50 | return elements; 51 | } 52 | 53 | } 54 | -------------------------------------------------------------------------------- /fengchao/uumai-common/crawler-xsoup/src/main/java/com/uumai/crawler/selector/XpathSelector.java: -------------------------------------------------------------------------------- 1 | package com.uumai.crawler.selector; 2 | 3 | import org.apache.commons.collections.CollectionUtils; 4 | import org.jsoup.nodes.Element; 5 | import us.codecraft.xsoup.XPathEvaluator; 6 | import us.codecraft.xsoup.Xsoup; 7 | 8 | import java.util.List; 9 | 10 | /** 11 | * XPath selector based on Xsoup.
12 | * 13 | * @author
14 | * @since 0.3.0 15 | */ 16 | public class XpathSelector extends BaseElementSelector { 17 | 18 | private XPathEvaluator xPathEvaluator; 19 | 20 | public XpathSelector(String xpathStr) { 21 | this.xPathEvaluator = Xsoup.compile(xpathStr); 22 | } 23 | 24 | @Override 25 | public String select(Element element) { 26 | return xPathEvaluator.evaluate(element).get(); 27 | } 28 | 29 | @Override 30 | public List selectList(Element element) { 31 | return xPathEvaluator.evaluate(element).list(); 32 | } 33 | 34 | @Override 35 | public Element selectElement(Element element) { 36 | List elements = selectElements(element); 37 | if (CollectionUtils.isNotEmpty(elements)){ 38 | return elements.get(0); 39 | } 40 | return null; 41 | } 42 | 43 | @Override 44 | public List selectElements(Element element) { 45 | return xPathEvaluator.evaluate(element).getElements(); 46 | } 47 | 48 | @Override 49 | public boolean hasAttribute() { 50 | return xPathEvaluator.hasAttribute(); 51 | } 52 | } 53 | -------------------------------------------------------------------------------- /crawler-website/crawler-example/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 4.0.0 6 | 7 | 8 | com.uumai 9 | crawler-example 10 | 1.0 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | com.uumai 19 | uumai-crawler-quartz 20 | 1.0 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | org.apache.maven.plugins 35 | maven-dependency-plugin 36 | 37 | 38 | copy 39 | install 40 | 41 | copy-dependencies 42 | 43 | 44 | libs 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | -------------------------------------------------------------------------------- /crawler-website/crawler-example/src/main/java/com/uumai/crawer/quartz/gupiao/sinastock/SinaJSStockTakser.java: -------------------------------------------------------------------------------- 1 | package com.uumai.crawer.quartz.gupiao.sinastock; 2 | 3 | import java.util.List; 4 | 5 | import com.uumai.crawer.quartz.QuartzCrawlerTasker; 6 | import com.uumai.crawer.quartz.localdebug.QuartzLocalDebugAppMaster; 7 | import com.uumai.crawer.quartz.result.QuartzResult; 8 | import com.uumai.crawer.util.UumaiTime; 9 | 10 | public class SinaJSStockTakser extends QuartzLocalDebugAppMaster { 11 | 12 | @Override 13 | public void dobusiness() throws Exception{ 14 | 15 | String symbol ="SZ20001"; 16 | dotask("http://hq.sinajs.cn/list="+symbol, symbol); 17 | 18 | 19 | 20 | 21 | } 22 | 23 | private void dotask(String url,String sockname)throws Exception{ 24 | QuartzCrawlerTasker tasker=new QuartzCrawlerTasker(); 25 | // tasker.setCookies(cookie); 26 | // tasker.setUrl("http://data.eastmoney.com/zjlx/600307.html"); 27 | tasker.setUrl(url); 28 | // tasker.setEncoding("gbk"); 29 | // tasker.setDownloadType(DownloadType.selenium_download); 30 | // tasker.setStoreTableName("sinastock"); 31 | tasker.addResultItem("stock", sockname); 32 | 33 | tasker.addXpath("all", "*"); 34 | 35 | putDistributeTask(tasker); 36 | } 37 | 38 | public static void main(String[] args) throws Exception{ 39 | 40 | SinaJSStockTakser master=new SinaJSStockTakser(); 41 | master.init(); 42 | 43 | master.start(); 44 | 45 | } 46 | 47 | } 48 | -------------------------------------------------------------------------------- /fengchao/uumai-common/mongodb-client/src/main/java/com/uumai/dao/helper/DB2JsonHelper.java: -------------------------------------------------------------------------------- 1 | package com.uumai.dao.helper; 2 | 3 | import com.mongodb.BasicDBObject; 4 | import com.mongodb.DB; 5 | 
import com.mongodb.DBCollection; 6 | import com.mongodb.DBCursor; 7 | import com.uumai.crawer.util.MongoUtil; 8 | 9 | import java.util.ArrayList; 10 | import java.util.List; 11 | 12 | public class DB2JsonHelper 13 | { 14 | 15 | MongoUtil mongoUtil=new MongoUtil(); 16 | public String query(String collectionname, List key, List value) 17 | { 18 | StringBuffer sb = new StringBuffer(); 19 | DB db = mongoUtil.getDB(); 20 | DBCollection collection = db.getCollection(collectionname); 21 | BasicDBObject query = new BasicDBObject(); 22 | for (int i = 0; i < key.size(); i++) { 23 | query.append((String)key.get(i), value.get(i)); 24 | } 25 | DBCursor cursorDoc = collection.find(query); 26 | long count = collection.count(query); 27 | System.out.println("count:" + count); 28 | while (cursorDoc.hasNext()) 29 | { 30 | sb.append(cursorDoc.next()); 31 | } 32 | mongoUtil.close(); 33 | return sb.toString(); 34 | } 35 | public static void main(String[] args) throws Exception { 36 | List key = new ArrayList(); 37 | key.add("ASIN"); 38 | List value = new ArrayList(); 39 | value.add("B00LZS5EEI"); 40 | 41 | String json = new DB2JsonHelper().query("AmazonProduct", key, value); 42 | System.out.println("result:" + json); 43 | } 44 | } -------------------------------------------------------------------------------- /fengchao/uumai-common/crawler-xsoup/src/main/java/us/codecraft/xsoup/CombingXPathEvaluator.java: -------------------------------------------------------------------------------- 1 | package us.codecraft.xsoup; 2 | 3 | import org.jsoup.Jsoup; 4 | import org.jsoup.nodes.Document; 5 | import org.jsoup.nodes.Element; 6 | 7 | import java.util.ArrayList; 8 | import java.util.Arrays; 9 | import java.util.List; 10 | 11 | /** 12 | * @author code4crafter@gmail.com 13 | */ 14 | public class CombingXPathEvaluator implements XPathEvaluator { 15 | 16 | private List xPathEvaluators; 17 | 18 | public CombingXPathEvaluator(List xPathEvaluators) { 19 | this.xPathEvaluators = xPathEvaluators; 20 | } 21 | 22 | public CombingXPathEvaluator(XPathEvaluator... 
xPathEvaluators) { 23 | this.xPathEvaluators = Arrays.asList(xPathEvaluators); 24 | } 25 | 26 | @Override 27 | public XElements evaluate(Element element) { 28 | List xElementses = new ArrayList(); 29 | for (XPathEvaluator xPathEvaluator : xPathEvaluators) { 30 | xElementses.add(xPathEvaluator.evaluate(element)); 31 | } 32 | return new CombiningXElements(xElementses); 33 | } 34 | 35 | @Override 36 | public XElements evaluate(String html) { 37 | Document document = Jsoup.parse(html); 38 | return evaluate(document); 39 | } 40 | 41 | @Override 42 | public boolean hasAttribute() { 43 | for (XPathEvaluator xPathEvaluator : xPathEvaluators) { 44 | if (xPathEvaluator.hasAttribute()){ 45 | return true; 46 | } 47 | } 48 | return false; 49 | } 50 | } 51 | -------------------------------------------------------------------------------- /crawler-website/crawler-example/deploy/resources/amazon.cookie.txt: -------------------------------------------------------------------------------- 1 | .doubleclick.net TRUE / FALSE 1500776357 id 2214869c1c0400ca||t=1437704357|et=730|cs=002213fd480b369a11aaad8710 2 | .amazon.com TRUE / FALSE 1437705242 a-ogbcbff 1 3 | .amazon.com TRUE / FALSE 2068424402 session-token "2I1yxhWQL5uh01JYDgaQSDU6JP4ReuW/4JXjZ0OPGYiQjRCi5mgDFgjqLgegRQQBHacGz2BXEOCF4gYeyoYHCUO5UQugrosYinlQFJGU1uiEI9/BkB66DbnFWBpHzrlKx8d+dMMV6LE0mZCd0MKLcCT1LBIh30agaQEZEowR57YsKtemGtfXIzktLXzRZVR6ckQyYz5gCO15efS2gQtimXhTpTrqb2H/HM+lYkfs2F3YoisITTNfgy4fsRvaVvHzfKsJbmIE/MFX0BHZey/v3g==" 4 | .amazon.com TRUE / FALSE 2068424402 x-main "YP9n@HbLEhC6h7irq7vBOS8OseYvc@UuKBrEIQ494AckM?E8xjdRM99jyVOSOFhW" 5 | .amazon.com TRUE / TRUE 2068424402 at-main 5|XdiYEsFzBGeHGPDgaLfZ4InwvQdOkWpPWH8z5vc8Yz6KLXtt9zUdQVggcEFHYwteG1iEpA/G3KpSI9wkfeurORbAz+dtVX7rqlMHkSZkvs6qHGAnCQPbggl+h3k18DSV5a0fTNqevRgdI8O188lgxEnFpdQP3ELFL7hn5ZWcZENdz/MYLbl0MX7uwtLrbZ3hHMTg62wqLbA8CGgYTtSePOr80xpBlJ+wr5YRtnqcTEGJ3NxSNTY7OkR+4XcMT//h22okVjOan+GyBfdEHY2JVvduWlCyB2jy 6 | .amazon.com TRUE / TRUE sess-at-main "XyD0EK8GTqE1wtWfpRQy0Ym69OAOqlv7Utzr+33uW7I=" 7 | .amazon.com TRUE / FALSE 2082787201 x-wl-uid 1jDVdx8nEwb4KY/Gvxs6ec2JdKapcTCt7QYwY3PdSE4ewo9149d61UZKEC8twEGdf/9BjCzumEqY6WKYPeqa23QrIiwPNpmAgDdYBNhGaEcBXph0E5RSkDFY4Vb9f+u6K05CYy7AE06w= 8 | .amazon.com TRUE / FALSE 1437708004 b2b-main 0 9 | .amazon.com TRUE / FALSE 2082787201 ubid-main 177-0014183-0330740 10 | .amazon.com TRUE / FALSE 2082787201 session-id-time 2082787201l 11 | .amazon.com TRUE / FALSE 2082787201 session-id 181-8856461-9572619 12 | .amazon-adsystem.com TRUE / FALSE 2114380801 ad-id AyoBLNkCF0sLtJ7mfif4-fw 13 | .amazon-adsystem.com TRUE / FALSE 2114380801 ad-privacy 0 14 | -------------------------------------------------------------------------------- /fengchao/uumai-common/crawler-xsoup/src/main/java/us/codecraft/xsoup/DefaultXPathEvaluator.java: -------------------------------------------------------------------------------- 1 | package us.codecraft.xsoup; 2 | 3 | 4 | import org.jsoup.Jsoup; 5 | import org.jsoup.nodes.Document; 6 | import org.jsoup.nodes.Element; 7 | import org.jsoup.select.Collector; 8 | import org.jsoup.select.Elements; 9 | import org.jsoup.select.Evaluator; 10 | 11 | /** 12 | * @author code4crafter@gmail.com 13 | */ 14 | public class DefaultXPathEvaluator implements XPathEvaluator { 15 | 16 | private Evaluator evaluator; 17 | 18 | private ElementOperator elementOperator; 19 | 20 | public DefaultXPathEvaluator(Evaluator evaluator, ElementOperator elementOperator) { 21 | this.evaluator = evaluator; 22 | this.elementOperator = elementOperator; 23 | } 24 | 25 
| @Override 26 | public XElements evaluate(Element element) { 27 | Elements elements = Collector.collect(evaluator, element); 28 | return new DefaultXElements(elements, elementOperator); 29 | } 30 | 31 | @Override 32 | public XElements evaluate(String html) { 33 | Document document = Jsoup.parse(html); 34 | return evaluate(document); 35 | } 36 | 37 | @Override 38 | public boolean hasAttribute() { 39 | return elementOperator != null; 40 | } 41 | 42 | public Evaluator getEvaluator() { 43 | return evaluator; 44 | } 45 | 46 | public String getAttribute() { 47 | if (elementOperator == null) { 48 | return null; 49 | } 50 | return elementOperator.toString(); 51 | } 52 | 53 | public ElementOperator getElementOperator() { 54 | return elementOperator; 55 | } 56 | } 57 | -------------------------------------------------------------------------------- /fengchao/uumai-common/crawler-util/src/main/java/com/uumai/crawer/util/UumaiTime.java: -------------------------------------------------------------------------------- 1 | package com.uumai.crawer.util; 2 | 3 | import com.uumai.crawer.util.math.MathUtils; 4 | import org.joda.time.DateTime; 5 | 6 | import java.text.ParseException; 7 | import java.text.SimpleDateFormat; 8 | import java.time.LocalDateTime; 9 | 10 | /** 11 | * Created by rock on 7/24/15. 12 | */ 13 | public class UumaiTime { 14 | static SimpleDateFormat df = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss"); 15 | 16 | JodaTime jodaTime=new JodaTime(); 17 | // Java8Time java8Time=new Java8Time(); 18 | 19 | // public LocalDateTime getNow(){ 20 | // return java8Time.getNow(); 21 | // 22 | // } 23 | 24 | public DateTime getNow(){ 25 | return jodaTime.getNow(); 26 | } 27 | 28 | 29 | public String getNowString(){ 30 | return jodaTime.getNowString(); 31 | 32 | } 33 | public float getTimeDuration(String startime,String endtime){ 34 | // return joda_fmt.parseDateTime(endtime) -joda_fmt.parseDateTime(startime); 35 | try { 36 | float duration=df.parse(endtime).getTime()-df.parse(startime).getTime(); 37 | duration= duration / (1000 * 60); 38 | return duration; 39 | // return MathUtils.multiply(dr, "24"); 40 | } catch (ParseException e) { 41 | e.printStackTrace(); 42 | } 43 | return -1f; 44 | } 45 | public static void main(String[] args){ 46 | 47 | 48 | UumaiTime uumaiTime=new UumaiTime(); 49 | System.out.println(uumaiTime.getNow()); 50 | 51 | String newupdatetime=uumaiTime.getNowString(); 52 | System.out.println(newupdatetime); 53 | 54 | 55 | } 56 | } 57 | -------------------------------------------------------------------------------- /fengchao/uumai-common/zookeeper-client/src/main/java/com/uumai/zookeeperclient/TestThread.java: -------------------------------------------------------------------------------- 1 | package com.uumai.zookeeperclient; 2 | 3 | import com.uumai.zookeeperclient.uitl.ZookeeperClient; 4 | import com.uumai.zookeeperclient.watch.WatchZKClient; 5 | 6 | /** 7 | * Created by rock on 8/19/15. 
8 | */ 9 | public class TestThread extends Thread{ 10 | 11 | private WatchZKClient client; 12 | private String path; 13 | 14 | public TestThread(String path){ 15 | this.path=path; 16 | } 17 | 18 | public void init(){ 19 | client=new WatchZKClient(path); 20 | try { 21 | client.connect(); 22 | } catch (Exception e) { 23 | e.printStackTrace(); 24 | } 25 | String data=client.getCurrent_data_value(); 26 | System.out.println("init data:" + data); 27 | } 28 | 29 | @Override 30 | public void run() { 31 | client.setDaemon(true); 32 | client.start(); 33 | 34 | // int i=0; 35 | while(true){ 36 | 37 | try { 38 | Thread.sleep(5000); 39 | } catch (InterruptedException e) { 40 | e.printStackTrace(); 41 | } 42 | 43 | // i=i+1; 44 | // System.out.println("i:"+i); 45 | // if(i==30) break; 46 | } 47 | } 48 | public static void main(String[] args) throws Exception{ 49 | ZookeeperClient client=new ZookeeperClient(); 50 | client.updateNodes("/test",null); 51 | client.updateNodes("/test/1",null); 52 | TestThread testThread=new TestThread("/test"); 53 | testThread.init(); 54 | testThread.start(); 55 | 56 | 57 | 58 | } 59 | } 60 | -------------------------------------------------------------------------------- /fengchao/uumai-common/crawler-xsoup/src/main/java/us/codecraft/xsoup/DefaultXElements.java: -------------------------------------------------------------------------------- 1 | package us.codecraft.xsoup; 2 | 3 | import org.jsoup.nodes.Element; 4 | import org.jsoup.select.Elements; 5 | 6 | import java.util.ArrayList; 7 | import java.util.List; 8 | 9 | /** 10 | * XPath results. 11 | * 12 | * @author code4crafter@gmail.com 13 | */ 14 | public class DefaultXElements extends ArrayList implements XElements { 15 | 16 | private Elements elements; 17 | 18 | private ElementOperator elementOperator; 19 | 20 | public DefaultXElements(Elements elements, ElementOperator elementOperator) { 21 | this.elements = elements; 22 | this.elementOperator = elementOperator; 23 | initList(); 24 | } 25 | 26 | private void initList() { 27 | for (Element element : elements) { 28 | this.add(new DefaultXElement(element, elementOperator)); 29 | } 30 | } 31 | 32 | @Override 33 | public String get() { 34 | if (size() < 1) { 35 | return null; 36 | } else { 37 | return get(0).get(); 38 | } 39 | } 40 | 41 | @Override 42 | public List list() { 43 | List resultStrings = new ArrayList(); 44 | for (XElement xElement : this) { 45 | String text = xElement.get(); 46 | if (text != null) { 47 | resultStrings.add(text); 48 | } 49 | } 50 | return resultStrings; 51 | } 52 | 53 | @Override 54 | public String toString() { 55 | return get(); 56 | } 57 | 58 | @Override 59 | public Elements getElements() { 60 | return elements; 61 | } 62 | 63 | } 64 | -------------------------------------------------------------------------------- /fengchao/uumai-common/crawler-xsoup/src/main/java/com/uumai/crawler/selector/Json.java: -------------------------------------------------------------------------------- 1 | package com.uumai.crawler.selector; 2 | 3 | import com.alibaba.fastjson.JSON; 4 | import us.codecraft.xsoup.XTokenQueue; 5 | 6 | import java.util.List; 7 | 8 | /** 9 | * parse json 10 | * @author 11 | * @since 0.5.0 12 | */ 13 | public class Json extends PlainText { 14 | 15 | public Json(List strings) { 16 | super(strings); 17 | } 18 | 19 | public Json(String text) { 20 | super(text); 21 | } 22 | 23 | /** 24 | * remove padding for JSONP 25 | * @param padding 26 | * @return 27 | */ 28 | public Json removePadding(String padding) { 29 | String text = 
getFirstSourceText(); 30 | XTokenQueue tokenQueue = new XTokenQueue(text); 31 | tokenQueue.consumeWhitespace(); 32 | tokenQueue.consume(padding); 33 | tokenQueue.consumeWhitespace(); 34 | String chompBalanced = tokenQueue.chompBalancedNotInQuotes('(', ')'); 35 | return new Json(chompBalanced); 36 | } 37 | 38 | public T toObject(Class clazz) { 39 | if (getFirstSourceText() == null) { 40 | return null; 41 | } 42 | return JSON.parseObject(getFirstSourceText(), clazz); 43 | } 44 | 45 | public List toList(Class clazz) { 46 | if (getFirstSourceText() == null) { 47 | return null; 48 | } 49 | return JSON.parseArray(getFirstSourceText(), clazz); 50 | } 51 | 52 | @Override 53 | public Selectable jsonPath(String jsonPath) { 54 | JsonPathSelector jsonPathSelector = new JsonPathSelector(jsonPath); 55 | return selectList(jsonPathSelector,getSourceTexts()); 56 | } 57 | } 58 | -------------------------------------------------------------------------------- /fengchao/uumai-core/crawler-core/src/main/java/com/uumai/crawer2/download/selenium/UumaiSeleniumWebDriver.java: -------------------------------------------------------------------------------- 1 | package com.uumai.crawer2.download.selenium; 2 | 3 | import org.openqa.selenium.By; 4 | import org.openqa.selenium.WebDriver; 5 | import org.openqa.selenium.WebElement; 6 | 7 | import java.io.Serializable; 8 | import java.util.List; 9 | import java.util.Set; 10 | 11 | /** 12 | * Created by rock on 12/9/15. 13 | */ 14 | public class UumaiSeleniumWebDriver implements WebDriver,Serializable { 15 | @Override 16 | public void get(String s) { 17 | 18 | } 19 | 20 | @Override 21 | public String getCurrentUrl() { 22 | return null; 23 | } 24 | 25 | @Override 26 | public String getTitle() { 27 | return null; 28 | } 29 | 30 | @Override 31 | public List findElements(By by) { 32 | return null; 33 | } 34 | 35 | @Override 36 | public WebElement findElement(By by) { 37 | return null; 38 | } 39 | 40 | @Override 41 | public String getPageSource() { 42 | return null; 43 | } 44 | 45 | @Override 46 | public void close() { 47 | 48 | } 49 | 50 | @Override 51 | public void quit() { 52 | 53 | } 54 | 55 | @Override 56 | public Set getWindowHandles() { 57 | return null; 58 | } 59 | 60 | @Override 61 | public String getWindowHandle() { 62 | return null; 63 | } 64 | 65 | @Override 66 | public TargetLocator switchTo() { 67 | return null; 68 | } 69 | 70 | @Override 71 | public Navigation navigate() { 72 | return null; 73 | } 74 | 75 | @Override 76 | public Options manage() { 77 | return null; 78 | } 79 | } 80 | -------------------------------------------------------------------------------- /fengchao/uumai-core/crawler-core/src/main/java/com/uumai/crawer2/download/selenium/test/TestHtmlUnitDriver.java: -------------------------------------------------------------------------------- 1 | package com.uumai.crawer2.download.selenium.test; 2 | 3 | import com.gargoylesoftware.htmlunit.BrowserVersion; 4 | import org.openqa.selenium.By; 5 | import org.openqa.selenium.WebDriver; 6 | import org.openqa.selenium.WebElement; 7 | import org.openqa.selenium.htmlunit.HtmlUnitDriver; 8 | 9 | /** 10 | * Created by rock on 5/6/15. 11 | */ 12 | public class TestHtmlUnitDriver { 13 | 14 | public static void main(String[] args){ 15 | // Notice that the remainder of the code relies on the interface, 16 | // not the implementation. 
17 | HtmlUnitDriver driver = new HtmlUnitDriver(true); 18 | 19 | // And now use this to visit Google 20 | driver.get("file:///home/rock/kanxg/knowledges/bigdata/hadoop/zookeeper/ZooKeeper%E5%8E%9F%E7%90%86%E5%8F%8A%E4%BD%BF%E7%94%A8%20%20%20%E5%B0%8F%E6%AD%A6%E5%93%A5%E7%9A%84%E5%8D%9A%E5%AE%A2%20-%20%E5%B7%A6%E6%89%8B%E7%A8%8B%E5%BA%8F%E5%8F%B3%E6%89%8B%E8%AF%97.html"); 21 | 22 | // Find the text input element by its name 23 | // WebElement element = driver.findElement(By.name("q")); 24 | 25 | // Enter something to search for 26 | // element.sendKeys("Cheese!"); 27 | 28 | // Now submit the form. WebDriver will find the form for us from the element 29 | // element.submit(); 30 | 31 | // Check the title of the page 32 | System.out.println("Page title is: " + driver.getTitle()); 33 | 34 | WebElement webElement = driver.findElement(By.xpath("/html")); 35 | String content = webElement.getAttribute("outerHTML"); 36 | System.out.print("html:"+content); 37 | 38 | driver.quit(); 39 | } 40 | } 41 | -------------------------------------------------------------------------------- /crawler-website/crawler-example/src/main/java/com/uumai/crawer/quartz/gupiao/qq/SockIntro.java: -------------------------------------------------------------------------------- 1 | package com.uumai.crawer.quartz.gupiao.qq; 2 | 3 | import java.util.ArrayList; 4 | import java.util.List; 5 | 6 | import com.uumai.crawer.quartz.QuartzCrawlerTasker; 7 | import com.uumai.crawer.quartz.localdebug.QuartzLocalDebugAppMaster; 8 | import com.uumai.crawer.quartz.result.QuartzResult; 9 | import com.uumai.crawer.quartz.util.QuartzQueryItem; 10 | import com.uumai.crawer.util.UumaiTime; 11 | 12 | public class SockIntro extends QuartzLocalDebugAppMaster { 13 | 14 | @Override 15 | public void dobusiness() throws Exception{ 16 | 17 | String symbol="SZ20006"; 18 | geteastmoney("http://stock.finance.qq.com/corp1/profile.php?zqdm="+symbol.substring(2), symbol ); 19 | 20 | 21 | 22 | 23 | 24 | } 25 | 26 | 27 | private void geteastmoney(String url,String sockname) throws Exception{ 28 | QuartzCrawlerTasker tasker=new QuartzCrawlerTasker(); 29 | // tasker.setCookies(cookie); 30 | // tasker.setUrl("http://data.eastmoney.com/zjlx/600307.html"); 31 | tasker.setUrl(url); 32 | tasker.setEncoding("gbk"); 33 | // tasker.setDownloadType(DownloadType.selenium_download); 34 | // tasker.setStoreTableName("qq_stock_intro"); 35 | tasker.addResultItem("name", sockname); 36 | // tasker.addXpath("html", "*"); 37 | tasker.addXpath_all("shuxing", "//table[@class='list']//td/allText()"); 38 | 39 | 40 | putDistributeTask(tasker); 41 | } 42 | 43 | public static void main(String[] args) throws Exception{ 44 | 45 | 46 | 47 | SockIntro master=new SockIntro(); 48 | 49 | master.init(); 50 | 51 | master.start(); 52 | 53 | } 54 | } -------------------------------------------------------------------------------- /fengchao/uumai-common/crawler-xsoup/src/main/java/com/uumai/crawler/selector/JsonPathSelector.java: -------------------------------------------------------------------------------- 1 | package com.uumai.crawler.selector; 2 | 3 | import com.jayway.jsonpath.JsonPath; 4 | 5 | import java.util.ArrayList; 6 | import java.util.List; 7 | 8 | /** 9 | * JsonPath selector.
10 | * Used to extract content from JSON.
11 | * 12 | * @author
13 | * @since 0.2.1 14 | */ 15 | public class JsonPathSelector implements Selector { 16 | 17 | private String jsonPathStr; 18 | 19 | private JsonPath jsonPath; 20 | 21 | public JsonPathSelector(String jsonPathStr) { 22 | this.jsonPathStr = jsonPathStr; 23 | this.jsonPath = JsonPath.compile(this.jsonPathStr); 24 | } 25 | 26 | @Override 27 | public String select(String text) { 28 | Object object = jsonPath.read(text); 29 | if (object == null) { 30 | return null; 31 | } 32 | if (object instanceof List) { 33 | List list = (List) object; 34 | if (list != null && list.size() > 0) { 35 | return list.iterator().next().toString(); 36 | } 37 | } 38 | return object.toString(); 39 | } 40 | 41 | @Override 42 | public List selectList(String text) { 43 | List list = new ArrayList(); 44 | Object object = jsonPath.read(text); 45 | if (object == null) { 46 | return list; 47 | } 48 | if (object instanceof List) { 49 | List items = (List) object; 50 | for (Object item : items) { 51 | list.add(String.valueOf(item)); 52 | } 53 | } else { 54 | list.add(String.valueOf(object)); 55 | } 56 | return list; 57 | } 58 | } 59 | -------------------------------------------------------------------------------- /fengchao/uumai-common/crawler-util/src/main/java/com/uumai/crawer/util/shell/Shell.java: -------------------------------------------------------------------------------- 1 | package com.uumai.crawer.util.shell; 2 | 3 | import java.io.BufferedReader; 4 | import java.io.IOException; 5 | import java.io.InputStreamReader; 6 | 7 | /** 8 | * Created with IntelliJ IDEA. 9 | * User: rock 10 | * Date: 3/17/15 11 | * Time: 10:04 AM 12 | * To change this template use File | Settings | File Templates. 13 | */ 14 | public class Shell { 15 | public int exec(String command){ 16 | String[] cmd = { "/bin/sh","-c",command}; 17 | 18 | int returnkey=0; 19 | try { 20 | Process p0 = Runtime.getRuntime().exec(cmd); 21 | //读取标准输出流 22 | BufferedReader bufferedReader =new BufferedReader(new InputStreamReader(p0.getInputStream())); 23 | String line; 24 | while ((line=bufferedReader.readLine()) != null) { 25 | System.out.println(line); 26 | } 27 | //读取标准错误流 28 | BufferedReader brError = new BufferedReader(new InputStreamReader(p0.getErrorStream())); 29 | String errline = null; 30 | while ((errline = brError.readLine()) != null) { 31 | System.out.println(errline); 32 | } 33 | //waitFor()判断Process进程是否终止,通过返回值判断是否正常终止。0代表正常终止 34 | returnkey=p0.waitFor(); 35 | 36 | } catch (Exception e1) { 37 | e1.printStackTrace(); 38 | return returnkey=-1 ; 39 | } 40 | return returnkey; 41 | } 42 | 43 | public static void main(String[] args) { 44 | Shell shell=new Shell(); 45 | // String[] cmd = { "/bin/sh", "-c", "mvn install ; mkdir /installation/upgrade/" }; 46 | System.out.println(shell.exec("ping www.baidu.com")); 47 | } 48 | 49 | } 50 | -------------------------------------------------------------------------------- /fengchao/uumai-core/crawler-core/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | com.uumai 5 | uumai-core 6 | 1.0 7 | 8 | 9 | 10 | 4.0.0 11 | 12 | com.uumai 13 | uumai-crawler-core 14 | jar 15 | 1.0 16 | 17 | 18 | 19 | 20 | 21 | com.uumai 22 | uumai-mongo-client 23 | 1.0 24 | 25 | 26 | 27 | com.uumai 28 | uumai-redis-client 29 | 1.0 30 | 31 | 32 | 33 | com.uumai 34 | uumai-activemq-client 35 | 1.0 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | com.uumai 44 | uumai-zookeeper-client 45 | 1.0 46 | 47 | 48 | 49 | 50 | com.uumai 51 | uumai-crawler-util 52 | 1.0 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 
| 61 | 62 | 63 | -------------------------------------------------------------------------------- /crawler-website/crawler-example/src/main/java/com/uumai/crawer/quartz/jiudian/ctrip/CityListTasker.java: -------------------------------------------------------------------------------- 1 | package com.uumai.crawer.quartz.jiudian.ctrip; 2 | 3 | import com.uumai.crawer.quartz.QuartzCrawlerTasker; 4 | import com.uumai.crawer.quartz.localdebug.QuartzLocalDebugAppMaster; 5 | import com.uumai.crawer.util.UumaiTime; 6 | 7 | public class CityListTasker extends QuartzLocalDebugAppMaster { //QuartzLocalDebugAppMaster{ 8 | @Override 9 | public void dobusiness() throws Exception { 10 | createonetask("http://hotels.ctrip.com/domestic-city-hotel.html"); 11 | 12 | 13 | } 14 | 15 | public void createonetask(String url) throws Exception{ 16 | QuartzCrawlerTasker tasker = new QuartzCrawlerTasker(); 17 | 18 | // tasker.setProxy(new CrawlerProxy("cn-proxy.jp.oracle.com", 80)); 19 | 20 | // tasker.setDownloadType(DownloadType.firefox_download); 21 | tasker.setUrl(url); 22 | // tasker.addRequestHeader("Referer", "http://hotels.ctrip.com/hotel/beijing2"); 23 | // tasker.setRequestmethod("POST"); 24 | 25 | // tasker.addSeleniumAction("sendKeys", "id=DCityName1", "上海(SHA)"); 26 | // tasker.addSeleniumAction("sendKeys", "id=ACityName1", "北京(BJS)"); 27 | // tasker.addSeleniumAction("sendKeys", "id=DDate1", "2015-12-20"); 28 | // tasker.addSeleniumAction("click", "id=btnReSearch", null); 29 | 30 | 31 | // tasker.setStoreTableName("ctrip_jiudian_city"); 32 | tasker.addXpath_all("hotel_name", "//dl[@class='pinyin_filter_detail layoutfix']//a/text()"); 33 | tasker.addXpath_all("hotel_link", "//dl[@class='pinyin_filter_detail layoutfix']//a/@href"); 34 | 35 | putDistributeTask(tasker); 36 | } 37 | 38 | public static void main(String[] args) throws Exception{ 39 | 40 | CityListTasker master = new CityListTasker(); 41 | master.init(); 42 | 43 | master.start(); 44 | 45 | } 46 | 47 | 48 | } 49 | -------------------------------------------------------------------------------- /fengchao/uumai-common/crawler-util/src/main/java/com/uumai/crawer/util/license/LicenseInfo.java: -------------------------------------------------------------------------------- 1 | package com.uumai.crawer.util.license; 2 | 3 | /** 4 | * Created by rock on 12/23/15. 
5 | */ 6 | public class LicenseInfo { 7 | 8 | private String user; 9 | private String company; 10 | private String email; 11 | private String edition; 12 | private String release_version; 13 | private String valid_until; 14 | private String distribute_pi_count; 15 | 16 | public String getUser() { 17 | return user; 18 | } 19 | 20 | public void setUser(String user) { 21 | this.user = user; 22 | } 23 | 24 | public String getCompany() { 25 | return company; 26 | } 27 | 28 | public void setCompany(String company) { 29 | this.company = company; 30 | } 31 | 32 | public String getEmail() { 33 | return email; 34 | } 35 | 36 | public void setEmail(String email) { 37 | this.email = email; 38 | } 39 | 40 | public String getEdition() { 41 | return edition; 42 | } 43 | 44 | public void setEdition(String edition) { 45 | this.edition = edition; 46 | } 47 | 48 | public String getRelease_version() { 49 | return release_version; 50 | } 51 | 52 | public void setRelease_version(String release_version) { 53 | this.release_version = release_version; 54 | } 55 | 56 | public String getValid_until() { 57 | return valid_until; 58 | } 59 | 60 | public void setValid_until(String valid_until) { 61 | this.valid_until = valid_until; 62 | } 63 | 64 | public String getDistribute_pi_count() { 65 | return distribute_pi_count; 66 | } 67 | 68 | public void setDistribute_pi_count(String distribute_pi_count) { 69 | this.distribute_pi_count = distribute_pi_count; 70 | } 71 | } 72 | -------------------------------------------------------------------------------- /fengchao/uumai-core/crawler-quartz/src/main/java/com/uumai/crawer/quartz/util/UumaiExportExcel.java: -------------------------------------------------------------------------------- 1 | package com.uumai.crawer.quartz.util; 2 | 3 | import java.util.ArrayList; 4 | import java.util.List; 5 | 6 | import com.uumai.crawer.quartz.result.QuartzResult; 7 | import com.uumai.crawer.quartz.result.QuartzResultItem; 8 | import com.uumai.crawer.util.filesystem.ExcelFileUtil; 9 | 10 | public class UumaiExportExcel { 11 | 12 | public void exportExcel(String filename,List results){ 13 | try { 14 | ExcelFileUtil util=new ExcelFileUtil(filename); 15 | for(int i=0;i headlist=new ArrayList(); 21 | for(QuartzResultItem item:quartzResult.getItemlist()){ 22 | headlist.add(item.getName()); 23 | } 24 | util.writeLine(headlist); 25 | } 26 | 27 | List datalist=new ArrayList(); 28 | for(QuartzResultItem item:quartzResult.getItemlist()){ 29 | datalist.add(item.getValue().toString()); 30 | } 31 | util.writeLine(datalist); 32 | 33 | } 34 | util.createWorkBook(); 35 | } catch (Exception e) { 36 | e.printStackTrace(); 37 | } 38 | } 39 | 40 | public static void main(String[] args) { 41 | // UumaiResultUtil util=new UumaiResultUtil(); 42 | List results= null; //util.getresult("eastmoney", null); 43 | 44 | UumaiExportExcel eutil=new UumaiExportExcel(); 45 | eutil.exportExcel("/home/rock/tmp/eastmoney.xls", results); 46 | 47 | 48 | } 49 | 50 | } 51 | -------------------------------------------------------------------------------- /fengchao/uumai-core/crawler-core/src/main/java/com/uumai/crawer2/CrawlerWorker.java: -------------------------------------------------------------------------------- 1 | package com.uumai.crawer2; 2 | 3 | /** 4 | * Created by rock on 8/4/16. 
5 | */ 6 | public class CrawlerWorker implements Runnable { 7 | public CrawlerTasker tasker; 8 | public CrawlerResult result; 9 | 10 | public CrawlerWorker(CrawlerTasker tasker) { 11 | this.tasker=tasker; 12 | } 13 | 14 | public void run(){ 15 | // while (true){ 16 | // CrawlerTasker tasker=this.pool.pollTask(); //queue.poll(); 17 | if(tasker==null){ 18 | // try { 19 | System.out.println("tasker is null!"); 20 | //Thread.sleep(1000); 21 | // } catch (InterruptedException e) { 22 | // e.printStackTrace(); 23 | // } 24 | }else{ 25 | System.out.println("Thread "+ Thread.currentThread().getName() +" do task "+tasker.getUrl()); 26 | // System.out.println("Thread "+ Thread.currentThread().getName() +" start do tasker:" + new UumaiTime().getNowString()); 27 | try { 28 | tasker.init(); 29 | 30 | dobusiness(); 31 | // System.out.println("Thread "+ Thread.currentThread().getName() +" finish do tasker:" + new UumaiTime().getNowString()); 32 | } catch (Exception e) { 33 | try { 34 | e.printStackTrace(); 35 | } catch (Exception e1) { 36 | e1.printStackTrace(); 37 | } 38 | } 39 | 40 | 41 | 42 | } 43 | 44 | 45 | // } 46 | 47 | } 48 | 49 | protected void dobusiness() throws Exception{ 50 | download(); 51 | pipeline(); 52 | 53 | } 54 | protected void download() throws Exception { 55 | } 56 | protected void pipeline() throws Exception{ 57 | } 58 | } 59 | -------------------------------------------------------------------------------- /fengchao/uumai-core/crawler-core/src/main/java/com/uumai/crawer2/download/DownloadFactory.java: -------------------------------------------------------------------------------- 1 | package com.uumai.crawer2.download; 2 | 3 | import com.uumai.crawer2.download.emptymock.EmptyMockDownload; 4 | import com.uumai.crawer2.download.filedownload.FileDownload; 5 | import com.uumai.crawer2.download.httpclient.HttpClientDownload; 6 | import com.uumai.crawer2.download.httpdownload.HttpDownload; 7 | import com.uumai.crawer2.download.selenium.SeleniumDownloader; 8 | import com.uumai.crawer2.download.shelldownload.ShellDownload; 9 | 10 | /** 11 | * Created by rock on 8/17/15. 
12 | */ 13 | public class DownloadFactory { 14 | public static synchronized Download getnewDownload(Download.DownloadType type){ 15 | // if(type== Download.DownloadType.java_download){ 16 | // return new HttpDownload(); 17 | // }else 18 | // 19 | if(type== Download.DownloadType.httpclient_download){ 20 | return new HttpClientDownload(); 21 | }else if(type== Download.DownloadType.chrome_download){ 22 | return new SeleniumDownloader(); 23 | }else if(type== Download.DownloadType.firefox_download){ 24 | return new SeleniumDownloader(); 25 | }else if(type== Download.DownloadType.openscript_download){ 26 | 27 | }else if(type== Download.DownloadType.emptymockdown){ 28 | return new EmptyMockDownload(); 29 | }else if(type== Download.DownloadType.file_download){ 30 | return new FileDownload(); 31 | }else if(type== Download.DownloadType.shell_download){ 32 | return new ShellDownload(); 33 | }else if(type== Download.DownloadType.htmlunit_download){ 34 | return new SeleniumDownloader(); 35 | }else if(type== Download.DownloadType.phantomjs_download){ 36 | return new SeleniumDownloader(); 37 | } 38 | return new HttpDownload(); 39 | } 40 | } 41 | -------------------------------------------------------------------------------- /fengchao/uumai-multiserver/uumai-thrift/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 15 | 19 | 20 | com.uumai 21 | uumai-multiserver 22 | 1.0 23 | 24 | 25 | 26 | 4.0.0 27 | 28 | com.uumai 29 | uumai-thrift 30 | 1.0 31 | 32 | uumai-thrift 33 | 34 | 35 | 42 | 43 | 44 | 45 | org.apache.thrift 46 | libthrift 47 | 0.9.3 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | -------------------------------------------------------------------------------- /crawler-website/crawler-example/src/main/java/com/uumai/crawer/quartz/jipiao/chunqiu/ChunqiuTiejia.java: -------------------------------------------------------------------------------- 1 | package com.uumai.crawer.quartz.jipiao.chunqiu; 2 | 3 | import com.uumai.crawer.quartz.QuartzCrawlerTasker; 4 | import com.uumai.crawer.quartz.localdebug.QuartzLocalDebugAppMaster; 5 | import com.uumai.crawer.util.UumaiTime; 6 | import com.uumai.crawer2.download.CrawlerProxy; 7 | 8 | public class ChunqiuTiejia extends QuartzLocalDebugAppMaster { 9 | 10 | 11 | 12 | @Override 13 | public void dobusiness() throws Exception{ 14 | sendtask("http://flights.ch.com/tejia"); 15 | 16 | } 17 | 18 | private void sendtask(String url) throws Exception{ 19 | QuartzCrawlerTasker tasker=new QuartzCrawlerTasker(); 20 | // tasker.setCookies(cookie); 21 | // tasker.setUrl("http://data.eastmoney.com/zjlx/600307.html"); 22 | tasker.setUrl(url); 23 | // tasker.setDownloadType(DownloadType.selenium_download); 24 | // tasker.setStoreTableName("chunqiu_tejia"); 25 | for(int i=1;i<=14;i++){ 26 | for(int j=1;j<=5;j++){ 27 | tasker.addXpath("flight", "//table[@class='b0'][1]//tr[@class='flight']["+i+"]/td["+j+"]/a/text()"); 28 | tasker.addXpath("link", "//table[@class='b0'][1]//tr[@class='flight']["+i+"]/td["+j+"]/a/@href"); 29 | tasker.addXpath_newrow(); 30 | } 31 | 32 | } 33 | for(int i=1;i<=8;i++){ 34 | for(int j=1;j<=5;j++){ 35 | tasker.addXpath("flight", "//table[@class='b0'][2]//tr[@class='flight']["+i+"]/td["+j+"]/a/text()"); 36 | tasker.addXpath("link", "//table[@class='b0'][2]//tr[@class='flight']["+i+"]/td["+j+"]/a/@href"); 37 | tasker.addXpath_newrow(); 38 | } 39 | 40 | } 41 | putDistributeTask(tasker); 42 | } 43 | 44 | public static void main(String[] args) throws Exception { 45 | 46 | 47 | ChunqiuTiejia 
master=new ChunqiuTiejia(); 48 | master.init(); 49 | 50 | master.start(); 51 | 52 | } 53 | 54 | 55 | 56 | } 57 | -------------------------------------------------------------------------------- /fengchao/uumai-common/crawler-xsoup/src/main/java/com/uumai/crawler/selector/AndSelector.java: -------------------------------------------------------------------------------- 1 | package com.uumai.crawler.selector; 2 | 3 | import java.util.ArrayList; 4 | import java.util.List; 5 | 6 | /** 7 | * All selectors will be arranged as a pipeline.
8 | * The next selector uses the result of the previous one as its source. 9 | * @author
10 | * @since 0.2.0 11 | */ 12 | public class AndSelector implements Selector { 13 | 14 | private List selectors = new ArrayList(); 15 | 16 | public AndSelector(Selector... selectors) { 17 | for (Selector selector : selectors) { 18 | this.selectors.add(selector); 19 | } 20 | } 21 | 22 | public AndSelector(List selectors) { 23 | this.selectors = selectors; 24 | } 25 | 26 | @Override 27 | public String select(String text) { 28 | for (Selector selector : selectors) { 29 | if (text == null) { 30 | return null; 31 | } 32 | text = selector.select(text); 33 | } 34 | return text; 35 | } 36 | 37 | @Override 38 | public List selectList(String text) { 39 | List results = new ArrayList(); 40 | boolean first = true; 41 | for (Selector selector : selectors) { 42 | if (first) { 43 | results = selector.selectList(text); 44 | first = false; 45 | } else { 46 | List resultsTemp = new ArrayList(); 47 | for (String result : results) { 48 | resultsTemp.addAll(selector.selectList(result)); 49 | } 50 | results = resultsTemp; 51 | if (results == null || results.size() == 0) { 52 | return results; 53 | } 54 | } 55 | } 56 | return results; 57 | } 58 | } 59 | -------------------------------------------------------------------------------- /fengchao/uumai-core/crawler-quartz/src/main/java/com/uumai/crawer/quartz/result/QuartzXpathItem.java: -------------------------------------------------------------------------------- 1 | package com.uumai.crawer.quartz.result; 2 | 3 | import java.io.Serializable; 4 | 5 | public class QuartzXpathItem implements Serializable { 6 | public enum XpathType { 7 | Xpath,Xpath_ALL, JsonPath ,JsonPath_ALL, REGEX_EXPRESS,REGEX_EXPRESS_ALL, _UUMAI_NEWROW_ 8 | } 9 | 10 | private XpathType type; 11 | private String xpathName; 12 | private String xpath; 13 | private String xpathvalue; 14 | private String fromsource; 15 | private String assource; 16 | private boolean notoutput=false; 17 | 18 | public QuartzXpathItem asSource(String sourcename){ 19 | this.assource=sourcename; 20 | return this; 21 | } 22 | public void fromsource(String sourcename){ 23 | this.fromsource=sourcename; 24 | } 25 | 26 | public void setNotOutput(){ 27 | this.notoutput=true; 28 | } 29 | public String getXpathName() { 30 | return xpathName; 31 | } 32 | 33 | public void setXpathName(String xpathName) { 34 | this.xpathName = xpathName; 35 | } 36 | 37 | public String getXpath() { 38 | return xpath; 39 | } 40 | 41 | public void setXpath(String xpath) { 42 | this.xpath = xpath; 43 | } 44 | 45 | public XpathType getType() { 46 | return type; 47 | } 48 | 49 | public void setType(XpathType type) { 50 | this.type = type; 51 | } 52 | 53 | public String getXpathvalue() { 54 | return xpathvalue; 55 | } 56 | 57 | public void setXpathvalue(String xpathvalue) { 58 | this.xpathvalue = xpathvalue; 59 | } 60 | public String getFromsource() { 61 | return fromsource; 62 | } 63 | public void setFromsource(String fromsource) { 64 | this.fromsource = fromsource; 65 | } 66 | public String getAssource() { 67 | return assource; 68 | } 69 | public void setAssource(String assource) { 70 | this.assource = assource; 71 | } 72 | public boolean isNotoutput() { 73 | return notoutput; 74 | } 75 | 76 | 77 | 78 | } 79 | -------------------------------------------------------------------------------- /crawler-website/crawler-example/src/main/java/com/uumai/crawer/quartz/jobs/lagou/SearchJobTasker.java: -------------------------------------------------------------------------------- 1 | package com.uumai.crawer.quartz.jobs.lagou; 2 | 3 | import 
com.uumai.crawer.quartz.QuartzCrawlerTasker; 4 | import com.uumai.crawer.quartz.localdebug.QuartzLocalDebugAppMaster; 5 | import com.uumai.crawer.util.UumaiTime; 6 | import com.uumai.crawer2.download.CrawlerProxy; 7 | 8 | public class SearchJobTasker extends QuartzLocalDebugAppMaster {// AbstractAppMaster { 9 | 10 | @Override 11 | public void dobusiness() throws Exception { 12 | 13 | sendtask("java",1); 14 | 15 | 16 | 17 | 18 | 19 | } 20 | 21 | 22 | 23 | private void sendtask(String searchtext, int page) throws Exception{ 24 | QuartzCrawlerTasker tasker = new QuartzCrawlerTasker(); 25 | 26 | String url="http://www.lagou.com/jobs/positionAjax.json?px=new&city=上海&first=false&pn="+page+"&kd="+searchtext; 27 | // tasker.setRequestmethod("POST"); 28 | tasker.setUrl(url); 29 | // tasker.setEncoding("gbk"); 30 | // tasker.setDownloadType(DownloadType.selenium_download); 31 | tasker.setProxy(new CrawlerProxy("cn-proxy.jp.oracle.com", 80)); 32 | // tasker.setStoreTableName("lagou_searchjob"); 33 | tasker.addResultItem("searchtext",searchtext); 34 | tasker.addResultItem("page",page); 35 | 36 | tasker.addJsonpath_all("positionId", "$.content.result[*].positionId"); 37 | tasker.addJsonpath_all("companyId", "$.content.result[*].companyId"); 38 | // tasker.addJsonpath("resultMessage", "jingdong_service_promotion_getcode_responce.queryjs_result.resultMessage"); 39 | // tasker.addJsonpath("url", "jingdong_service_promotion_getcode_responce.queryjs_result.url"); 40 | 41 | putDistributeTask(tasker); 42 | 43 | } 44 | 45 | public static void main(String[] args) throws Exception { 46 | 47 | 48 | SearchJobTasker master=new SearchJobTasker(); 49 | 50 | master.init(); 51 | 52 | master.start(); 53 | 54 | } 55 | 56 | } 57 | -------------------------------------------------------------------------------- /fengchao/uumai-core/crawler-core/src/main/java/com/uumai/crawer2/download/selenium/test/TestSelenium.java: -------------------------------------------------------------------------------- 1 | package com.uumai.crawer2.download.selenium.test; 2 | 3 | import com.thoughtworks.selenium.Selenium; 4 | import com.thoughtworks.selenium.webdriven.WebDriverBackedSelenium; 5 | import org.openqa.selenium.By; 6 | import org.openqa.selenium.WebDriver; 7 | import org.openqa.selenium.WebElement; 8 | import org.openqa.selenium.chrome.ChromeDriver; 9 | 10 | /** 11 | * Created by rock on 5/6/15. 12 | */ 13 | public class TestSelenium { 14 | public static void main(String[] args){ 15 | // System.getProperties().setProperty("webdriver.chrome.driver", 16 | // "/home/rock/kanxg/Dropbox/mysourcecode/uumai/bitbucket/shop_indexer/driver/chromedriver"); 17 | 18 | // You may use any WebDriver implementation. Firefox is used here as an example 19 | WebDriver driver = new ChromeDriver(); 20 | 21 | // A "base url", used by selenium to resolve relative URLs 22 | String baseUrl = "http://www.oracle.com"; 23 | 24 | // Create the Selenium implementation 25 | Selenium selenium = new WebDriverBackedSelenium(driver, baseUrl); 26 | 27 | // Perform actions with selenium 28 | 29 | selenium.open("http://www.oracle.com"); 30 | 31 | 32 | WebElement webElement = driver.findElement(By.xpath("/html")); 33 | String content = webElement.getAttribute("outerHTML"); 34 | System.out.print("html:"+content); 35 | 36 | // Get the underlying WebDriver implementation back. This will refer to the 37 | // same WebDriver instance as the "driver" variable above. 
38 | WebDriver driverInstance = ((WebDriverBackedSelenium) selenium).getWrappedDriver(); 39 | 40 | //Finally, close the browser. Call stop on the WebDriverBackedSelenium instance 41 | //instead of calling driver.quit(). Otherwise, the JVM will continue running after 42 | //the browser has been closed. 43 | selenium.stop(); 44 | } 45 | } 46 | -------------------------------------------------------------------------------- /fengchao/uumai-core/crawler-core/src/main/java/com/uumai/crawer2/download/shelldownload/ShellDownload.java: -------------------------------------------------------------------------------- 1 | package com.uumai.crawer2.download.shelldownload; 2 | 3 | import com.uumai.crawer2.CrawlerResult; 4 | import com.uumai.crawer2.CrawlerTasker; 5 | import com.uumai.crawer2.download.Download; 6 | 7 | import java.io.BufferedReader; 8 | import java.io.InputStreamReader; 9 | 10 | /** 11 | * Created by rock on 1/27/16. 12 | */ 13 | public class ShellDownload implements Download { 14 | @Override 15 | public CrawlerResult download(CrawlerTasker tasker) throws Exception { 16 | CrawlerResult result=new CrawlerResult(); 17 | String[] cmd = { "/bin/sh","-c",tasker.getUrl()}; 18 | 19 | try { 20 | Process p0 = Runtime.getRuntime().exec(cmd); 21 | //读取标准输出流 22 | StringBuffer osb=new StringBuffer(); 23 | BufferedReader bufferedReader =new BufferedReader(new InputStreamReader(p0.getInputStream())); 24 | String line; 25 | while ((line=bufferedReader.readLine()) != null) { 26 | // System.out.println(line); 27 | osb.append(line); 28 | } 29 | //读取标准错误流 30 | StringBuffer esb=new StringBuffer(); 31 | BufferedReader brError = new BufferedReader(new InputStreamReader(p0.getErrorStream())); 32 | String errline = null; 33 | while ((errline = brError.readLine()) != null) { 34 | // System.out.println(errline); 35 | esb.append(errline); 36 | } 37 | //waitFor()判断Process进程是否终止,通过返回值判断是否正常终止。0代表正常终止 38 | int returnkey=p0.waitFor(); 39 | result.setReturncode(returnkey); 40 | result.setRawText(osb.toString()+esb.toString()); 41 | } catch (Exception e1) { 42 | result.setRawText(e1.getMessage()); 43 | result.setReturncode(-1); 44 | } 45 | return result; 46 | } 47 | 48 | } 49 | -------------------------------------------------------------------------------- /fengchao/uumai-common/crawler-xsoup/src/main/java/com/uumai/crawler2/util/htmlparse/JsoupUtil.java: -------------------------------------------------------------------------------- 1 | package com.uumai.crawler2.util.htmlparse; 2 | 3 | import org.jsoup.Jsoup; 4 | import org.jsoup.nodes.Document; 5 | import org.jsoup.nodes.Element; 6 | import org.jsoup.select.Elements; 7 | 8 | import java.io.BufferedReader; 9 | import java.io.File; 10 | import java.io.FileReader; 11 | 12 | /** 13 | * Created by rock on 4/9/15. 
14 | */ 15 | public class JsoupUtil { 16 | String rawText; 17 | 18 | String baseUri; 19 | private Document doc; 20 | 21 | public JsoupUtil( String rawText,String baseUri){ 22 | this.rawText=rawText; 23 | this.baseUri=baseUri; 24 | } 25 | 26 | public Document getDoc() { 27 | if(doc==null){ 28 | if(baseUri!=null){ 29 | doc = Jsoup.parse(rawText, baseUri); 30 | }else{ 31 | doc = Jsoup.parse(rawText); 32 | } 33 | } 34 | return doc; 35 | } 36 | 37 | public Elements getElementsByTag(String tagname){ 38 | return this.getDoc().getElementsByTag(tagname); 39 | } 40 | 41 | public static void main(String[] argx) throws Exception{ 42 | File file=new File("/home/rock/kanxg/downloadfiles/amazon/B007FNCBQQ.html"); 43 | System.out.println("file is"+ file.getName()); 44 | StringBuffer buffer=new StringBuffer(); 45 | BufferedReader reader = new BufferedReader(new FileReader(file)); 46 | String line = null; 47 | while ((line = reader.readLine()) != null) { 48 | buffer.append(line); 49 | } 50 | JsoupUtil jsoupUtil=new JsoupUtil(buffer.toString(), ""); 51 | Elements links = jsoupUtil.getElementsByTag("a"); 52 | for (Element link : links) { 53 | String s = link.attr("abs:href"); 54 | if (s.startsWith("http://www.amazon.com")) { 55 | System.out.println("product url:"+ s); 56 | 57 | } 58 | } 59 | } 60 | 61 | } 62 | -------------------------------------------------------------------------------- /fengchao/uumai-common/crawler-util/src/main/java/com/uumai/crawer/util/io/HadoopSerializeUtil.java: -------------------------------------------------------------------------------- 1 | package com.uumai.crawer.util.io; 2 | 3 | import org.apache.hadoop.io.IntWritable; 4 | import org.apache.hadoop.io.Writable; 5 | import org.apache.hadoop.util.StringUtils; 6 | 7 | import java.io.*; 8 | 9 | /** 10 | * Created by rock on 1/1/16. 
11 | */ 12 | public class HadoopSerializeUtil { 13 | /** 14 | * 将Writable对象转换成字节流 15 | */ 16 | public static byte[] serialize(Writable writable) throws IOException { 17 | ByteArrayOutputStream out = new ByteArrayOutputStream(); 18 | DataOutputStream dataOut = new DataOutputStream(out); 19 | writable.write(dataOut); 20 | dataOut.close(); 21 | return out.toByteArray(); 22 | } 23 | 24 | /** 25 | * 将字节流转换成Writable对象 26 | */ 27 | public static void deserialize(Writable writable, byte[] bytes) 28 | throws IOException { 29 | ByteArrayInputStream in = new ByteArrayInputStream(bytes); 30 | DataInputStream dataIn = new DataInputStream(in); 31 | writable.readFields(dataIn); 32 | dataIn.close(); 33 | } 34 | 35 | 36 | /** 37 | * 打印字节流 38 | */ 39 | public static void printBytesHex(byte[] bytes) { 40 | for (int i = 0; i < bytes.length; i++) { 41 | System.out.print(StringUtils.byteToHexString(bytes, i, i + 1) 42 | .toUpperCase()); 43 | if (i % 16 == 15) 44 | System.out.print('\n'); 45 | else if (i % 1 == 0) 46 | System.out.print(' '); 47 | } 48 | } 49 | 50 | public static void main(String[] args) throws IOException { 51 | IntWritable intWritable = new IntWritable(99999); 52 | // 序列化 53 | byte[] bytes = serialize(intWritable); 54 | printBytesHex(bytes); 55 | 56 | IntWritable intWritable2 = new IntWritable(); 57 | // 反序列化 58 | deserialize(intWritable2, bytes); 59 | System.out.println(intWritable2); 60 | } 61 | } 62 | -------------------------------------------------------------------------------- /fengchao/uumai-core/crawler-core/src/main/java/com/uumai/crawer2/localdebug/LocalDebugCrawlerWorker.java: -------------------------------------------------------------------------------- 1 | package com.uumai.crawer2.localdebug; 2 | 3 | import com.uumai.crawer.util.filesystem.UumaiFileUtil; 4 | import com.uumai.crawer2.CrawlerResult; 5 | import com.uumai.crawer2.CrawlerTasker; 6 | import com.uumai.crawer2.CrawlerWorker; 7 | import com.uumai.crawer2.download.Download; 8 | import com.uumai.crawer2.download.DownloadFactory; 9 | 10 | /** 11 | * Created by rock on 12/9/15. 
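 * <p>download() below checks the local file cache first and only fetches over the network on a
 * cache miss, saving the new result afterwards. Sketch of a cache-enabled tasker; the setter
 * names are assumptions inferred from the getSavefilename()/isUseingcache() calls in this class:
 * <pre>{@code
 * CrawlerTasker tasker = new CrawlerTasker();
 * tasker.setUrl("http://www.example.com/");        // example URL
 * // tasker.setSavefilename("example.html");       // assumed setter: name of the cache file
 * // tasker.setUseingcache(true);                  // assumed setter: opt into the cache
 * CrawlerWorker worker = new LocalDebugCrawlerWorker(tasker);
 * }</pre>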
12 | */ 13 | public class LocalDebugCrawlerWorker extends CrawlerWorker { 14 | public LocalDebugCrawlerWorker(CrawlerTasker tasker ) { 15 | super(tasker); 16 | } 17 | 18 | 19 | 20 | @Override 21 | protected void download() throws Exception { 22 | //try get from cache 23 | if(tasker.getSavefilename()!=null&&tasker.isUseingcache()){ 24 | UumaiFileUtil uumaiFileUtil=new UumaiFileUtil(); 25 | String text=uumaiFileUtil.readfromcache(tasker.getSavefilename()); 26 | if(text!=null) { 27 | this.result=new CrawlerResult(); 28 | result.setRawText(text); 29 | result.setUrl(tasker.getUrl()); 30 | System.out.println("read cache from file:" + tasker.getSavefilename()); 31 | return; 32 | } 33 | } 34 | 35 | Download download= DownloadFactory.getnewDownload(tasker.getDownloadType()); 36 | this.result=download.download(tasker); 37 | this.result.setUrl(tasker.getUrl()); 38 | // System.out.println("download html:"+tasker.getRawText()); 39 | 40 | 41 | 42 | if(tasker.getSavefilename()!=null&&tasker.isUseingcache()){ 43 | UumaiFileUtil uumaiFileUtil=new UumaiFileUtil(); 44 | uumaiFileUtil.save2file(tasker.getSavefilename(),result.getRawText()); 45 | System.out.println("save html to cache file:" + tasker.getSavefilename()); 46 | } 47 | 48 | } 49 | 50 | @Override 51 | protected void pipeline() throws Exception { 52 | 53 | } 54 | } 55 | -------------------------------------------------------------------------------- /crawler-website/crawler-example/src/main/java/com/uumai/crawer/quartz/jipiao/ctrip/JipiaoListTasker.java: -------------------------------------------------------------------------------- 1 | package com.uumai.crawer.quartz.jipiao.ctrip; 2 | 3 | import java.util.ArrayList; 4 | import java.util.List; 5 | 6 | import com.mongodb.BasicDBObject; 7 | import com.mongodb.DBCursor; 8 | import com.uumai.crawer.quartz.QuartzCrawlerTasker; 9 | import com.uumai.crawer.quartz.localdebug.QuartzLocalDebugAppMaster; 10 | import com.uumai.crawer.quartz.util.QuartzQueryItem; 11 | import com.uumai.crawer.util.UumaiTime; 12 | import com.uumai.crawer2.download.CrawlerProxy; 13 | 14 | public class JipiaoListTasker extends QuartzLocalDebugAppMaster { //QuartzLocalDebugAppMaster{ 15 | 16 | @Override 17 | public void dobusiness() throws Exception { 18 | createonetask("http://flights.ctrip.com/schedule","ROOT"); 19 | 20 | 21 | 22 | 23 | } 24 | 25 | 26 | public void createonetask(String url,String from) throws Exception{ 27 | QuartzCrawlerTasker tasker = new QuartzCrawlerTasker(); 28 | 29 | // tasker.setProxy(new CrawlerProxy("cn-proxy.jp.oracle.com", 80)); 30 | 31 | // tasker.setDownloadType(DownloadType.firefox_download); 32 | tasker.setUrl(url); 33 | // tasker.addRequestHeader("Referer", "http://flights.ctrip.com/booking/SHA-BJS1-day-1.html"); 34 | // tasker.addSeleniumAction("sendKeys", "id=DCityName1", "上海(SHA)"); 35 | // tasker.addSeleniumAction("sendKeys", "id=ACityName1", "北京(BJS)"); 36 | // tasker.addSeleniumAction("sendKeys", "id=DDate1", "2015-12-20"); 37 | // tasker.addSeleniumAction("click", "id=btnReSearch", null); 38 | tasker.addResultItem("from",from); 39 | 40 | // tasker.setStoreTableName("ctrip_jipiaolist"); 41 | tasker.addXpath_all("flight_name", "//div[@class='m']/a/text()"); 42 | tasker.addXpath_all("flight", "//div[@class='m']/a/@href"); 43 | 44 | putDistributeTask(tasker); 45 | } 46 | 47 | public static void main(String[] args) throws Exception{ 48 | 49 | JipiaoListTasker master = new JipiaoListTasker(); 50 | master.init(); 51 | 52 | master.start(); 53 | 54 | } 55 | 56 | } 57 | 
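/*
 * Follow-up sketch (an assumption, not a file in this repository): JipiaoListTasker above only
 * collects the flight links from the schedule page; each extracted href (a hypothetical
 * flightHref below) would normally be fed into a second tasker, as JipiaoTasker does later on:
 *
 *   QuartzCrawlerTasker detail = new QuartzCrawlerTasker();
 *   detail.setUrl("http://flights.ctrip.com" + flightHref);
 *   detail.addResultItem("from", "LIST");
 *   detail.addXpath_all("flight_no", "//div[@class='m']/a/text()");
 *   putDistributeTask(detail);
 */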
-------------------------------------------------------------------------------- /fengchao/uumai-distribute-sys/uumai-storm/topology-core/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 18 | 19 | 4.0.0 20 | 21 | 22 | com.uumai 23 | uumai-storm-topology-core 24 | jar 25 | 1.0 26 | 27 | 28 | 29 | 30 | 37 | 38 | 39 | com.uumai 40 | uumai-crawler-multi-core 41 | 1.0 42 | 43 | 44 | 45 | org.apache.storm 46 | storm-core 47 | 0.9.3 48 | 49 | provided 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | -------------------------------------------------------------------------------- /fengchao/uumai-common/crawler-util/src/main/java/com/uumai/crawer/util/Java8Time.java: -------------------------------------------------------------------------------- 1 | package com.uumai.crawer.util; 2 | 3 | import com.uumai.crawer.util.math.MathUtils; 4 | 5 | import java.text.ParseException; 6 | import java.text.SimpleDateFormat; 7 | import java.time.LocalDateTime; 8 | import java.time.format.DateTimeFormatter; 9 | 10 | /** 11 | * Created by rock on 10/26/15. 12 | */ 13 | public class Java8Time { 14 | 15 | DateTimeFormatter format = DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss"); 16 | static SimpleDateFormat df = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss"); 17 | 18 | public LocalDateTime getNow(){ 19 | 20 | LocalDateTime now = LocalDateTime.now(); 21 | return now; 22 | } 23 | 24 | public String getNowString(){ 25 | LocalDateTime now = LocalDateTime.now(); 26 | return format.format(now); 27 | 28 | } 29 | 30 | public float getTimeDuration(String startime){ 31 | return this.getTimeDuration(startime,getNowString()); 32 | } 33 | /** 34 | * 35 | * @param startime 36 | * @param endtime 37 | * @return 分钟 38 | */ 39 | public float getTimeDuration(String startime,String endtime){ 40 | // return joda_fmt.parseDateTime(endtime) -joda_fmt.parseDateTime(startime); 41 | try { 42 | float duration=df.parse(endtime).getTime()-df.parse(startime).getTime(); 43 | // String dr= MathUtils.round(new Double(duration).toString(), 2, 1); 44 | 45 | // return MathUtils.multiply(dr, "24"); 46 | return duration/(1000*60); 47 | // return dr; 48 | } catch (ParseException e) { 49 | e.printStackTrace(); 50 | return -1; 51 | } 52 | } 53 | 54 | public static void main(String[] args){ 55 | 56 | 57 | Java8Time java8Time=new Java8Time(); 58 | // System.out.println(java8Time.getNow()); 59 | 60 | // String newupdatetime=java8Time.getNowString(); 61 | // System.out.println(newupdatetime); 62 | float dura=java8Time.getTimeDuration("2016-03-08 10:33:43","2016-03-08 11:41:43"); 63 | System.out.println(dura); 64 | 65 | 66 | 67 | } 68 | } 69 | -------------------------------------------------------------------------------- /crawler-website/crawler-example/src/main/java/com/uumai/crawer/quartz/jipiao/ctrip/JipiaoInternListTasker.java: -------------------------------------------------------------------------------- 1 | package com.uumai.crawer.quartz.jipiao.ctrip; 2 | 3 | import java.util.ArrayList; 4 | import java.util.List; 5 | 6 | import com.mongodb.BasicDBObject; 7 | import com.mongodb.DBCursor; 8 | import com.uumai.crawer.quartz.QuartzCrawlerTasker; 9 | import com.uumai.crawer.quartz.localdebug.QuartzLocalDebugAppMaster; 10 | import com.uumai.crawer.quartz.util.QuartzQueryItem; 11 | import com.uumai.crawer.util.UumaiTime; 12 | 13 | public class JipiaoInternListTasker extends QuartzLocalDebugAppMaster { //QuartzLocalDebugAppMaster{ 14 | 15 | @Override 16 | public void dobusiness() throws Exception { 17 | 
18 | 19 | String link="http://flights.ctrip.com/international/Schedule/#schedule_a"; 20 | String from="A"; 21 | createonetask(link,from); 22 | 23 | 24 | 25 | } 26 | 27 | 28 | public void createonetask(String url,String from) throws Exception{ 29 | QuartzCrawlerTasker tasker = new QuartzCrawlerTasker(); 30 | 31 | // tasker.setProxy(new CrawlerProxy("cn-proxy.jp.oracle.com", 80)); 32 | 33 | // tasker.setDownloadType(DownloadType.firefox_download); 34 | tasker.setUrl(url); 35 | // tasker.addRequestHeader("Referer", "http://flights.ctrip.com/booking/SHA-BJS1-day-1.html"); 36 | // tasker.addSeleniumAction("sendKeys", "id=DCityName1", "上海(SHA)"); 37 | // tasker.addSeleniumAction("sendKeys", "id=ACityName1", "北京(BJS)"); 38 | // tasker.addSeleniumAction("sendKeys", "id=DDate1", "2015-12-20"); 39 | // tasker.addSeleniumAction("click", "id=btnReSearch", null); 40 | tasker.addResultItem("from",from); 41 | 42 | // tasker.setStoreTableName("ctrip_jipiaolist_inter"); 43 | tasker.addXpath_all("flight_name", "//ul[@class='schedule_detail_list clearfix']//a/text()"); 44 | tasker.addXpath_all("flight", "//ul[@class='schedule_detail_list clearfix']//a/@href"); 45 | 46 | putDistributeTask(tasker); 47 | } 48 | 49 | public static void main(String[] args) throws Exception{ 50 | 51 | JipiaoInternListTasker master = new JipiaoInternListTasker(); 52 | master.init(); 53 | 54 | 55 | master.start(); 56 | 57 | } 58 | 59 | 60 | } 61 | -------------------------------------------------------------------------------- /crawler-website/crawler-example/src/main/java/com/uumai/crawer/quartz/jobs/lagou/JobTasker.java: -------------------------------------------------------------------------------- 1 | package com.uumai.crawer.quartz.jobs.lagou; 2 | 3 | import java.util.ArrayList; 4 | import java.util.List; 5 | 6 | import com.uumai.crawer.quartz.QuartzCrawlerTasker; 7 | import com.uumai.crawer.quartz.localdebug.QuartzLocalDebugAppMaster; 8 | import com.uumai.crawer.quartz.util.QuartzQueryItem; 9 | import com.uumai.crawer.util.UumaiTime; 10 | import com.uumai.crawer2.download.CrawlerProxy; 11 | 12 | public class JobTasker extends QuartzLocalDebugAppMaster{ //QuartzLocalDebugAppMaster {// AbstractAppMaster { 13 | 14 | @Override 15 | public void dobusiness() throws Exception { 16 | 17 | // sendtask(1374710,taskerserie); 18 | 19 | 20 | 21 | 22 | 23 | } 24 | 25 | 26 | private void sendtask(int positionId) throws Exception{ 27 | QuartzCrawlerTasker tasker = new QuartzCrawlerTasker(); 28 | // tasker.setCookies(cookie); 29 | // tasker.setUrl("http://data.eastmoney.com/zjlx/600307.html"); 30 | String url="http://www.lagou.com/jobs/"+positionId+".html"; 31 | // tasker.setProxy(new CrawlerProxy("cn-proxy.jp.oracle.com", 80)); 32 | 33 | // tasker.setRequestmethod("POST"); 34 | tasker.setUrl(url); 35 | tasker.setFollingRedirect(false); 36 | // tasker.setEncoding("gbk"); 37 | // tasker.setDownloadType(DownloadType.selenium_download); 38 | // tasker.setStoreTableName("lagou_jobs"); 39 | tasker.addResultItem("_id",positionId); 40 | 41 | tasker.addXpath("title", "//dt[@class='clearfix join_tc_icon']/h1/@title"); 42 | tasker.addXpath_all("job_request", "//dd[@class='job_request']/p[1]/span/text()"); 43 | tasker.addXpath("zhiweiyouhuo", "//dd[@class='job_request']/p[2]/text()"); 44 | tasker.addXpath("description", "//dd[@class='job_bt']/allText()"); 45 | tasker.addXpath("jd_publisher", "//div[@class='publisher_name']/allText()"); 46 | 47 | tasker.addXpath("companyid", "//input[@id='companyid']/@value"); 48 | putDistributeTask(tasker); 49 | 50 | } 
51 | 52 | public static void main(String[] args) throws Exception { 53 | 54 | 55 | JobTasker master=new JobTasker(); 56 | 57 | master.init(); 58 | 59 | master.start(); 60 | 61 | } 62 | 63 | } 64 | -------------------------------------------------------------------------------- /fengchao/uumai-core/crawler-core/src/main/java/com/uumai/crawer2/download/remoteshelldownload/RemoveShellDownload.java: -------------------------------------------------------------------------------- 1 | package com.uumai.crawer2.download.remoteshelldownload; 2 | 3 | import com.uumai.crawer2.CrawlerResult; 4 | import com.uumai.crawer2.CrawlerTasker; 5 | import com.uumai.crawer2.download.Download; 6 | 7 | import java.io.BufferedReader; 8 | import java.io.InputStreamReader; 9 | 10 | /** 11 | * Created by rock on 4/10/16. 12 | */ 13 | public class RemoveShellDownload implements Download { 14 | @Override 15 | public CrawlerResult download(CrawlerTasker tasker) throws Exception { 16 | 17 | //copy jar from HDFS to local /tmp 18 | 19 | CrawlerResult result = new CrawlerResult(); 20 | String[] cmd = {"/bin/sh", "-c", tasker.getUrl()}; 21 | 22 | try { 23 | Process p0 = Runtime.getRuntime().exec(cmd); 24 | //读取标准输出流 25 | StringBuffer osb = new StringBuffer(); 26 | BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(p0.getInputStream())); 27 | String line; 28 | while ((line = bufferedReader.readLine()) != null) { 29 | // System.out.println(line); 30 | osb.append(line); 31 | } 32 | //读取标准错误流 33 | StringBuffer esb = new StringBuffer(); 34 | BufferedReader brError = new BufferedReader(new InputStreamReader(p0.getErrorStream())); 35 | String errline = null; 36 | while ((errline = brError.readLine()) != null) { 37 | // System.out.println(errline); 38 | esb.append(errline); 39 | } 40 | //waitFor()判断Process进程是否终止,通过返回值判断是否正常终止。0代表正常终止 41 | int returnkey = p0.waitFor(); 42 | result.setReturncode(returnkey); 43 | result.setRawText(osb.toString() + esb.toString()); 44 | } catch (Exception e1) { 45 | result.setRawText(e1.getMessage()); 46 | result.setReturncode(-1); 47 | } 48 | return result; 49 | } 50 | private void copyHDFS2local(String hdfsurl,String localdir){ 51 | 52 | } 53 | private void cleantasker(){ 54 | //remove tmp dir 55 | 56 | } 57 | } -------------------------------------------------------------------------------- /crawler-website/crawler-example/src/main/java/com/uumai/crawer/quartz/core/cookies/CookieConstant.java: -------------------------------------------------------------------------------- 1 | package com.uumai.crawer.quartz.core.cookies; 2 | 3 | import java.io.File; 4 | import java.util.List; 5 | 6 | import com.uumai.crawer.util.CookieUtil; 7 | import com.uumai.crawer.util.UumaiProperties; 8 | import com.uumai.crawer2.CookieManager.CookieHelper; 9 | import com.uumai.crawer2.CookieManager.CrawlerCookie; 10 | 11 | public class CookieConstant { 12 | 13 | // public static String xueqiu_cookie = CookieUtil.readcookiefromfile(new File("/home/rock/kanxg/Dropbox/mysourcecode/uumai/bitbucket/crawler-website/crawler-quartz-client/deploy/resources/xueqiu_cookies.txt")); 14 | 15 | public static List amazon_cookie = new CookieHelper().readcookiefromfile(new File(UumaiProperties.getUUmaiHome()+ "/cookies/jd_cookies.txt"));; //=new CookieHelper().readcookiefromfile(new File("/home/rock/kanxg/Dropbox/mysourcecode/uumai/bitbucket/crawler-website/crawler-quartz-client/deploy/resources/jd_cookies.txt")); 16 | 17 | // CookieUtil.readcookiefromfile(new 
File("/home/rock/kanxg/Dropbox/mysourcecode/uumai/bitbucket/crawler-website/crawler-quartz-client/deploy/resources/amazon_cookies.txt")); 18 | // public static String amazoncn_cookie=CookieUtil.readcookiefromfile(new File("/home/rock/kanxg/Dropbox/mysourcecode/uumai/bitbucket/crawler-website/crawler-quartz-client/deploy/resources/amazoncn_cookies.txt")); 19 | 20 | public static List jd_cookie=new CookieHelper().readcookiefromfile(new File(UumaiProperties.getUUmaiHome() + "/cookies/jd_cookies.txt")); 21 | 22 | public static List jdlianmengapi_cookie=new CookieHelper().readcookiefromfile(new File(UumaiProperties.getUUmaiHome() + "/cookies/jdlianmengapi_cookies.txt")); 23 | 24 | 25 | 26 | 27 | // public static String fiveonejob_cookie=CookieUtil.readcookiefromfile(new File("/home/rock/kanxg/Dropbox/mysourcecode/uumai/bitbucket/crawler-website/crawler-quartz-client/deploy/resources/51job_cookies.txt")); 28 | 29 | // public static String linkedin_cookie=CookieUtil.readcookiefromfile(new File("/home/rock/kanxg/Dropbox/mysourcecode/uumai/bitbucket/crawler-website/crawler-quartz-client/deploy/resources/linkedin_cookies.txt")); 30 | 31 | 32 | } 33 | -------------------------------------------------------------------------------- /fengchao/uumai-core/crawler-core/src/main/java/com/uumai/crawer2/CookieManager/CrawlerCookie.java: -------------------------------------------------------------------------------- 1 | package com.uumai.crawer2.CookieManager; 2 | 3 | import java.io.Serializable; 4 | import java.util.Date; 5 | 6 | /** 7 | * Created by rock on 12/11/15. 8 | */ 9 | public class CrawlerCookie implements Serializable { 10 | private String name; 11 | private String value; 12 | private String path; 13 | private String domain; 14 | private Date expiry; 15 | private boolean isSecure; 16 | private boolean isHttpOnly; 17 | 18 | public String getName() { 19 | return name; 20 | } 21 | 22 | public void setName(String name) { 23 | this.name = name; 24 | } 25 | 26 | public String getValue() { 27 | return value; 28 | } 29 | 30 | public void setValue(String value) { 31 | this.value = value; 32 | } 33 | 34 | public String getPath() { 35 | return path; 36 | } 37 | 38 | public void setPath(String path) { 39 | this.path = path; 40 | } 41 | 42 | public String getDomain() { 43 | return domain; 44 | } 45 | 46 | public void setDomain(String domain) { 47 | this.domain = domain; 48 | } 49 | 50 | public Date getExpiry() { 51 | return expiry; 52 | } 53 | 54 | public void setExpiry(Date expiry) { 55 | this.expiry = expiry; 56 | } 57 | 58 | public boolean isSecure() { 59 | return isSecure; 60 | } 61 | 62 | public void setSecure(boolean isSecure) { 63 | this.isSecure = isSecure; 64 | } 65 | 66 | public boolean isHttpOnly() { 67 | return isHttpOnly; 68 | } 69 | 70 | public void setHttpOnly(boolean isHttpOnly) { 71 | this.isHttpOnly = isHttpOnly; 72 | } 73 | 74 | 75 | @Override 76 | public String toString() { 77 | return "CrawlerCookie{" + 78 | "name='" + name + '\'' + 79 | ", value='" + value + '\'' + 80 | ", path='" + path + '\'' + 81 | ", domain='" + domain + '\'' + 82 | ", expiry=" + expiry + 83 | ", isSecure=" + isSecure + 84 | ", isHttpOnly=" + isHttpOnly + 85 | '}'; 86 | } 87 | } 88 | -------------------------------------------------------------------------------- /fengchao/uumai-distribute-sys/uumai-mapreduce/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 15 | 19 | 20 | com.uumai 21 | uumai-distribute-sys 22 | 1.0 23 | 24 | 25 | 26 | 4.0.0 27 | 28 | com.uumai 29 | 
uumai-mapreduce 30 | jar 31 | 1.0 32 | 33 | uumai-mapreduce 34 | 35 | 36 | 37 | com.uumai 38 | uumai-crawler-multi-core 39 | 1.0 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | org.apache.hadoop 48 | hadoop-mapreduce-client-jobclient 49 | 2.6.0 50 | provided 51 | 52 | 53 | 54 | 55 | org.apache.hadoop 56 | hadoop-mapreduce-client-app 57 | 2.6.0 58 | provided 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | -------------------------------------------------------------------------------- /fengchao/uumai-common/crawler-util/src/main/java/com/uumai/crawer/util/audio/Player.java: -------------------------------------------------------------------------------- 1 | package com.uumai.crawer.util.audio; 2 | 3 | /** 4 | * Created by rock on 11/4/15. 5 | */ 6 | 7 | import java.io.*; 8 | import java.net.URL; 9 | import sun.audio.*; 10 | 11 | 12 | public class Player { 13 | private AudioStream as; //单次播放声音用 14 | ContinuousAudioDataStream cas;//循环播放声音 15 | // 构造函数 16 | public Player(String filename) 17 | { 18 | try { 19 | InputStream in = new FileInputStream(filename); 20 | 21 | //打开一个声音文件流作为输入 22 | as = new AudioStream (in); 23 | } catch (FileNotFoundException e) { 24 | e.printStackTrace(); 25 | } catch (IOException e) { 26 | e.printStackTrace(); 27 | } 28 | } 29 | // 一次播放 开始 30 | public void start() 31 | { 32 | if( as==null ){ 33 | System.out.println("AudioStream object is not created!"); 34 | return; 35 | }else{ 36 | AudioPlayer.player.start (as); 37 | } 38 | } 39 | // 一次播放 停止 40 | public void stop() 41 | { 42 | if( as==null ){ 43 | System.out.println("AudioStream object is not created!"); 44 | return; 45 | }else{ 46 | AudioPlayer.player.stop(as); 47 | } 48 | } 49 | // 循环播放 开始 50 | public void continuousStart() 51 | { 52 | // Create AudioData source. 53 | AudioData data = null; 54 | try { 55 | data = as.getData(); 56 | } catch (IOException e) { 57 | // TODO Auto-generated catch block 58 | e.printStackTrace(); 59 | } 60 | 61 | // Create ContinuousAudioDataStream. 62 | cas = new ContinuousAudioDataStream (data); 63 | 64 | // Play audio. 65 | AudioPlayer.player.start(cas); 66 | } 67 | // 循环播放 停止 68 | public void continuousStop() 69 | { 70 | if(cas != null) 71 | { 72 | AudioPlayer.player.stop (cas); 73 | } 74 | } 75 | 76 | public static void main(String[] args){ 77 | Player player=new Player("/home/rock/kanxg/tools/pidgin-2.10.11/share/sounds/alert.wav"); 78 | player.start(); 79 | } 80 | 81 | } 82 | -------------------------------------------------------------------------------- /fengchao/uumai-core/crawler-core/src/main/java/com/uumai/crawer2/download/selenium/test/TestPhantomJSDriver.java: -------------------------------------------------------------------------------- 1 | package com.uumai.crawer2.download.selenium.test; 2 | 3 | import com.uumai.crawer.util.UumaiProperties; 4 | import org.openqa.selenium.By; 5 | import org.openqa.selenium.WebDriver; 6 | import org.openqa.selenium.WebElement; 7 | import org.openqa.selenium.htmlunit.HtmlUnitDriver; 8 | import org.openqa.selenium.phantomjs.PhantomJSDriver; 9 | import org.openqa.selenium.remote.DesiredCapabilities; 10 | 11 | /** 12 | * Created by rock on 12/4/15. 13 | */ 14 | public class TestPhantomJSDriver { 15 | 16 | static{ 17 | System.getProperties().setProperty("phantomjs.binary.path", 18 | UumaiProperties.getUUmaiHome() + "/driver/phantomjs"); 19 | } 20 | public static void main(String[] args){ 21 | // Notice that the remainder of the code relies on the interface, 22 | // not the implementation. 
23 | DesiredCapabilities desiredCapabilities = DesiredCapabilities.phantomjs(); 24 | desiredCapabilities.setCapability("loadImages",false); 25 | desiredCapabilities.setJavascriptEnabled(true); 26 | 27 | PhantomJSDriver driver = new PhantomJSDriver(desiredCapabilities); 28 | // And now use this to visit Google 29 | driver.get("file:///home/rock/kanxg/knowledges/bigdata/hadoop/zookeeper/ZooKeeper%E5%8E%9F%E7%90%86%E5%8F%8A%E4%BD%BF%E7%94%A8%20%20%20%E5%B0%8F%E6%AD%A6%E5%93%A5%E7%9A%84%E5%8D%9A%E5%AE%A2%20-%20%E5%B7%A6%E6%89%8B%E7%A8%8B%E5%BA%8F%E5%8F%B3%E6%89%8B%E8%AF%97.html"); 30 | 31 | // Find the text input element by its name 32 | // WebElement element = driver.findElement(By.name("q")); 33 | 34 | // Enter something to search for 35 | // element.sendKeys("Cheese!"); 36 | 37 | // Now submit the form. WebDriver will find the form for us from the element 38 | // element.submit(); 39 | 40 | // Check the title of the page 41 | System.out.println("Page title is: " + driver.getTitle()); 42 | 43 | WebElement webElement = driver.findElement(By.xpath("/html")); 44 | String content = webElement.getAttribute("outerHTML"); 45 | System.out.print("html:"+content); 46 | driver.close(); 47 | driver.quit(); 48 | } 49 | } 50 | -------------------------------------------------------------------------------- /crawler-website/crawler-example/src/main/java/com/uumai/crawer/quartz/jobs/fiveonejob/JobDetailTasker.java: -------------------------------------------------------------------------------- 1 | package com.uumai.crawer.quartz.jobs.fiveonejob; 2 | 3 | import java.util.ArrayList; 4 | import java.util.HashSet; 5 | import java.util.List; 6 | import java.util.Set; 7 | 8 | import com.mongodb.BasicDBObject; 9 | import com.mongodb.DBCursor; 10 | import com.uumai.crawer.quartz.QuartzCrawlerTasker; 11 | import com.uumai.crawer.quartz.core.cookies.CookieConstant; 12 | import com.uumai.crawer.quartz.localdebug.QuartzLocalDebugAppMaster; 13 | import com.uumai.crawer.quartz.util.QuartzQueryItem; 14 | import com.uumai.crawer.util.UumaiTime; 15 | import com.uumai.crawer2.download.Download; 16 | 17 | public class JobDetailTasker extends QuartzLocalDebugAppMaster { 18 | Set asinSet = new HashSet(); 19 | 20 | 21 | @Override 22 | public void dobusiness() throws Exception { 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | } 31 | 32 | 33 | 34 | private void sendtask(String jobid,String url) throws Exception{ 35 | QuartzCrawlerTasker tasker = new QuartzCrawlerTasker(); 36 | // tasker.setCookies(cookie); 37 | // tasker.setUrl("http://data.eastmoney.com/zjlx/600307.html"); 38 | // tasker.setCookies(CookieConstant.fiveonejob_cookie); 39 | 40 | 41 | tasker.setUrl(url); 42 | tasker.setEncoding("gbk"); 43 | // tasker.setDownloadType(DownloadType.selenium_download); 44 | // tasker.setStoreTableName("51job_detail"); 45 | tasker.addResultItem("_id",new Integer(jobid)); 46 | 47 | tasker.addResultItem("url", url); 48 | 49 | tasker.addXpath("title","//li[@class='tCompany_job_name']/allText()"); 50 | tasker.addXpath("desc","//div[@class='tCompany_text']/allText()"); 51 | // tasker.addJsonpath("resultCode", "jingdong_service_promotion_getcode_responce.queryjs_result.resultCode"); 52 | // tasker.addJsonpath("resultMessage", "jingdong_service_promotion_getcode_responce.queryjs_result.resultMessage"); 53 | // tasker.addJsonpath("url", "jingdong_service_promotion_getcode_responce.queryjs_result.url"); 54 | 55 | putDistributeTask(tasker); 56 | 57 | } 58 | 59 | public static void main(String[] args) throws Exception { 60 | 61 | 62 | JobDetailTasker 
master=new JobDetailTasker(); 63 | 64 | master.init(); 65 | 66 | master.start(); 67 | 68 | } 69 | 70 | } 71 | -------------------------------------------------------------------------------- /fengchao/uumai-common/mongodb-client/src/main/java/com/uumai/crawer/util/MongoUtil.java: -------------------------------------------------------------------------------- 1 | package com.uumai.crawer.util; 2 | 3 | import com.mongodb.DB; 4 | import com.mongodb.Mongo; 5 | import com.mongodb.MongoClient; 6 | import com.mongodb.MongoClientURI; 7 | 8 | public class MongoUtil 9 | { 10 | private static final int port = 27017; 11 | private static final String host = "localhost"; 12 | private static final String default_dbName = "mydb"; 13 | // private Mongo mongo = null; 14 | // private DB db = null; 15 | 16 | // private Morphia morphia = null; 17 | private MongoClient mongoClient = null; 18 | // private Datastore ds = null; 19 | 20 | // public static Mongo getMongo() 21 | // { 22 | // if (mongo == null) { 23 | // try { 24 | // mongo = new Mongo("localhost", 27017); 25 | // } catch (Exception e) { 26 | // e.printStackTrace(); 27 | // } 28 | // } 29 | // return mongo; 30 | // } 31 | public DB getDB() { 32 | return getMongoClient().getDB(default_dbName); 33 | } 34 | public DB getDB(String dbName) { 35 | return getMongoClient().getDB(dbName); 36 | } 37 | private MongoClient getMongoClient() { 38 | if (mongoClient == null) { 39 | try { 40 | //mongoClient = new MongoClient(new MongoClientURI(System.getProperty("uumai.mongodb.MONGO_URI", "mongodb://localhost:27017"))); 41 | mongoClient = new MongoClient(new MongoClientURI(UumaiProperties.readconfig("uumai.mongodb.MONGO_URI", "mongodb://localhost:27017"))); 42 | 43 | } catch (Exception e) { 44 | e.printStackTrace(); 45 | } 46 | } 47 | return mongoClient; 48 | } 49 | 50 | // public Morphia getMorphia() { 51 | // if (morphia == null) { 52 | // morphia = new Morphia(); 53 | //// morphia.map(new Class[] { AmazonProduct.class }).map(new Class[] { AmazonCategory.class }); 54 | //// morphia.map(new Class[] { KeepaHistory.class }); 55 | //// morphia.map(new Class[] { JdProduct.class }); 56 | // } 57 | // return morphia; 58 | // } 59 | // public Datastore getDs() { 60 | // if (ds == null) { 61 | // ds = getMorphia().createDatastore(getMongoClient(), "mydb"); 62 | // } 63 | // return ds; 64 | // } 65 | 66 | 67 | 68 | public void close(){ 69 | if(mongoClient!=null){ 70 | mongoClient.close(); 71 | mongoClient=null; 72 | } 73 | } 74 | 75 | // public static String getDbName() { 76 | // return "mydb"; 77 | // } 78 | } -------------------------------------------------------------------------------- /fengchao/uumai-common/mongodb-client/src/main/java/com/uumai/dao/helper/Json2DBHelper.java: -------------------------------------------------------------------------------- 1 | package com.uumai.dao.helper; 2 | 3 | import com.mongodb.DB; 4 | import com.mongodb.DBCollection; 5 | import com.mongodb.DBObject; 6 | import com.mongodb.MongoException; 7 | import com.mongodb.util.JSON; 8 | import com.uumai.crawer.util.MongoUtil; 9 | import java.io.IOException; 10 | 11 | public class Json2DBHelper 12 | { 13 | MongoUtil mongoUtil=new MongoUtil(); 14 | 15 | // public void store(String json, String collectionname) throws IOException { 16 | // try { 17 | // 18 | //// Mongo mongo = new Mongo("localhost", 27017); 19 | //// DB db = mongo.getDB("mydb"); 20 | // DB db = MongoUtil.getDb(); 21 | // DBCollection collection = db.getCollection(collectionname); 22 | // // InputStream is = ConvertXMLtoJSON.class 
23 | // // .getResourceAsStream("sample.xml"); 24 | // // String xml = IOUtils.toString(is); 25 | // // System.out.println(xml); 26 | // // XMLSerializer xmlSerializer = new XMLSerializer(); 27 | // // JSON json = xmlSerializer.read(xml); 28 | // 29 | // // convert JSON to DBObject directly 30 | // DBObject object = (DBObject) com.mongodb.util.JSON.parse(json); 31 | // collection.insert(object); 32 | // // DBCursor cursorDoc = collection.find(); 33 | // // while (cursorDoc.hasNext()) { 34 | // // System.out.println(cursorDoc.next()); 35 | // // } 36 | // System.out.println("Done"); 37 | // } catch (MongoException e) { 38 | // e.printStackTrace(); 39 | // } 40 | // } 41 | // 42 | public void store(String json, String collectionname) 43 | throws IOException 44 | { 45 | try 46 | { 47 | DB db = mongoUtil.getDB(); 48 | DBCollection collection = db.getCollection(collectionname); 49 | 50 | DBObject object = (DBObject)JSON.parse(json); 51 | collection.insert(new DBObject[] { object }); 52 | 53 | //System.out.println("Done"); 54 | mongoUtil.close(); 55 | } catch (MongoException e) { 56 | e.printStackTrace(); 57 | } 58 | } 59 | 60 | public static void main(String[] args) throws Exception { 61 | Json2DBHelper helper = new Json2DBHelper(); 62 | helper.store("{A:111}", "c1"); 63 | } 64 | } -------------------------------------------------------------------------------- /fengchao/uumai-common/zookeeper-client/src/main/java/com/uumai/zookeeperclient/uitl/ZookeeperFactory.java: -------------------------------------------------------------------------------- 1 | package com.uumai.zookeeperclient.uitl; 2 | 3 | import org.apache.zookeeper.WatchedEvent; 4 | import org.apache.zookeeper.Watcher; 5 | import org.apache.zookeeper.ZooKeeper; 6 | 7 | /** 8 | * Created by rock on 7/9/15. 
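 * <p>Usage sketch, assuming the static factory methods below and a ZooKeeper ensemble reachable
 * through the "uumai_zk_server:2181" connect string; the znode path is a made-up example:
 * <pre>{@code
 * ZooKeeper zk = ZookeeperFactory.getinstance();            // inside a method that throws Exception
 * byte[] data = zk.getData("/uumai/config", false, null);   // read a config znode, no watch
 * zk.close();
 * }</pre>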
9 | */ 10 | public class ZookeeperFactory { 11 | public static String ZOOKEEPER_SERVER="uumai_zk_server:2181"; //UumaiProperties.readconfig("uumai.zookeeper.server","localhost:2181"); 12 | public static int ZOOKEEPER_sessionTimeout=5000; 13 | 14 | public static synchronized ZooKeeper getinstance(String ZOOKEEPER_SERVER) throws Exception { 15 | ZooKeeper zk = new ZooKeeper(ZOOKEEPER_SERVER,ZOOKEEPER_sessionTimeout,new Watcher() { 16 | // 监控所有被触发的事件 17 | public void process(WatchedEvent event) { 18 | // System.out.println(event.getPath());MongoDao 19 | // System.out.println(event.getType().name()); 20 | // System.out.println(event.getState().getIntValue()); 21 | } 22 | }); 23 | return zk; 24 | } 25 | 26 | public static synchronized ZooKeeper getinstance() throws Exception{ 27 | ZooKeeper zk= new ZooKeeper(ZOOKEEPER_SERVER,ZOOKEEPER_sessionTimeout,new Watcher() { 28 | // 监控所有被触发的事件 29 | public void process(WatchedEvent event) { 30 | // System.out.println(event.getPath()); 31 | // System.out.println(event.getType().name()); 32 | // System.out.println(event.getState().getIntValue()); 33 | } 34 | }); 35 | 36 | return zk; 37 | } 38 | 39 | // //@nouse 40 | // public static ZooKeeper getinstancewithWatch(Watcher watcher) throws Exception{ 41 | // ZooKeeper zk = new ZooKeeper("uumai_zookeeperserver:2181", 500000,watcher); 42 | // 43 | //// ZooKeeper zk = new ZooKeeper(zookeeperserverip, 500000, new Watcher() { 44 | //// // 监控所有被触发的事件 45 | //// public void process(WatchedEvent event) { 46 | //// System.out.println(event.getPath()); 47 | //// System.out.println(event.getType().name()); 48 | //// //System.out.println(event.getState().getIntValue()); 49 | //// } 50 | //// }); 51 | // return zk; 52 | // } 53 | 54 | 55 | } 56 | -------------------------------------------------------------------------------- /fengchao/uumai-common/crawler-xsoup/src/main/java/com/uumai/crawler/selector/PlainText.java: -------------------------------------------------------------------------------- 1 | package com.uumai.crawler.selector; 2 | 3 | import java.util.ArrayList; 4 | import java.util.List; 5 | 6 | /** 7 | * Selectable plain text.
8 | * Cannot be selected by XPath or CSS Selector. 9 | * 10 | * @author
11 | * @since 0.1.0 12 | */ 13 | public class PlainText extends AbstractSelectable { 14 | 15 | protected List sourceTexts; 16 | 17 | public PlainText(List sourceTexts) { 18 | this.sourceTexts = sourceTexts; 19 | } 20 | 21 | public PlainText(String text) { 22 | this.sourceTexts = new ArrayList(); 23 | sourceTexts.add(text); 24 | } 25 | 26 | public static PlainText create(String text) { 27 | return new PlainText(text); 28 | } 29 | 30 | @Override 31 | public Selectable xpath(String xpath) { 32 | throw new UnsupportedOperationException("XPath can not apply to plain text. Please check whether you use a previous xpath with attribute select (/@href etc)."); 33 | } 34 | 35 | @Override 36 | public Selectable $(String selector) { 37 | throw new UnsupportedOperationException("$ can not apply to plain text. Please check whether you use a previous xpath with attribute select (/@href etc)."); 38 | } 39 | 40 | @Override 41 | public Selectable $(String selector, String attrName) { 42 | throw new UnsupportedOperationException("$ can not apply to plain text. Please check whether you use a previous xpath with attribute select (/@href etc)."); 43 | } 44 | 45 | @Override 46 | public Selectable smartContent() { 47 | throw new UnsupportedOperationException("Smart content can not apply to plain text. Please check whether you use a previous xpath with attribute select (/@href etc)."); 48 | } 49 | 50 | @Override 51 | public Selectable links() { 52 | throw new UnsupportedOperationException("Links can not apply to plain text. Please check whether you use a previous xpath with attribute select (/@href etc)."); 53 | } 54 | 55 | @Override 56 | public List nodes() { 57 | List nodes = new ArrayList(getSourceTexts().size()); 58 | for (String string : getSourceTexts()) { 59 | nodes.add(PlainText.create(string)); 60 | } 61 | return nodes; 62 | } 63 | 64 | @Override 65 | protected List getSourceTexts() { 66 | return sourceTexts; 67 | } 68 | } 69 | -------------------------------------------------------------------------------- /crawler-website/crawler-example/src/main/java/com/uumai/crawer/quartz/jiudian/ctrip/JiudianTasker.java: -------------------------------------------------------------------------------- 1 | package com.uumai.crawer.quartz.jiudian.ctrip; 2 | 3 | import com.uumai.crawer.quartz.QuartzCrawlerTasker; 4 | import com.uumai.crawer.quartz.localdebug.QuartzLocalDebugAppMaster; 5 | import com.uumai.crawer.util.UumaiTime; 6 | import com.uumai.crawer2.download.CrawlerProxy; 7 | 8 | public class JiudianTasker extends QuartzLocalDebugAppMaster { //QuartzLocalDebugAppMaster{ 9 | @Override 10 | public void dobusiness() throws Exception { 11 | createonetask(2,"2016-01-22","2016-01-23",1); 12 | 13 | 14 | } 15 | 16 | public void createonetask(int cityId,String StartTime,String DepTime,int page) throws Exception{ 17 | QuartzCrawlerTasker tasker = new QuartzCrawlerTasker(); 18 | String url="http://hotels.ctrip.com/Domestic/Tool/AjaxHotelList.aspx?cityId="+cityId+"&StartTime="+StartTime+"&DepTime="+DepTime+"&operationtype=NEWHOTELORDER&IsOnlyAirHotel=F&htlPageView=0&hotelType=F&hasPKGHotel=F&requestTravelMoney=F&isusergiftcard=F&useFG=F&priceRange=-2&promotion=F&prepay=F&IsCanReserve=F&OrderBy=99&hidTestLat=0%257C0&HideIsNoneLogin=T&isfromlist=T&ubt_price_key=htl_search_result_promotion&isHuaZhu=False&abForHuaZhu=True&markType=0&a=0&contrast=0&page="+page+"&contyped=0"; 19 | 20 | // tasker.setProxy(new CrawlerProxy("cn-proxy.jp.oracle.com", 80)); 21 | 22 | // tasker.setDownloadType(DownloadType.firefox_download); 23 | 
tasker.setUrl(url); 24 | // tasker.addRequestHeader("Referer", "http://hotels.ctrip.com/hotel/beijing2"); 25 | // tasker.setRequestmethod("POST"); 26 | 27 | // tasker.addSeleniumAction("sendKeys", "id=DCityName1", "上海(SHA)"); 28 | // tasker.addSeleniumAction("sendKeys", "id=ACityName1", "北京(BJS)"); 29 | // tasker.addSeleniumAction("sendKeys", "id=DDate1", "2015-12-20"); 30 | // tasker.addSeleniumAction("click", "id=btnReSearch", null); 31 | tasker.addResultItem("cityId",cityId); 32 | tasker.addResultItem("StartTime",StartTime); 33 | tasker.addResultItem("DepTime",DepTime); 34 | tasker.addResultItem("page",page); 35 | 36 | // tasker.setStoreTableName("ctrip_jiudian"); 37 | // tasker.addXpath_all("flight", "//div[@class='info-flight']/allText()"); 38 | tasker.addJsonpath("json", "*"); 39 | 40 | putDistributeTask(tasker); 41 | } 42 | 43 | public static void main(String[] args) throws Exception{ 44 | 45 | JiudianTasker master = new JiudianTasker(); 46 | master.init(); 47 | 48 | master.start(); 49 | 50 | } 51 | 52 | } 53 | -------------------------------------------------------------------------------- /crawler-website/crawler-example/pom-withdependencies.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 4.0.0 6 | 7 | 8 | com.uumai 9 | crawler-example 10 | 1.0 11 | 12 | 13 | 14 | 15 | 16 | com.uumai 17 | uumai-crawler-quartz 18 | 1.0 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | org.apache.maven.plugins 31 | maven-compiler-plugin 32 | 2.3.2 33 | 34 | 1.6 35 | 1.6 36 | UTF-8 37 | 38 | 39 | 40 | org.apache.maven.plugins 41 | maven-assembly-plugin 42 | 2.2-beta-5 43 | 44 | 45 | jar-with-dependencies 46 | 47 | 48 | 49 | true 50 | com.uumai.crawer.notinusing.emaillist.crawler.AfProductProcessor 51 | 52 | 53 | 54 | 55 | 56 | 57 | make-assembly 58 | package 59 | 60 | assembly 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | -------------------------------------------------------------------------------- /crawler-website/crawler-example/src/main/java/com/uumai/crawer/quartz/jobs/fiveonejob/FiveonejobTasker.java: -------------------------------------------------------------------------------- 1 | package com.uumai.crawer.quartz.jobs.fiveonejob; 2 | 3 | import com.uumai.crawer.quartz.QuartzCrawlerTasker; 4 | import com.uumai.crawer.quartz.core.cookies.CookieConstant; 5 | import com.uumai.crawer.quartz.localdebug.QuartzLocalDebugAppMaster; 6 | import com.uumai.crawer.util.UumaiTime; 7 | import com.uumai.crawer2.download.Download; 8 | 9 | public class FiveonejobTasker extends QuartzLocalDebugAppMaster { 10 | 11 | @Override 12 | public void dobusiness() throws Exception { 13 | String searchtext="java"; 14 | sendtask(searchtext,1); 15 | 16 | 17 | 18 | 19 | } 20 | 21 | 22 | private void sendtask(String searchtext, int page) throws Exception{ 23 | QuartzCrawlerTasker tasker = new QuartzCrawlerTasker(); 24 | // tasker.setCookies(cookie); 25 | // tasker.setUrl("http://data.eastmoney.com/zjlx/600307.html"); 26 | String url="http://search.51job.com/jobsearch/search_result.php"; 27 | // tasker.setCookies(CookieConstant.fiveonejob_cookie); 28 | 29 | tasker.setPostdata("fromJs=1&jobarea=000000%2C00&district=000000&funtype=0000&industrytype=00&issuedate=9&providesalary=99&keyword="+searchtext+"&keywordtype=0&curr_page="+page+"&lang=c&stype=2&postchannel=0000&workyear=99&cotype=99°reefrom=99&jobterm=99&companysize=99&lonlat=0%2C0&radius=-1&ord_field=0&list_type=0&fromType=14&dibiaoid=0&confirmdate=9"); 30 | tasker.setRequestmethod("POST"); 31 | 
tasker.setUrl(url); 32 | tasker.setEncoding("gbk"); 33 | // tasker.setDownloadType(DownloadType.selenium_download); 34 | // tasker.setStoreTableName("51job"); 35 | tasker.addResultItem("searchtext",searchtext); 36 | tasker.addResultItem("page",page); 37 | 38 | tasker.addResultItem("url", url); 39 | 40 | tasker.addXpath_all("title","//div[@class='el']/p/a/text()"); 41 | tasker.addXpath_all("link","//div[@class='el']/p/a/@href"); 42 | // tasker.addJsonpath("resultCode", "jingdong_service_promotion_getcode_responce.queryjs_result.resultCode"); 43 | // tasker.addJsonpath("resultMessage", "jingdong_service_promotion_getcode_responce.queryjs_result.resultMessage"); 44 | // tasker.addJsonpath("url", "jingdong_service_promotion_getcode_responce.queryjs_result.url"); 45 | 46 | putDistributeTask(tasker); 47 | 48 | } 49 | 50 | public static void main(String[] args) throws Exception { 51 | 52 | 53 | FiveonejobTasker master=new FiveonejobTasker(); 54 | 55 | master.init(); 56 | 57 | master.start(); 58 | 59 | } 60 | 61 | } 62 | -------------------------------------------------------------------------------- /crawler-website/crawler-example/src/main/java/com/uumai/crawer/quartz/jobs/lagou/GongsiTasker.java: -------------------------------------------------------------------------------- 1 | package com.uumai.crawer.quartz.jobs.lagou; 2 | 3 | import java.util.ArrayList; 4 | import java.util.List; 5 | 6 | import com.uumai.crawer.quartz.QuartzCrawlerTasker; 7 | import com.uumai.crawer.quartz.localdebug.QuartzLocalDebugAppMaster; 8 | import com.uumai.crawer.quartz.util.QuartzQueryItem; 9 | import com.uumai.crawer.util.UumaiTime; 10 | import com.uumai.crawer2.CookieManager.CrawlerCookie; 11 | import com.uumai.crawer2.download.CrawlerProxy; 12 | 13 | public class GongsiTasker extends QuartzLocalDebugAppMaster { //QuartzLocalDebugAppMaster {// AbstractAppMaster { 14 | 15 | @Override 16 | public void dobusiness() throws Exception { 17 | 18 | // 19 | 20 | 21 | 22 | 23 | } 24 | 25 | 26 | 27 | 28 | private void sendtask(int compid) throws Exception{ 29 | QuartzCrawlerTasker tasker = new QuartzCrawlerTasker(); 30 | // tasker.setCookies(cookie); 31 | // tasker.setUrl("http://data.eastmoney.com/zjlx/600307.html"); 32 | String url="http://www.lagou.com/gongsi/"+compid+".html"; 33 | 34 | // tasker.setRequestmethod("POST"); 35 | tasker.setUrl(url); 36 | tasker.setFollingRedirect(false); 37 | // tasker.setProxy(new CrawlerProxy("cn-proxy.jp.oracle.com", 80)); 38 | 39 | // tasker.setEncoding("gbk"); 40 | // tasker.setDownloadType(DownloadType.selenium_download); 41 | // tasker.setStoreTableName("lagou_gongsi"); 42 | tasker.addResultItem("_id",compid); 43 | 44 | 45 | // tasker.addXpath("html", "*"); 46 | tasker.addXpath("companyname", "//div[@class='company_main']/h1/a/text()"); 47 | tasker.addXpath("companyimage", "//img[@alt='公司Logo']/@src"); 48 | tasker.addXpath("companurl", "//div[@class='company_main']/h1/a/@href"); 49 | tasker.addXpath("companyword", "//div[@class='company_main']/div/text()"); 50 | tasker.addXpath_all("company_products", "//div[@id='company_products']/div[2]/allText()"); 51 | 52 | tasker.addXpath("company_intro_text", "//div[@class='company_intro_text']/allText()"); 53 | tasker.addXpath_all("basic_container", "//div[@id='basic_container']//span/text()"); 54 | tasker.addXpath("company_mangers_item", "//div[@class='company_mangers_item']/allText()"); 55 | 56 | tasker.addXpath("companyaddress", "//p[@class='mlist_li_title']/allText()"); 57 | tasker.addXpath("companyaddress_detail", 
"//p[@class='mlist_li_desc']/allText()"); 58 | putDistributeTask(tasker); 59 | 60 | } 61 | 62 | public static void main(String[] args) throws Exception { 63 | 64 | 65 | GongsiTasker master=new GongsiTasker(); 66 | 67 | master.init(); 68 | 69 | master.start(); 70 | 71 | } 72 | 73 | 74 | 75 | } 76 | -------------------------------------------------------------------------------- /fengchao/uumai-common/crawler-util/src/main/java/com/uumai/crawer/util/CookieUtil.java: -------------------------------------------------------------------------------- 1 | package com.uumai.crawer.util; 2 | 3 | import java.io.BufferedReader; 4 | import java.io.File; 5 | import java.io.FileReader; 6 | import java.io.Reader; 7 | import java.util.ArrayList; 8 | import java.util.HashMap; 9 | import java.util.List; 10 | import java.util.Map; 11 | 12 | /** 13 | * Created by rock on 7/24/15. 14 | */ 15 | public class CookieUtil { 16 | 17 | static Map cookielist=new HashMap(); 18 | 19 | // public static String loadCookie(String cookiekey){ 20 | // String cookievalue=cookielist.get(cookiekey); 21 | // if(cookievalue==null){ 22 | // File cookiefile=new File(UumaiProperties.getConfigRootPath()+File.separator +cookiekey +".cookie.txt"); 23 | // if(!cookiefile.exists()) { 24 | // cookievalue=""; 25 | // }else{ 26 | // cookievalue=readcookiefromfile(cookiefile); 27 | // } 28 | // cookielist.put(cookiekey,cookievalue); 29 | // } 30 | // return cookievalue; 31 | // } 32 | 33 | // public static String readcookiefromfile(File file){ 34 | // StringBuffer sb=new StringBuffer(); 35 | // boolean firstline=true; 36 | // BufferedReader reader=null; 37 | // try{ 38 | // reader=new BufferedReader(new FileReader(file)); 39 | // while (true){ 40 | // String linetext=reader.readLine(); 41 | // if(linetext==null) break; 42 | // String[] splittexts= linetext.split("\t"); 43 | // if(splittexts.length>1){ 44 | //// System.out.println("cookie key:"+splittexts[splittexts.length-2]); 45 | //// System.out.println("cookie value:"+splittexts[splittexts.length-1]); 46 | // 47 | // if(!firstline) 48 | // sb.append(";"); 49 | // 50 | // sb.append(splittexts[splittexts.length-2]); 51 | // sb.append("="); 52 | // sb.append(splittexts[splittexts.length-1]); 53 | // 54 | // if(firstline) firstline=false; 55 | // } 56 | // 57 | // } 58 | // reader.close(); 59 | // }catch(Exception e){ 60 | // e.printStackTrace(); 61 | // } 62 | // return sb.toString(); 63 | // 64 | // } 65 | 66 | public static void main(String[] args) throws Exception { 67 | // UumaiProperties.init("/home/rock/kanxg/Dropbox/mysourcecode/uumai/bitbucket/shop_indexer/crawler-example/deploy/resources/uumai.properties"); 68 | // System.out.println(CookieUtil.loadCookie("amazon")); 69 | ; 70 | 71 | } 72 | } 73 | -------------------------------------------------------------------------------- /crawler-website/crawler-example/src/main/java/com/uumai/crawer/quartz/jipiao/ctrip/JipiaoTasker.java: -------------------------------------------------------------------------------- 1 | package com.uumai.crawer.quartz.jipiao.ctrip; 2 | 3 | import java.util.ArrayList; 4 | import java.util.List; 5 | 6 | import com.mongodb.BasicDBObject; 7 | import com.mongodb.DBCursor; 8 | import com.uumai.crawer.quartz.QuartzCrawlerTasker; 9 | import com.uumai.crawer.quartz.localdebug.QuartzLocalDebugAppMaster; 10 | import com.uumai.crawer.quartz.util.QuartzQueryItem; 11 | import com.uumai.crawer.util.UumaiTime; 12 | import com.uumai.crawer2.download.CrawlerProxy; 13 | import com.uumai.crawer2.download.Download.DownloadType; 14 
| 15 | public class JipiaoTasker extends QuartzLocalDebugAppMaster { //QuartzLocalDebugAppMaster{ 16 | 17 | @Override 18 | public void dobusiness() throws Exception { 19 | String datatime="2016-01-25"; 20 | checkflight("SHA","CAN","2016-01-25"); 21 | 22 | 23 | 24 | 25 | } 26 | private void checkflight(String flight_name,String flight,String DDate1) throws Exception{ 27 | String[] s=flight.replace("http://flights.ctrip.com/schedule/", "").replace(".html", "").split("\\."); 28 | if(s==null||s.length!=2) return; 29 | createonetask(flight_name,s[0].toUpperCase(),s[1].toUpperCase(),DDate1); 30 | 31 | } 32 | 33 | 34 | 35 | public void createonetask(String flight_name,String DCity1,String ACity1,String DDate1) throws Exception{ 36 | 37 | QuartzCrawlerTasker tasker = new QuartzCrawlerTasker(); 38 | String url="http://flights.ctrip.com/domesticsearch/search/SearchFirstRouteFlights?DCity1="+DCity1+"&ACity1="+ACity1+"&SearchType=S&DDate1="+DDate1+"&IsNearAirportRecommond=0"; 39 | 40 | tasker.setProxy(new CrawlerProxy("cn-proxy.sg.oracle.com", 80)); 41 | 42 | // tasker.setDownloadType(DownloadType.firefox_download); 43 | tasker.setUrl(url); 44 | tasker.addRequestHeader("Referer", "http://flights.ctrip.com/booking/SHA-BJS1-day-1.html"); 45 | // tasker.addSeleniumAction("sendKeys", "id=DCityName1", "上海(SHA)"); 46 | // tasker.addSeleniumAction("sendKeys", "id=ACityName1", "北京(BJS)"); 47 | // tasker.addSeleniumAction("sendKeys", "id=DDate1", "2015-12-20"); 48 | // tasker.addSeleniumAction("click", "id=btnReSearch", null); 49 | tasker.addResultItem("url",url); 50 | tasker.addResultItem("flight_name",flight_name); 51 | tasker.addResultItem("flight_time",DDate1); 52 | 53 | // tasker.setStoreTableName("ctrip_jipiao"); 54 | // tasker.addXpath_all("flight", "//div[@class='info-flight']/allText()"); 55 | tasker.addJsonpath("json", "*"); 56 | 57 | putDistributeTask(tasker); 58 | } 59 | 60 | public static void main(String[] args)throws Exception{ 61 | 62 | JipiaoTasker master = new JipiaoTasker(); 63 | master.init(); 64 | 65 | master.start(); 66 | 67 | } 68 | 69 | 70 | } 71 | -------------------------------------------------------------------------------- /fengchao/uumai-core/crawler-core/src/main/java/com/uumai/crawer2/CookieManager/httpdownload/HttpDownloadCookitHelper.java: -------------------------------------------------------------------------------- 1 | package com.uumai.crawer2.CookieManager.httpdownload; 2 | 3 | import com.uumai.crawer2.CookieManager.CookieHelper; 4 | import com.uumai.crawer2.CookieManager.CrawlerCookie; 5 | import com.uumai.crawer2.CrawlerResult; 6 | import com.uumai.crawer2.CrawlerTasker; 7 | import com.uumai.crawer2.download.CrawlerProxy; 8 | import com.uumai.crawer2.download.httpdownload.HttpDownload; 9 | 10 | import java.io.*; 11 | import java.net.*; 12 | import java.util.ArrayList; 13 | import java.util.List; 14 | 15 | /** 16 | * Created by rock on 12/7/15. 
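 * <p>Walks a list of URLs in order and threads the cookies returned by each response into the
 * next request, so the returned list reflects the whole visit chain. Minimal sketch, assuming
 * no proxy and the raw-typed lists used below; the URL is a made-up example:
 * <pre>{@code
 * HttpDownloadCookitHelper helper = new HttpDownloadCookitHelper(null);
 * List cookies = helper.getcookies("http://www.example.com/");   // inside a method that throws Exception
 * }</pre>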
17 | */ 18 | public class HttpDownloadCookitHelper { 19 | 20 | private CookieHelper cookieHelper=new CookieHelper(); 21 | private HttpDownload httpDownload=new HttpDownload(); 22 | 23 | private CrawlerProxy proxy; 24 | private List startCookie; 25 | public HttpDownloadCookitHelper (CrawlerProxy proxy,List startCookie){ 26 | CookieHandler.setDefault(new CookieManager(null, CookiePolicy.ACCEPT_ALL)); 27 | 28 | this.proxy=proxy; 29 | this.startCookie=startCookie; 30 | } 31 | 32 | public HttpDownloadCookitHelper (CrawlerProxy proxy){ 33 | CookieHandler.setDefault(new CookieManager(null, CookiePolicy.ACCEPT_ALL)); 34 | this.proxy=proxy; 35 | } 36 | 37 | 38 | public List getcookies(String url) throws Exception { 39 | return this.connect(url, null); 40 | } 41 | public List getcookies(List urllist) throws Exception { 42 | List cookie=startCookie; 43 | 44 | if(urllist!=null) { 45 | for (String url : urllist) { 46 | cookie = this.connect(url, cookie); 47 | 48 | } 49 | } 50 | return cookie; 51 | } 52 | 53 | private List connect(String urlStr,List cookies) throws Exception { 54 | CrawlerTasker tasker=new CrawlerTasker(); 55 | tasker.setUrl(urlStr); 56 | tasker.setCookies(cookies); 57 | tasker.setProxy(proxy); 58 | CrawlerResult crawlerResult=httpDownload.download(tasker); 59 | return crawlerResult.getCookies(); 60 | } 61 | public static void main(String[] a) throws Exception{ 62 | // HttpDownloadCookitHelper httpDownloadCookitHelper=new HttpDownloadCookitHelper(new CrawlerProxy("cn-proxy.jp.oracle.com", 80),null); 63 | HttpDownloadCookitHelper httpDownloadCookitHelper=new HttpDownloadCookitHelper(null,null); 64 | 65 | List urllist=new ArrayList(); 66 | urllist.add("http://www.linkedin.com"); 67 | // urllist.add("http://xueqiu.com/1130548918"); 68 | 69 | List cookie= httpDownloadCookitHelper.getcookies(urllist); 70 | for(CrawlerCookie c:cookie) { 71 | System.out.println("cookie:" + c); 72 | } 73 | 74 | } 75 | } 76 | -------------------------------------------------------------------------------- /fengchao/uumai-core/crawler-quartz/src/main/java/com/uumai/crawer/quartz/download/selenium/SeleniumActionBot.java: -------------------------------------------------------------------------------- 1 | package com.uumai.crawer.quartz.download.selenium; 2 | 3 | import org.openqa.selenium.By; 4 | import org.openqa.selenium.WebDriver; 5 | import org.openqa.selenium.WebElement; 6 | 7 | import java.util.List; 8 | 9 | /** 10 | * Created by rock on 12/9/15. 
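 * <p>Replays a scripted list of (command, target, value) actions against a live WebDriver.
 * Targets use the prefixes handled in doactioin() below ("id=", "css=", "link=", "name=",
 * anything else is treated as an XPath) and the supported commands are open, sleep, clear,
 * sendKeys and click. Sketch, assuming the action list is built elsewhere, e.g. via
 * tasker.addSeleniumAction("sendKeys", "id=DCityName1", "Shanghai (SHA)") as in the example taskers:
 * <pre>{@code
 * new SeleniumActionBot(driver, seleniumActionsList).doactions();
 * }</pre>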
11 | */ 12 | public class SeleniumActionBot { 13 | 14 | private WebDriver driver; 15 | private List seleniumActionsList; 16 | public SeleniumActionBot(WebDriver driver, List seleniumActionsList){ 17 | this.driver=driver; 18 | this.seleniumActionsList=seleniumActionsList; 19 | } 20 | 21 | public void doactions(){ 22 | if(seleniumActionsList==null) return; 23 | for(SeleniumActions action:seleniumActionsList){ 24 | doactioin(action); 25 | } 26 | } 27 | 28 | private void doactioin(SeleniumActions seleniumActions){ 29 | String command=seleniumActions.getCommand(); 30 | String target=seleniumActions.getTarget(); 31 | String value= seleniumActions.getValue(); 32 | 33 | if("open".equalsIgnoreCase(command)){ 34 | driver.get(command); 35 | return; 36 | }else if("sleep".equalsIgnoreCase(command)){ 37 | try { 38 | Thread.sleep(Long.parseLong(value)); 39 | return; 40 | } catch (NumberFormatException e) { 41 | e.printStackTrace(); 42 | } catch (InterruptedException e) { 43 | e.printStackTrace(); 44 | } 45 | } 46 | 47 | WebElement webElement=null; 48 | if(target.startsWith("id=")){ 49 | webElement=driver.findElement(By.id(target.substring(3))); 50 | }else if(target.startsWith("css=")){ 51 | webElement=driver.findElement(By.cssSelector(target.substring(4))); 52 | // }else if("className".equalsIgnoreCase(action.getByType())){ 53 | // webElement=driver.findElement(By.className(action.getPath())); 54 | }else if(target.startsWith("link=")){ 55 | webElement=driver.findElement(By.linkText(target.substring(5))); 56 | }else if(target.startsWith("name=")){ 57 | webElement=driver.findElement(By.name(target.substring(5))); 58 | // }else if("partialLinkText".equalsIgnoreCase(action.getByType())){ 59 | // webElement=driver.findElement(By.partialLinkText(action.getPath())); 60 | // }else if("tagName".equalsIgnoreCase(action.getByType())){ 61 | // webElement=driver.findElement(By.tagName(action.getPath())); 62 | }else{ 63 | //default is xpath 64 | webElement=driver.findElement(By.xpath(target)); 65 | } 66 | 67 | 68 | if("clear".equalsIgnoreCase(command)){ 69 | webElement.clear(); 70 | }else if("sendKeys".equalsIgnoreCase(command)){ 71 | webElement.sendKeys(value); 72 | }else if("click".equalsIgnoreCase(command)){ 73 | //defalut is click 74 | webElement.click(); 75 | } 76 | } 77 | } 78 | -------------------------------------------------------------------------------- /fengchao/uumai-common/crawler-xsoup/src/main/java/us/codecraft/xsoup/CombiningEvaluator.java: -------------------------------------------------------------------------------- 1 | package us.codecraft.xsoup; 2 | 3 | import org.jsoup.helper.StringUtil; 4 | import org.jsoup.nodes.Element; 5 | import org.jsoup.select.Evaluator; 6 | 7 | import java.util.ArrayList; 8 | import java.util.Arrays; 9 | import java.util.Collection; 10 | import java.util.List; 11 | 12 | /** 13 | * Base combining (and, or) evaluator. 14 | * 15 | * Copy from {@link org.jsoup.select.CombiningEvaluator} because it is package visible 16 | * 17 | * @see org.jsoup.select.CombiningEvaluator 18 | */ 19 | abstract class CombiningEvaluator extends Evaluator { 20 | final List evaluators; 21 | 22 | CombiningEvaluator() { 23 | super(); 24 | evaluators = new ArrayList(); 25 | } 26 | 27 | CombiningEvaluator(Collection evaluators) { 28 | this(); 29 | this.evaluators.addAll(evaluators); 30 | } 31 | 32 | Evaluator rightMostEvaluator() { 33 | return evaluators.size() > 0 ? 
evaluators.get(evaluators.size() - 1) : null; 34 | } 35 | 36 | void replaceRightMostEvaluator(Evaluator replacement) { 37 | evaluators.set(evaluators.size() - 1, replacement); 38 | } 39 | 40 | static final class And extends CombiningEvaluator { 41 | And(Collection<Evaluator> evaluators) { 42 | super(evaluators); 43 | } 44 | 45 | And(Evaluator... evaluators) { 46 | this(Arrays.asList(evaluators)); 47 | } 48 | 49 | @Override 50 | public boolean matches(Element root, Element node) { 51 | for (int i = 0; i < evaluators.size(); i++) { 52 | Evaluator s = evaluators.get(i); 53 | if (!s.matches(root, node)) 54 | return false; 55 | } 56 | return true; 57 | } 58 | 59 | @Override 60 | public String toString() { 61 | return StringUtil.join(evaluators, " "); 62 | } 63 | } 64 | 65 | static final class Or extends CombiningEvaluator { 66 | 67 | Or(Collection<Evaluator> evaluators) { 68 | super(); 69 | this.evaluators.addAll(evaluators); 70 | } 71 | 72 | Or(Evaluator... evaluators) { 73 | this(Arrays.asList(evaluators)); 74 | } 75 | 76 | Or() { 77 | super(); 78 | } 79 | 80 | public void add(Evaluator e) { 81 | evaluators.add(e); 82 | } 83 | 84 | @Override 85 | public boolean matches(Element root, Element node) { 86 | for (int i = 0; i < evaluators.size(); i++) { 87 | Evaluator s = evaluators.get(i); 88 | if (s.matches(root, node)) 89 | return true; 90 | } 91 | return false; 92 | } 93 | 94 | @Override 95 | public String toString() { 96 | return String.format(":or%s", evaluators); 97 | } 98 | } 99 | } 100 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # uudatahive (Fengchao crawler system) 2 | Fengchao is a crawler system that lets you crawl websites and apps by doing nothing more than defining XPath expressions. It supports several parsing methods (XPath, regular expressions), several download methods (the HttpClient library, PhantomJs, JBrowser, Selenium-Firefox, Chrome) and several output targets (Excel, MongoDB). It can be deployed to Yarn, Docker or Mesos for distributed crawling without any code changes. 3 | 4 | # Crawler architecture 5 | A Java-based distributed crawler system that runs on a single machine and can be published to Yarn, Docker or Mesos for distributed operation without any modification. 6 | 7 | # Framework features 8 | 1. Runs on a single machine; defining a crawler takes only 2-3 lines of code; 9 | 10 | 2. Can be submitted to the UUData distributed crawler cloud for out-of-the-box distribution; (part of that code is still being cleaned up, open-sourcing planned) 11 | 12 | 3. Download methods: the Java standard library, the HttpClient library, PhantomJs, JBrowser, Selenium-Firefox, Chrome, MockDownload, FileDownload, ShellDownload; 13 | 14 | 4. Parsing methods: HTML XPath, JSON XPath, regular expressions and custom parser extensions 15 |
16 | 5. Output methods: local file output in txt and Excel formats, and MongoDB output (distributed) 17 | 18 | 19 | # The simplest crawler: 20 | 21 | Crawl the number of Baidu search results for a keyword (java) 22 | ``` 23 | public class BaiduSearch extends QuartzLocalAppMaster { 24 | @Override 25 | public void dobusiness() throws Exception { 26 | QuartzCrawlerTasker tasker = new QuartzCrawlerTasker(); 27 | tasker.setUrl("http://www.baidu.com/s?wd=java"); // define the URL 28 | tasker.addXpath("result", "//div[@class='nums']/text()"); // define the XPath of the result 29 | putDistributeTask(tasker); // execute 30 | } 31 | public static void main(String[] args) throws Exception { 32 | new BaiduSearch().init().start(); 33 | } 34 | } 35 | ``` 36 | Output of the run: 37 | ``` 38 | {"result":"百度为您找到相关结果约15,100,000个"} 39 | ``` 40 | 41 | 42 | 43 | # Download and install 44 | ``` 45 | PreCondition: JDK8, Maven and Git are installed 46 | git clone https://github.com/kanxg/uudatahive 47 | cd uudatahive 48 | ./bin/buildfengchao.sh // build the core 49 | ./bin/buildapp.sh example // build the demo project 50 | ``` 51 | 52 | Run the demo 53 | ``` 54 | cd build/fengchao 55 | ./uumai.sh com.uumai.crawer.quartz.search.baidu.BaiduSearch 56 | ``` 57 | 58 | # Developing a new crawler 59 | ``` 60 | cd crawler-website/crawler-example/ 61 | mvn eclipse:eclipse or mvn idea:idea to generate the project files 62 | import the project into Eclipse or IntelliJ 63 | define a class that extends QuartzLocalDebugAppMaster and add a dobusiness() method 64 | define what the crawler should fetch 65 | debug 66 | ``` 67 | 68 | # Developer guide 69 | 70 | [[Developer guide]](https://kanxg.gitbooks.io/uudatahive/content/) reference for using the Fengchao system 71 | 72 | 73 | [[XPath reference]](https://www.gitbook.com/book/kanxg/fengchao_xpath/details) contains XPath expressions that have already been worked out 74 | 75 | * price parsing for B2C stores such as JD, Amazon, Dangdang and Yihaodian 76 | * hotel parsing for Ctrip, Qunar and others 77 | * ticket price parsing for the various airlines 78 | * resume parsing for Lagou, Liepin, LinkedIn and others 79 | * search engine parsing 80 | * stock parsing 81 | 82 | 83 | 84 | # How the single-machine crawler works 85 | 86 | * The AppMaster creates a tasker, defines the URL, the XPath, the download type and the result format, and submits the tasker to a Worker; 87 | 88 | * The Worker takes the task, initializes, builds a downloader from the DownloadFactory, then downloads, parses and outputs; 89 | 90 | ![image](https://kanxg.gitbooks.io/uudatahive/content/img/uumai_fengchao.png) 91 | 92 | 93 | # How the distributed crawler works 94 | 95 | * The AppMaster creates taskers, serializes them and submits them to the task pool. 96 | 97 | * The AdminAppMaster submits the worker pool to the distributed system; the worker pool starts, pulls tasks from the task pool, deserializes the taskers, builds workers and runs the tasks; 98 | 99 | ![image](https://kanxg.gitbooks.io/uudatahive/content/img/uumai_distributed.png) 100 | 101 | # Submitting to a distributed system 102 | 103 | 1. The system currently supports five kinds of distributed deployment: 104 | 105 | a) YARN, the Hadoop distributed system 106 | 107 | b) Docker, virtualized distribution 108 | 109 | c) Mesos distributed system (not finished) 110 | 111 | d) Apache Storm (old system, deprecated) 112 | 113 | e) Standalone (old system, deprecated) 114 | 115 | 116 | 2. How do you submit to a distributed system with zero code changes? 117 | 118 | # Troubleshooting and sharing crawler definitions 119 | 120 | * Official QQ group: uudata Fengchao discussion group, 117354543 121 | -------------------------------------------------------------------------------- /fengchao/uumai-common/crawler-xsoup/src/main/java/com/uumai/crawler/selector/RegexSelector.java: -------------------------------------------------------------------------------- 1 | package com.uumai.crawler.selector; 2 | 3 | import org.apache.commons.lang3.StringUtils; 4 | 5 | import java.util.ArrayList; 6 | import java.util.List; 7 | import java.util.regex.Matcher; 8 | import java.util.regex.Pattern; 9 | import java.util.regex.PatternSyntaxException; 10 | 11 | /** 12 | * Selector in regex.
13 | * 14 | * @author
15 | * @since 0.1.0 16 | */ 17 | public class RegexSelector implements Selector { 18 | 19 | private String regexStr; 20 | 21 | private Pattern regex; 22 | 23 | private int group = 1; 24 | 25 | public RegexSelector(String regexStr, int group) { 26 | if (StringUtils.isBlank(regexStr)) { 27 | throw new IllegalArgumentException("regex must not be empty"); 28 | } 29 | // Check bracket for regex group. Add default group 1 if there is no group. 30 | // Only check if there exists the valid left parenthesis, leave regexp validation for Pattern. 31 | if (StringUtils.countMatches(regexStr, "(") - StringUtils.countMatches(regexStr, "\\(") == 32 | StringUtils.countMatches(regexStr, "(?:") - StringUtils.countMatches(regexStr, "\\(?:")) { 33 | regexStr = "(" + regexStr + ")"; 34 | } 35 | this.regexStr = regexStr; 36 | try { 37 | regex = Pattern.compile(regexStr, Pattern.DOTALL | Pattern.CASE_INSENSITIVE); 38 | } catch (PatternSyntaxException e) { 39 | throw new IllegalArgumentException("invalid regex", e); 40 | } 41 | this.group = group; 42 | } 43 | 44 | public RegexSelector(String regexStr) { 45 | this(regexStr, 1); 46 | } 47 | 48 | @Override 49 | public String select(String text) { 50 | return selectGroup(text).get(group); 51 | } 52 | 53 | @Override 54 | public List selectList(String text) { 55 | List strings = new ArrayList(); 56 | List results = selectGroupList(text); 57 | for (RegexResult result : results) { 58 | strings.add(result.get(group)); 59 | } 60 | return strings; 61 | } 62 | 63 | public RegexResult selectGroup(String text) { 64 | Matcher matcher = regex.matcher(text); 65 | if (matcher.find()) { 66 | String[] groups = new String[matcher.groupCount() + 1]; 67 | for (int i = 0; i < groups.length; i++) { 68 | groups[i] = matcher.group(i); 69 | } 70 | return new RegexResult(groups); 71 | } 72 | return RegexResult.EMPTY_RESULT; 73 | } 74 | 75 | public List selectGroupList(String text) { 76 | Matcher matcher = regex.matcher(text); 77 | List resultList = new ArrayList(); 78 | while (matcher.find()) { 79 | String[] groups = new String[matcher.groupCount() + 1]; 80 | for (int i = 0; i < groups.length; i++) { 81 | groups[i] = matcher.group(i); 82 | } 83 | resultList.add(new RegexResult(groups)); 84 | } 85 | return resultList; 86 | } 87 | 88 | @Override 89 | public String toString() { 90 | return regexStr; 91 | } 92 | 93 | } 94 | -------------------------------------------------------------------------------- /fengchao/uumai-common/crawler-util/src/main/java/com/uumai/crawer/util/license/LicenseValidateHelper.java: -------------------------------------------------------------------------------- 1 | package com.uumai.crawer.util.license; 2 | 3 | /** 4 | * Created by rock on 12/23/15. 
5 | */ 6 | 7 | import java.io.File; 8 | import java.io.FileOutputStream; 9 | import java.io.OutputStream; 10 | 11 | import com.uumai.crawer.util.UumaiProperties; 12 | import com.verhas.licensor.License; 13 | import org.joda.time.DateTime; 14 | import org.joda.time.format.DateTimeFormat; 15 | import org.joda.time.format.DateTimeFormatter; 16 | 17 | public class LicenseValidateHelper { 18 | 19 | private static final String license_file= UumaiProperties.getUUmaiHome() + "/licenses/uumai.license"; 20 | private static final String pubring_file= UumaiProperties.getUUmaiHome() + "/licenses/pubring.gpg"; 21 | 22 | public LicenseValidateHelper(){ 23 | 24 | } 25 | 26 | public LicenseInfo validate() throws Exception{ 27 | 28 | File licenseFile = new File(license_file); 29 | if(!licenseFile.exists()) 30 | throw new Exception("didn't detect license file from:"+license_file); 31 | File pubkeyFile = new File(pubring_file); 32 | if(!pubkeyFile.exists()) 33 | throw new Exception("didn't detect public key file from:"+pubring_file); 34 | 35 | // verify the licence file 36 | License license = new License(); 37 | 38 | if (license 39 | .loadKeyRing(pubring_file, null) 40 | .setLicenseEncodedFromFile(license_file).isVerified()) { 41 | // System.out.println(license.getFeature("edition")); 42 | // System.out.println(license.getFeature("valid-until")); 43 | // System.out.println(license.getFeature("distribute-pi-count")); 44 | LicenseInfo licenseInfo=new LicenseInfo(); 45 | licenseInfo.setUser(license.getFeature("user")); 46 | licenseInfo.setCompany(license.getFeature("company")); 47 | licenseInfo.setEmail(license.getFeature("email")); 48 | licenseInfo.setEdition(license.getFeature("edition")); 49 | licenseInfo.setRelease_version(license.getFeature("release-version")); 50 | licenseInfo.setValid_until(license.getFeature("valid-until")); 51 | licenseInfo.setDistribute_pi_count(license.getFeature("distribute-pi-count")); 52 | 53 | validateInfo(licenseInfo); 54 | 55 | return licenseInfo; 56 | } 57 | 58 | throw new Exception("unknown license check error! quit..."); 59 | } 60 | 61 | private void validateInfo(LicenseInfo licenseInfo) throws Exception{ 62 | // just check the validity period 63 | DateTime in = new DateTime(); 64 | 65 | DateTimeFormatter joda_fmt = DateTimeFormat.forPattern("yyyy.MM.dd"); // custom date format 66 | DateTime license_time= joda_fmt.parseDateTime(licenseInfo.getValid_until()); 67 | 68 | if(!in.isBefore(license_time)) 69 | throw new Exception("license has expired. please contact uumai!"); 70 | 71 | } 72 | 73 | public static void main(String[] args) throws Exception{ 74 | LicenseValidateHelper licenseValidateHelper=new LicenseValidateHelper(); 75 | licenseValidateHelper.validate(); 76 | } 77 | } 78 | -------------------------------------------------------------------------------- /fengchao/uumai-common/crawler-xsoup/src/main/java/com/uumai/crawler/selector/CssSelector.java: -------------------------------------------------------------------------------- 1 | package com.uumai.crawler.selector; 2 | 3 | import org.apache.commons.collections.CollectionUtils; 4 | import org.jsoup.nodes.Element; 5 | import org.jsoup.nodes.Node; 6 | import org.jsoup.nodes.TextNode; 7 | import org.jsoup.select.Elements; 8 | 9 | import java.util.ArrayList; 10 | import java.util.List; 11 | 12 | /** 13 | * CSS selector. Based on Jsoup. 14 | * 15 | * @author
16 | * @since 0.1.0 17 | */ 18 | public class CssSelector extends BaseElementSelector { 19 | 20 | private String selectorText; 21 | 22 | private String attrName; 23 | 24 | public CssSelector(String selectorText) { 25 | this.selectorText = selectorText; 26 | } 27 | 28 | public CssSelector(String selectorText, String attrName) { 29 | this.selectorText = selectorText; 30 | this.attrName = attrName; 31 | } 32 | 33 | private String getValue(Element element) { 34 | if (attrName == null) { 35 | return element.outerHtml(); 36 | } else if ("innerHtml".equalsIgnoreCase(attrName)) { 37 | return element.html(); 38 | } else if ("text".equalsIgnoreCase(attrName)) { 39 | return getText(element); 40 | } else if ("allText".equalsIgnoreCase(attrName)) { 41 | return element.text(); 42 | } else { 43 | return element.attr(attrName); 44 | } 45 | } 46 | 47 | protected String getText(Element element) { 48 | StringBuilder accum = new StringBuilder(); 49 | for (Node node : element.childNodes()) { 50 | if (node instanceof TextNode) { 51 | TextNode textNode = (TextNode) node; 52 | accum.append(textNode.text()); 53 | } 54 | } 55 | return accum.toString(); 56 | } 57 | 58 | @Override 59 | public String select(Element element) { 60 | List elements = selectElements(element); 61 | if (CollectionUtils.isEmpty(elements)) { 62 | return null; 63 | } 64 | return getValue(elements.get(0)); 65 | } 66 | 67 | @Override 68 | public List selectList(Element doc) { 69 | List strings = new ArrayList(); 70 | List elements = selectElements(doc); 71 | if (CollectionUtils.isNotEmpty(elements)) { 72 | for (Element element : elements) { 73 | String value = getValue(element); 74 | if (value != null) { 75 | strings.add(value); 76 | } 77 | } 78 | } 79 | return strings; 80 | } 81 | 82 | @Override 83 | public Element selectElement(Element element) { 84 | Elements elements = element.select(selectorText); 85 | if (CollectionUtils.isNotEmpty(elements)) { 86 | return elements.get(0); 87 | } 88 | return null; 89 | } 90 | 91 | @Override 92 | public List selectElements(Element element) { 93 | return element.select(selectorText); 94 | } 95 | 96 | @Override 97 | public boolean hasAttribute() { 98 | return attrName != null; 99 | } 100 | } 101 | -------------------------------------------------------------------------------- /fengchao/uumai-core/crawler-core/src/main/java/com/uumai/crawer2/download/selenium/WebDriverFactory.java: -------------------------------------------------------------------------------- 1 | package com.uumai.crawer2.download.selenium; 2 | 3 | import com.uumai.crawer.util.UumaiProperties; 4 | import com.uumai.crawer2.download.Download; 5 | import org.openqa.selenium.Proxy; 6 | import org.openqa.selenium.WebDriver; 7 | import org.openqa.selenium.chrome.ChromeDriver; 8 | import org.openqa.selenium.chrome.ChromeOptions; 9 | import org.openqa.selenium.firefox.FirefoxDriver; 10 | import org.openqa.selenium.htmlunit.HtmlUnitDriver; 11 | import org.openqa.selenium.phantomjs.PhantomJSDriver; 12 | import org.openqa.selenium.remote.CapabilityType; 13 | import org.openqa.selenium.remote.DesiredCapabilities; 14 | 15 | /** 16 | * Created by rock on 8/18/15. 
17 | */ 18 | public class WebDriverFactory { 19 | 20 | public static int sleepTime=1000; 21 | 22 | static{ 23 | // System.getProperties().setProperty("webdriver.chrome.driver", 24 | // + UumaiProperties.readconfig("webdriver.chrome.driver", "/kanxg/Dropbox/mysourcecode/uumai/bitbucket/shop_indexer/driver/chromedriver")); 25 | 26 | System.getProperties().setProperty("webdriver.chrome.driver", 27 | UumaiProperties.getUUmaiHome() + "/driver/chromedriver"); 28 | 29 | System.getProperties().setProperty("phantomjs.binary.path", 30 | UumaiProperties.getUUmaiHome() + "/driver/phantomjs"); 31 | } 32 | 33 | 34 | public static synchronized WebDriver getDriver(Download.DownloadType type,String proxyIpAndPort){ 35 | DesiredCapabilities capabilities = DesiredCapabilities.chrome(); 36 | 37 | if(proxyIpAndPort!=null){ 38 | // Add the WebDriver proxy capability. 39 | Proxy proxy = new Proxy(); 40 | proxy.setHttpProxy(proxyIpAndPort) 41 | .setFtpProxy(proxyIpAndPort) 42 | .setSslProxy(proxyIpAndPort); 43 | capabilities.setCapability(CapabilityType.PROXY, proxy); 44 | // The next three lines keep localhost and the Selenium driver itself off the proxy; they must be set, otherwise communication with the IE driver is impossible. 45 | capabilities.setCapability(CapabilityType.ForSeleniumServer.AVOIDING_PROXY, true); 46 | capabilities.setCapability(CapabilityType.ForSeleniumServer.ONLY_PROXYING_SELENIUM_TRAFFIC, true); 47 | System.setProperty("http.nonProxyHosts", "localhost"); 48 | 49 | } 50 | 51 | 52 | 53 | ChromeOptions options = new ChromeOptions(); 54 | // options.addArguments("start-maximized"); 55 | // options.addArguments("--no-startup-window"); 56 | // options.addArguments("silent-launch"); 57 | 58 | capabilities.setCapability(ChromeOptions.CAPABILITY, options); 59 | capabilities.setJavascriptEnabled(true); 60 | 61 | WebDriver driver=null; 62 | 63 | if(type== Download.DownloadType.firefox_download){ 64 | driver = new FirefoxDriver(capabilities); 65 | }else if(type== Download.DownloadType.chrome_download){ 66 | driver = new ChromeDriver(capabilities); 67 | }else if(type== Download.DownloadType.htmlunit_download){ 68 | driver = new HtmlUnitDriver(capabilities); 69 | }else { //if(type== Download.DownloadType.phantomjs_download){ 70 | driver = new PhantomJSDriver(capabilities); 71 | } 72 | 73 | return driver; 74 | } 75 | } 76 | -------------------------------------------------------------------------------- /fengchao/uumai-core/crawler-quartz/src/main/java/com/uumai/crawer/quartz/localdebug/QuartzLocalDebugCrawlerWorker.java: -------------------------------------------------------------------------------- 1 | package com.uumai.crawer.quartz.localdebug; 2 | 3 | import java.io.BufferedWriter; 4 | import java.io.FileWriter; 5 | import java.util.ArrayList; 6 | import java.util.Iterator; 7 | import java.util.List; 8 | import java.util.Map; 9 | 10 | import net.minidev.json.JSONArray; 11 | import net.minidev.json.JSONObject; 12 | 13 | import com.google.gson.Gson; 14 | import com.google.gson.JsonElement; 15 | import com.google.gson.JsonObject; 16 | import com.jayway.jsonpath.JsonPath; 17 | import com.uumai.crawer.quartz.JsonParseHelper; 18 | import com.uumai.crawer.quartz.QuartzCrawlerTasker; 19 | import com.uumai.crawer.quartz.result.QuartzXpathItem; 20 | import com.uumai.crawer.util.filesystem.ExcelFileUtil; 21 | import com.uumai.crawer2.CrawlerTasker; 22 | import com.uumai.crawer2.localdebug.LocalDebugCrawlerWorker; 23 | import com.uumai.dao.helper.Json2DBHelper; 24 | 25 | public class QuartzLocalDebugCrawlerWorker extends LocalDebugCrawlerWorker{ 26 | 27 | public QuartzLocalDebugCrawlerWorker(CrawlerTasker tasker) { 28 | super(tasker);
29 | } 30 | 31 | @Override 32 | protected void download() throws Exception { 33 | super.download(); 34 | } 35 | 36 | @Override 37 | protected void pipeline() throws Exception { 38 | 39 | QuartzCrawlerTasker quartztasker = (QuartzCrawlerTasker) tasker; 40 | 41 | JsonParseHelper jsonParseHelper=new JsonParseHelper(quartztasker,result); 42 | 43 | List list= jsonParseHelper.parse(); 44 | 45 | if(quartztasker.getStoreTableName()!=null){ 46 | if(quartztasker.getStoreTableName().endsWith(".xls")){ 47 | ExcelFileUtil util=new ExcelFileUtil(quartztasker.getStoreTableName()); 48 | for(JsonObject obj:list){ 49 | List columnvalues=new ArrayList(); 50 | Iterator i$ = obj.entrySet().iterator(); 51 | while(i$.hasNext()) { 52 | Map.Entry entry = (Map.Entry)i$.next(); 53 | String value= ((JsonElement)entry.getValue()).toString(); 54 | if(value!=null){ 55 | value=value.substring(1, value.length()-1); 56 | columnvalues.add(value); 57 | } 58 | 59 | } 60 | util.writeLine(columnvalues); 61 | 62 | } 63 | util.createWorkBook(); 64 | } else if(quartztasker.getStoreTableName().endsWith(".txt")){ 65 | BufferedWriter out=new BufferedWriter(new FileWriter(quartztasker.getStoreTableName(),false)); 66 | for(JsonObject obj:list){ 67 | out.write(obj.toString()); 68 | out.newLine(); 69 | } 70 | out.close(); 71 | }else{ 72 | for(JsonObject obj:list){ 73 | System.out.println(obj.toString()); 74 | } 75 | } 76 | 77 | }else{ 78 | for(JsonObject obj:list){ 79 | System.out.println(obj.toString()); 80 | } 81 | } 82 | 83 | 84 | } 85 | 86 | } 87 | -------------------------------------------------------------------------------- /fengchao/uumai-common/crawler-xsoup/src/main/java/com/uumai/crawler/selector/SmartContentSelector.java: -------------------------------------------------------------------------------- 1 | package com.uumai.crawler.selector; 2 | 3 | import java.util.ArrayList; 4 | import java.util.Arrays; 5 | import java.util.List; 6 | 7 | /** 8 | * 9 | * @author
10 | * @since 0.4.1 11 | * 12 | */ 13 | public class SmartContentSelector implements Selector { 14 | 15 | public SmartContentSelector() { 16 | } 17 | 18 | @Override 19 | public String select(String html) { 20 | html = html.replaceAll("(?is)<!DOCTYPE.*?>", ""); // remove doctype 21 | html = html.replaceAll("(?is)<!--.*?-->", ""); // remove html comment 22 | html = html.replaceAll("(?is)<script.*?>.*?</script>", ""); // remove javascript 23 | html = html.replaceAll("(?is)<style.*?>.*?</style>", ""); // remove css 24 | html = html.replaceAll("&.{2,5};|&#.{2,5};", " "); // remove special char 25 | html = html.replaceAll("(?is)<.*?>", ""); // strip the remaining tags 26 | List<String> lines; 27 | int blocksWidth =3; 28 | int threshold =86; 29 | int start; 30 | int end; 31 | StringBuilder text = new StringBuilder(); 32 | ArrayList<Integer> indexDistribution = new ArrayList<Integer>(); 33 | 34 | lines = Arrays.asList(html.split("\n")); 35 | 36 | for (int i = 0; i < lines.size() - blocksWidth; i++) { 37 | int wordsNum = 0; 38 | for (int j = i; j < i + blocksWidth; j++) { 39 | lines.set(j, lines.get(j).replaceAll("\\s+", "")); 40 | wordsNum += lines.get(j).length(); 41 | } 42 | indexDistribution.add(wordsNum); 43 | } 44 | 45 | start = -1; end = -1; 46 | boolean boolstart = false, boolend = false; 47 | text.setLength(0); 48 | 49 | for (int i = 0; i < indexDistribution.size() - 1; i++) { 50 | if (indexDistribution.get(i) > threshold && ! boolstart) { 51 | if (indexDistribution.get(i+1).intValue() != 0 52 | || indexDistribution.get(i+2).intValue() != 0 53 | || indexDistribution.get(i+3).intValue() != 0) { 54 | boolstart = true; 55 | start = i; 56 | continue; 57 | } 58 | } 59 | if (boolstart) { 60 | if (indexDistribution.get(i).intValue() == 0 61 | || indexDistribution.get(i+1).intValue() == 0) { 62 | end = i; 63 | boolend = true; 64 | } 65 | } 66 | StringBuilder tmp = new StringBuilder(); 67 | if (boolend) { 68 | //System.out.println(start+1 + "\t\t" + end+1); 69 | for (int ii = start; ii <= end; ii++) { 70 | if (lines.get(ii).length() < 5) continue; 71 | tmp.append(lines.get(ii) + "\n"); 72 | } 73 | String str = tmp.toString(); 74 | //System.out.println(str); 75 | if (str.contains("Copyright") ) continue; 76 | text.append(str); 77 | boolstart = boolend = false; 78 | } 79 | } 80 | return text.toString(); 81 | } 82 | 83 | @Override 84 | public List<String> selectList(String text) { 85 | throw new UnsupportedOperationException(); 86 | } 87 | } 88 | -------------------------------------------------------------------------------- /fengchao/uumai-common/crawler-xsoup/src/main/java/com/uumai/crawler/selector/AbstractSelectable.java: -------------------------------------------------------------------------------- 1 | package com.uumai.crawler.selector; 2 | 3 | import org.apache.commons.collections.CollectionUtils; 4 | 5 | import java.util.ArrayList; 6 | import java.util.List; 7 | 8 | /** 9 | * @author 10 | * @since 0.5.2 11 | */ 12 | public abstract class AbstractSelectable implements Selectable { 13 | 14 | protected abstract List getSourceTexts(); 15 | 16 | @Override 17 | public Selectable css(String selector) { 18 | return $(selector); 19 | } 20 | 21 | @Override 22 | public Selectable css(String selector, String attrName) { 23 | return $(selector, attrName); 24 | } 25 | 26 | protected Selectable select(Selector selector, List strings) { 27 | List results = new ArrayList(); 28 | for (String string : strings) { 29 | String result = selector.select(string); 30 | if (result != null) { 31 | results.add(result); 32 | } 33 | } 34 | return new PlainText(results); 35 | } 36 | 37 | protected Selectable selectList(Selector selector, List strings) { 38 |
List results = new ArrayList(); 39 | for (String string : strings) { 40 | List result = selector.selectList(string); 41 | results.addAll(result); 42 | } 43 | return new PlainText(results); 44 | } 45 | 46 | @Override 47 | public List all() { 48 | return getSourceTexts(); 49 | } 50 | 51 | @Override 52 | public Selectable jsonPath(String jsonPath) { 53 | throw new UnsupportedOperationException(); 54 | } 55 | 56 | @Override 57 | public String get() { 58 | if (CollectionUtils.isNotEmpty(all())) { 59 | return all().get(0); 60 | } else { 61 | return null; 62 | } 63 | } 64 | 65 | @Override 66 | public Selectable select(Selector selector) { 67 | return select(selector, getSourceTexts()); 68 | } 69 | 70 | @Override 71 | public Selectable selectList(Selector selector) { 72 | return selectList(selector, getSourceTexts()); 73 | } 74 | 75 | @Override 76 | public Selectable regex(String regex) { 77 | RegexSelector regexSelector = Selectors.regex(regex); 78 | return selectList(regexSelector, getSourceTexts()); 79 | } 80 | 81 | @Override 82 | public Selectable regex(String regex, int group) { 83 | RegexSelector regexSelector = Selectors.regex(regex, group); 84 | return selectList(regexSelector, getSourceTexts()); 85 | } 86 | 87 | @Override 88 | public Selectable replace(String regex, String replacement) { 89 | ReplaceSelector replaceSelector = new ReplaceSelector(regex,replacement); 90 | return select(replaceSelector, getSourceTexts()); 91 | } 92 | 93 | public String getFirstSourceText() { 94 | if (getSourceTexts() != null && getSourceTexts().size() > 0) { 95 | return getSourceTexts().get(0); 96 | } 97 | return null; 98 | } 99 | 100 | @Override 101 | public String toString() { 102 | return get(); 103 | } 104 | 105 | @Override 106 | public boolean match() { 107 | return getSourceTexts() != null && getSourceTexts().size() > 0; 108 | } 109 | } 110 | -------------------------------------------------------------------------------- /fengchao/uumai-core/crawler-core/src/main/java/com/uumai/crawer2/download/httpclient/HttpConnectionManager.java: -------------------------------------------------------------------------------- 1 | package com.uumai.crawer2.download.httpclient; 2 | 3 | import org.apache.http.HeaderElement; 4 | import org.apache.http.HeaderElementIterator; 5 | import org.apache.http.HttpHost; 6 | import org.apache.http.HttpResponse; 7 | import org.apache.http.client.protocol.HttpClientContext; 8 | import org.apache.http.conn.ConnectionKeepAliveStrategy; 9 | import org.apache.http.conn.routing.HttpRoute; 10 | import org.apache.http.impl.client.CloseableHttpClient; 11 | import org.apache.http.impl.client.HttpClients; 12 | import org.apache.http.impl.conn.PoolingHttpClientConnectionManager; 13 | import org.apache.http.message.BasicHeaderElementIterator; 14 | import org.apache.http.protocol.HTTP; 15 | import org.apache.http.protocol.HttpContext; 16 | 17 | /** 18 | * Created by kanxg on 14-12-30. 
19 | */ 20 | public class HttpConnectionManager { 21 | 22 | static PoolingHttpClientConnectionManager cm; 23 | 24 | static CloseableHttpClient httpClient; 25 | 26 | static { 27 | cm = new PoolingHttpClientConnectionManager(); 28 | // Increase max total connection to 200 29 | cm.setMaxTotal(200); 30 | // Increase default max connection per route to 20 31 | cm.setDefaultMaxPerRoute(20); 32 | // Increase max connections for localhost:80 to 50 33 | HttpHost localhost = new HttpHost("www.amazon.com", 80); 34 | cm.setMaxPerRoute(new HttpRoute(localhost), 30); 35 | 36 | ConnectionKeepAliveStrategy myStrategy = new ConnectionKeepAliveStrategy() { 37 | 38 | public long getKeepAliveDuration(HttpResponse response, HttpContext context) { 39 | // Honor 'keep-alive' header 40 | HeaderElementIterator it = new BasicHeaderElementIterator( 41 | response.headerIterator(HTTP.CONN_KEEP_ALIVE)); 42 | while (it.hasNext()) { 43 | HeaderElement he = it.nextElement(); 44 | String param = he.getName(); 45 | String value = he.getValue(); 46 | if (value != null && param.equalsIgnoreCase("timeout")) { 47 | try { 48 | return Long.parseLong(value) * 1000; 49 | } catch(NumberFormatException ignore) { 50 | } 51 | } 52 | } 53 | HttpHost target = (HttpHost) context.getAttribute( 54 | HttpClientContext.HTTP_TARGET_HOST); 55 | if ("www.amazon.com".equalsIgnoreCase(target.getHostName())) { 56 | // Keep alive for 5 seconds only 57 | return 1 * 1000; 58 | } else { 59 | // otherwise keep alive for 30 seconds 60 | return 30 * 1000; 61 | } 62 | } 63 | 64 | }; 65 | httpClient = HttpClients.custom() 66 | .setConnectionManager(cm) 67 | .setKeepAliveStrategy(myStrategy) 68 | //.setConnectionReuseStrategy(NoConnectionReuseStrategy.INSTANCE) 69 | .build(); 70 | 71 | Thread IdleConnectionMonitorThread= new IdleConnectionMonitorThread(cm); 72 | IdleConnectionMonitorThread.setDaemon(true); 73 | IdleConnectionMonitorThread.start(); 74 | } 75 | 76 | 77 | 78 | public static CloseableHttpClient getHttpClient() { 79 | return httpClient; 80 | } 81 | 82 | 83 | } 84 | -------------------------------------------------------------------------------- /fengchao/uumai-distribute-sys/uumai-yarn/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 15 | 19 | 20 | com.uumai 21 | uumai-distribute-sys 22 | 1.0 23 | 24 | 25 | 26 | 4.0.0 27 | 28 | com.uumai 29 | uumai-yarn 30 | jar 31 | 1.0 32 | 33 | uumai-yarn 34 | 35 | 36 | 37 | com.uumai 38 | uumai-crawler-multi-core 39 | 1.0 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | org.apache.hadoop 49 | hadoop-hdfs 50 | 2.6.0 51 | 52 | 53 | 54 | 55 | 56 | org.apache.hadoop 57 | hadoop-common 58 | 2.6.0 59 | 60 | 61 | 62 | 63 | org.apache.hadoop 64 | hadoop-annotations 65 | 2.6.0 66 | 67 | 68 | 69 | org.apache.hadoop 70 | hadoop-yarn-api 71 | 2.6.0 72 | 73 | 74 | 75 | org.apache.hadoop 76 | hadoop-yarn-common 77 | 2.6.0 78 | 79 | 80 | javax.servlet 81 | servlet-api 82 | 83 | 84 | 85 | 86 | 87 | org.apache.hadoop 88 | hadoop-yarn-client 89 | 2.6.0 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | 100 | 101 | --------------------------------------------------------------------------------
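A minimal usage sketch for the pooled client exposed by HttpConnectionManager above. This is not a file in the repository: the class name, the target URL and the response handling are illustrative assumptions; only standard Apache HttpClient 4.x calls (HttpGet, execute, EntityUtils) are used, and the shared client is deliberately not closed because it is a process-wide singleton.

```java
package com.uumai.crawer2.download.httpclient;

import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.util.EntityUtils;

// Hypothetical example class, placed in the same package as HttpConnectionManager only for brevity.
public class HttpConnectionManagerExample {

    public static void main(String[] args) throws Exception {
        // Reuse the shared pooled client; do not close it here, it is shared by all downloads.
        CloseableHttpClient client = HttpConnectionManager.getHttpClient();

        HttpGet get = new HttpGet("http://www.amazon.com/"); // illustrative URL
        CloseableHttpResponse response = client.execute(get);
        try {
            // Consume the entity fully so the connection is returned to the pool.
            String body = EntityUtils.toString(response.getEntity());
            System.out.println("status=" + response.getStatusLine().getStatusCode()
                    + ", bytes=" + body.length());
        } finally {
            response.close();
        }
    }
}
```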