├── docs ├── README.md ├── about.md ├── img │ ├── code_arch.png │ ├── use_case.png │ ├── limitation.png │ ├── system_tire.png │ ├── software_package.png │ ├── deploy_arch_cluster.png │ ├── deploy_arch_single.png │ ├── thread_queue_per_thread.png │ └── thread_shared_memqueue.png ├── 特色功能.md ├── 缓存表.md ├── 爬虫主要模块.md ├── 集群部署组件.md ├── index.md ├── springboot-mybatis_use.md ├── 用户部署.md ├── url队列表.md ├── 单机部署组件.md ├── 部署图-单机-集群.md └── 技术选型.md ├── jscrapy-core └── src │ ├── test │ ├── resources │ │ ├── LocalFilePiplineTest.yaml │ │ ├── H2CacherTest.yaml │ │ ├── yaml2beanTest.yaml │ │ ├── H2UrlConsumerTest.yaml │ │ ├── MongoCacherTest.yaml │ │ ├── MongoDedupTest.yaml │ │ ├── log4j.properties │ │ └── db.properties │ └── java │ │ ├── util │ │ ├── ResourcePathUtils.java │ │ └── Yaml2BeanUtilTest.java │ │ ├── dal │ │ ├── PgQueueTest.java │ │ ├── H2QueueTest.java │ │ └── QueueTest.java │ │ ├── downloader │ │ └── HttpDownloaderTest.java │ │ ├── request │ │ └── FetchRequestTest.java │ │ ├── dedup │ │ └── DeDupTest.java │ │ ├── cacher │ │ └── CacherTest.java │ │ ├── urlproducer │ │ └── UrlProducerTest.java │ │ └── urlconsumer │ │ └── UrlConsumerTest.java │ └── main │ ├── java │ └── org │ │ └── jscrapy │ │ └── core │ │ ├── exp │ │ ├── ExceptionCode.java │ │ ├── pipline │ │ │ └── PiplineExp.java │ │ ├── consumer │ │ │ └── ConsumerExp.java │ │ ├── producer │ │ │ └── ProducerExp.java │ │ ├── assemble │ │ │ └── UrlAssembleExp.java │ │ ├── processor │ │ │ └── ProcessorExp.java │ │ ├── downloader │ │ │ └── DownloaderExp.java │ │ └── BaseExp.java │ │ ├── request │ │ ├── UrlType.java │ │ ├── UrlStatus.java │ │ ├── HttpRequestMethod.java │ │ ├── Request.java │ │ ├── RequestContext.java │ │ └── HttpRequest.java │ │ ├── spider │ │ ├── impl │ │ │ ├── SessionBindSpider.java │ │ │ └── GenericSpider.java │ │ └── Spider.java │ │ ├── dal │ │ ├── h2cache │ │ │ └── H2PageCacheMapper.java │ │ ├── pgcache │ │ │ └── PgPageCacheMapper.java │ │ ├── h2queue │ │ │ ├── H2UrlQueueMapper.java │ │ │ └── 
H2UrlQueueDo.java │ │ ├── pgqueue │ │ │ ├── PgUrlQueueMapper.java │ │ │ └── PgUrlQueueDo.java │ │ ├── QueueLockMapper.java │ │ ├── PageCacheMapper.java │ │ ├── QueueLockDo.java │ │ ├── UrlQueueMapper.java │ │ ├── PageCacheDo.java │ │ └── UrlQueueDo.java │ │ ├── plugin │ │ ├── OrderValue.java │ │ ├── Plugin.java │ │ ├── PluginOrder.java │ │ └── PluginChain.java │ │ ├── processor │ │ ├── parser │ │ │ └── Parser.java │ │ ├── ParsersTable.java │ │ ├── impl │ │ │ └── GroovyProcessor.java │ │ └── Processor.java │ │ ├── ConfigAble.java │ │ ├── dedup │ │ ├── impl │ │ │ ├── H2Dedup.java │ │ │ └── PgDedup.java │ │ └── DeDup.java │ │ ├── memqueue │ │ ├── MemQueue.java │ │ └── impl │ │ │ ├── MemFIFOQueue.java │ │ │ └── MemFILOQueue.java │ │ ├── pipline │ │ ├── impl │ │ │ ├── H2Pipline.java │ │ │ └── PgPipline.java │ │ ├── plugin │ │ │ ├── after │ │ │ │ └── BeforePlugin.java │ │ │ └── before │ │ │ │ └── DefaultFieldsAddPlugin.java │ │ └── Pipline.java │ │ ├── downloader │ │ ├── Downloader.java │ │ ├── impl │ │ │ ├── OkHttpDownloaderImpl.java │ │ │ └── HttpDownloader.java │ │ └── DownloadResponse.java │ │ ├── parser │ │ └── Html.java │ │ ├── cacher │ │ ├── impl │ │ │ ├── PgCacher.java │ │ │ └── H2Cacher.java │ │ └── Cacher.java │ │ ├── JscrapyComponent.java │ │ ├── config │ │ ├── modulecfg │ │ │ ├── TaskComponentConfig.java │ │ │ ├── MongoDedepConfig.java │ │ │ └── H2QueueConfig.java │ │ ├── SysDefaultConfig.java │ │ └── ConfigKeys.java │ │ ├── producer │ │ ├── UrlProducer.java │ │ └── impl │ │ │ └── H2UrlProducer.java │ │ ├── data │ │ ├── DataItem.java │ │ └── ProcessResult.java │ │ ├── proxy │ │ ├── WatchableSpiderProxy.java │ │ └── SpiderProxy.java │ │ ├── comsumer │ │ ├── UrlConsumer.java │ │ └── impl │ │ │ └── H2UrlConsumer.java │ │ ├── util │ │ ├── Yaml2BeanUtil.java │ │ └── ClassLoadUtil.java │ │ ├── page │ │ └── Page.java │ │ ├── status │ │ └── TaskStatus.java │ │ ├── task │ │ └── Task.java │ │ └── TaskManager.java │ └── resources │ ├── spring │ ├── h2.xml │ └── 
spring-mybatis.xml │ ├── applicationContext.properties │ ├── applicationContext.xml │ └── mapper │ ├── h2pagecache.xml │ ├── queue_lock.xml │ ├── pgqueue.xml │ └── h2queue.xml ├── jscrapy-ext ├── src │ ├── test │ │ ├── resources │ │ │ ├── MapdbSchedulerTest.yaml │ │ │ ├── RedisDedupTest.yaml │ │ │ └── RedisSchedulerTest.yaml │ │ └── java │ │ │ ├── util │ │ │ └── ResourcePathUtils.java │ │ │ ├── dedup │ │ │ ├── SnakYamlTest.java │ │ │ └── DeDupExtTest.java │ │ │ └── pipline │ │ │ └── LocalFilePiplineTest.java │ └── main │ │ └── java │ │ └── org │ │ └── jscrapy │ │ └── ext │ │ ├── modulecfg │ │ ├── RedisDedupConfig.java │ │ ├── RedisSchedulerConfig.java │ │ └── RedisConfig.java │ │ ├── dedup │ │ ├── RedisDedup.java │ │ └── MongoDedup.java │ │ ├── cacher │ │ └── MongoCacher.java │ │ ├── scheduler │ │ └── RedisScheduler.java │ │ ├── pipline │ │ └── LocalFilePipline.java │ │ └── fetcher │ │ ├── HttpCharsetDetector.java │ │ └── ApacheHttpFetcher.java └── pom.xml ├── .travis.yml ├── env └── env-dev.properties ├── jscrapy-service ├── src │ └── main │ │ ├── resources │ │ ├── banner.txt │ │ ├── application.properties │ │ └── log4j.properties │ │ └── java │ │ └── org │ │ └── jscrapy │ │ └── service │ │ └── Application.java └── pom.xml ├── jscrapy-common ├── src │ ├── main │ │ └── java │ │ │ └── org │ │ │ └── jscrapy │ │ │ └── common │ │ │ ├── datetime │ │ │ └── DatetimeUtil.java │ │ │ ├── http │ │ │ └── HttpHeaderConstant.java │ │ │ ├── file │ │ │ ├── FileLines.java │ │ │ └── FileLineReader.java │ │ │ ├── js │ │ │ ├── JsExecuteResult.java │ │ │ └── JsExecuteUtil.java │ │ │ └── log │ │ │ ├── MyLoggerFactory.java │ │ │ └── TaskLogUtil.java │ └── test │ │ └── java │ │ └── log │ │ ├── TaskLogUtilTest.java │ │ └── MyLoggerFactoryTest.java └── pom.xml ├── .gitignore ├── jscrapy-admin └── pom.xml └── README.md /docs/README.md: -------------------------------------------------------------------------------- 1 | 
-------------------------------------------------------------------------------- /docs/about.md: -------------------------------------------------------------------------------- 1 | # 关于 2 | 3 | 4 | 5 | ## 作者简介 6 | 7 | 8 | 9 | ## 感谢 10 | 11 | -------------------------------------------------------------------------------- /docs/img/code_arch.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jwlyn/jscrapy/HEAD/docs/img/code_arch.png -------------------------------------------------------------------------------- /docs/img/use_case.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jwlyn/jscrapy/HEAD/docs/img/use_case.png -------------------------------------------------------------------------------- /docs/img/limitation.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jwlyn/jscrapy/HEAD/docs/img/limitation.png -------------------------------------------------------------------------------- /docs/img/system_tire.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jwlyn/jscrapy/HEAD/docs/img/system_tire.png -------------------------------------------------------------------------------- /docs/img/software_package.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jwlyn/jscrapy/HEAD/docs/img/software_package.png -------------------------------------------------------------------------------- /docs/img/deploy_arch_cluster.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jwlyn/jscrapy/HEAD/docs/img/deploy_arch_cluster.png -------------------------------------------------------------------------------- /docs/img/deploy_arch_single.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/jwlyn/jscrapy/HEAD/docs/img/deploy_arch_single.png -------------------------------------------------------------------------------- /docs/img/thread_queue_per_thread.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jwlyn/jscrapy/HEAD/docs/img/thread_queue_per_thread.png -------------------------------------------------------------------------------- /docs/img/thread_shared_memqueue.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jwlyn/jscrapy/HEAD/docs/img/thread_shared_memqueue.png -------------------------------------------------------------------------------- /jscrapy-core/src/test/resources/LocalFilePiplineTest.yaml: -------------------------------------------------------------------------------- 1 | taskBaseConfig: 2 | taskId: task.id 3 | taskName: task.name 4 | -------------------------------------------------------------------------------- /docs/特色功能.md: -------------------------------------------------------------------------------- 1 | ## 特色功能 2 | - 实时调用抓取。应用场景:博客搬家(支持消息回调通知)。 3 | - 近实时抓取。应用场景:监控新闻站点,持续增量更新(分钟级别,间隔时间可设置)。 4 | - 批量抓取。间隔时间较长全量遍历抓取(自动调度)。 5 | 6 | 7 | -------------------------------------------------------------------------------- /jscrapy-core/src/test/resources/H2CacherTest.yaml: -------------------------------------------------------------------------------- 1 | taskBaseConfig: 2 | taskId: task_id 3 | taskName: cacherTest 4 | groupId: TASK_VIRTUAL_ID 5 | -------------------------------------------------------------------------------- /jscrapy-ext/src/test/resources/MapdbSchedulerTest.yaml: -------------------------------------------------------------------------------- 1 | taskBaseConfig: 2 | taskId: task.id 3 | taskName: task.name 4 | groupId: TASK.VIRTUAL.ID 5 | 
-------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: java 2 | jdk: 3 | - oraclejdk8 4 | - oraclejdk7 5 | services: 6 | - mongodb 7 | - redis 8 | - postgresql 9 | 10 | -------------------------------------------------------------------------------- /jscrapy-core/src/main/java/org/jscrapy/core/exp/ExceptionCode.java: -------------------------------------------------------------------------------- 1 | package org.jscrapy.core.exp; 2 | 3 | /** 4 | * Created by cxu on 2015/6/22. 5 | */ 6 | public enum ExceptionCode { 7 | 8 | } 9 | -------------------------------------------------------------------------------- /jscrapy-core/src/main/java/org/jscrapy/core/request/UrlType.java: -------------------------------------------------------------------------------- 1 | package org.jscrapy.core.request; 2 | 3 | /** 4 | * Created by cxu on 2017/1/21. 5 | */ 6 | public enum UrlType { 7 | SEED, LIST, DETAIL; 8 | } 9 | -------------------------------------------------------------------------------- /jscrapy-core/src/main/java/org/jscrapy/core/spider/impl/SessionBindSpider.java: -------------------------------------------------------------------------------- 1 | package org.jscrapy.core.spider.impl; 2 | 3 | /** 4 | * Created by cxu on 2018/2/8. 5 | */ 6 | public class SessionBindSpider { 7 | } 8 | -------------------------------------------------------------------------------- /jscrapy-core/src/main/java/org/jscrapy/core/request/UrlStatus.java: -------------------------------------------------------------------------------- 1 | package org.jscrapy.core.request; 2 | 3 | /** 4 | * Created by cxu on 2017/1/21. 
5 | */ 6 | public enum UrlStatus { 7 | NEW, OUT_QUEUE, ERROR; 8 | } 9 | -------------------------------------------------------------------------------- /jscrapy-core/src/test/resources/yaml2beanTest.yaml: -------------------------------------------------------------------------------- 1 | taskBaseConfig: 2 | taskId: task.id 3 | taskName: task.name 4 | urlFetchSize: 5 5 | threadCount: 1 6 | waitOnQueueEmptyMs: 200 7 | groupId: TASK.VIRTUAL.ID 8 | 9 | -------------------------------------------------------------------------------- /jscrapy-ext/src/main/java/org/jscrapy/ext/modulecfg/RedisDedupConfig.java: -------------------------------------------------------------------------------- 1 | package org.jscrapy.ext.modulecfg; 2 | 3 | /** 4 | * Created by cxu on 2017/1/16. 5 | */ 6 | public class RedisDedupConfig extends RedisConfig { 7 | } 8 | -------------------------------------------------------------------------------- /jscrapy-ext/src/main/java/org/jscrapy/ext/modulecfg/RedisSchedulerConfig.java: -------------------------------------------------------------------------------- 1 | package org.jscrapy.ext.modulecfg; 2 | 3 | /** 4 | * Created by cxu on 2017/1/18. 5 | */ 6 | public class RedisSchedulerConfig extends RedisConfig { 7 | } 8 | -------------------------------------------------------------------------------- /jscrapy-core/src/main/java/org/jscrapy/core/request/HttpRequestMethod.java: -------------------------------------------------------------------------------- 1 | package org.jscrapy.core.request; 2 | 3 | /** 4 | * Created by cxu on 2014/11/21. 
5 | */ 6 | public enum HttpRequestMethod { 7 | GET, POST, DELETE, TRACE, HEAD; 8 | } 9 | -------------------------------------------------------------------------------- /jscrapy-core/src/main/java/org/jscrapy/core/exp/pipline/PiplineExp.java: -------------------------------------------------------------------------------- 1 | package org.jscrapy.core.exp.pipline; 2 | 3 | import org.jscrapy.core.exp.BaseExp; 4 | 5 | /** 6 | * Created by cxu on 2018/2/12. 7 | */ 8 | public class PiplineExp extends BaseExp { 9 | } 10 | -------------------------------------------------------------------------------- /jscrapy-core/src/main/java/org/jscrapy/core/exp/consumer/ConsumerExp.java: -------------------------------------------------------------------------------- 1 | package org.jscrapy.core.exp.consumer; 2 | 3 | import org.jscrapy.core.exp.BaseExp; 4 | 5 | /** 6 | * Created by cxu on 2018/2/12. 7 | */ 8 | public class ConsumerExp extends BaseExp { 9 | } 10 | -------------------------------------------------------------------------------- /jscrapy-core/src/main/java/org/jscrapy/core/exp/producer/ProducerExp.java: -------------------------------------------------------------------------------- 1 | package org.jscrapy.core.exp.producer; 2 | 3 | import org.jscrapy.core.exp.BaseExp; 4 | 5 | /** 6 | * Created by cxu on 2018/2/12. 7 | */ 8 | public class ProducerExp extends BaseExp { 9 | } 10 | -------------------------------------------------------------------------------- /jscrapy-core/src/main/java/org/jscrapy/core/exp/assemble/UrlAssembleExp.java: -------------------------------------------------------------------------------- 1 | package org.jscrapy.core.exp.assemble; 2 | 3 | import org.jscrapy.core.exp.BaseExp; 4 | 5 | /** 6 | * Created by cxu on 2018/2/12. 
7 | */ 8 | public class UrlAssembleExp extends BaseExp { 9 | } 10 | -------------------------------------------------------------------------------- /jscrapy-core/src/main/java/org/jscrapy/core/exp/processor/ProcessorExp.java: -------------------------------------------------------------------------------- 1 | package org.jscrapy.core.exp.processor; 2 | 3 | import org.jscrapy.core.exp.BaseExp; 4 | 5 | /** 6 | * Created by cxu on 2018/2/12. 7 | */ 8 | public class ProcessorExp extends BaseExp { 9 | } 10 | -------------------------------------------------------------------------------- /jscrapy-core/src/main/java/org/jscrapy/core/exp/downloader/DownloaderExp.java: -------------------------------------------------------------------------------- 1 | package org.jscrapy.core.exp.downloader; 2 | 3 | import org.jscrapy.core.exp.BaseExp; 4 | 5 | /** 6 | * Created by cxu on 2018/2/12. 7 | */ 8 | public class DownloaderExp extends BaseExp { 9 | } 10 | -------------------------------------------------------------------------------- /jscrapy-core/src/main/java/org/jscrapy/core/dal/h2cache/H2PageCacheMapper.java: -------------------------------------------------------------------------------- 1 | package org.jscrapy.core.dal.h2cache; 2 | 3 | import org.jscrapy.core.dal.PageCacheMapper; 4 | 5 | /** 6 | * Created by cxu on 2018/2/8. 7 | */ 8 | public interface H2PageCacheMapper extends PageCacheMapper { 9 | } 10 | -------------------------------------------------------------------------------- /jscrapy-core/src/main/java/org/jscrapy/core/dal/pgcache/PgPageCacheMapper.java: -------------------------------------------------------------------------------- 1 | package org.jscrapy.core.dal.pgcache; 2 | 3 | import org.jscrapy.core.dal.PageCacheMapper; 4 | 5 | /** 6 | * Created by cxu on 2018/2/8. 
7 | */ 8 | public interface PgPageCacheMapper extends PageCacheMapper { 9 | } 10 | -------------------------------------------------------------------------------- /jscrapy-core/src/main/java/org/jscrapy/core/plugin/OrderValue.java: -------------------------------------------------------------------------------- 1 | package org.jscrapy.core.plugin; 2 | 3 | /** 4 | * Created by cxu on 2018/2/12. 5 | */ 6 | 7 | public class OrderValue { 8 | public static final int AF_PIP_DEF_FIELD_ADD = 1000; 9 | 10 | 11 | public static final int BF_PIP_ = 2000; 12 | } 13 | -------------------------------------------------------------------------------- /jscrapy-core/src/test/resources/H2UrlConsumerTest.yaml: -------------------------------------------------------------------------------- 1 | !!org.jscrapy.core.config.JscrapyConfig 2 | taskBaseConfig: 3 | taskId: task_id 4 | taskName: task_name 5 | groupId: TASK_VIRTUAL_ID 6 | 7 | taskComponentConfigs: 8 | QUEUE_H2: !!org.jscrapy.core.config.modulecfg.H2QueueConfig 9 | queueName: h2_consumer_test_queue 10 | -------------------------------------------------------------------------------- /env/env-dev.properties: -------------------------------------------------------------------------------- 1 | mongo.cacher.host=mongo.jscrapy.org 2 | mongo.cacher.port=27017 3 | 4 | mongo.dedup.host=mongo.jscrapy.org 5 | mongo.dedup.port=27017 6 | 7 | redis.dedup.host=redis.jscrapy.org 8 | redis.scheduler.host=redis.jscrapy.org 9 | redis.port=6379 10 | 11 | postgresql.host=postgresql.jscrapy.org 12 | postgresql.port=5432 13 | -------------------------------------------------------------------------------- /jscrapy-core/src/main/java/org/jscrapy/core/processor/parser/Parser.java: -------------------------------------------------------------------------------- 1 | package org.jscrapy.core.processor.parser; 2 | 3 | import org.jscrapy.core.data.ProcessResult; 4 | import org.jscrapy.core.page.Page; 5 | 6 | /** 7 | * Created by cxu on 2017/2/7. 
8 | */ 9 | public interface Parser { 10 | public ProcessResult parse(Page page); 11 | } 12 | -------------------------------------------------------------------------------- /jscrapy-core/src/main/java/org/jscrapy/core/dal/h2queue/H2UrlQueueMapper.java: -------------------------------------------------------------------------------- 1 | package org.jscrapy.core.dal.h2queue; 2 | 3 | import org.apache.ibatis.annotations.Mapper; 4 | import org.jscrapy.core.dal.UrlQueueMapper; 5 | 6 | /** 7 | * Created by cxu on 2016/8/1. 8 | */ 9 | @Mapper 10 | public interface H2UrlQueueMapper extends UrlQueueMapper { 11 | } -------------------------------------------------------------------------------- /jscrapy-core/src/main/java/org/jscrapy/core/processor/ParsersTable.java: -------------------------------------------------------------------------------- 1 | package org.jscrapy.core.processor; 2 | 3 | import org.jscrapy.core.processor.parser.Parser; 4 | 5 | /** 6 | * Created by cxu on 2016/7/27. 7 | */ 8 | public class ParsersTable { 9 | 10 | public Parser getProcessorRule() { 11 | return null;//todo 12 | } 13 | } 14 | -------------------------------------------------------------------------------- /jscrapy-ext/src/test/resources/RedisDedupTest.yaml: -------------------------------------------------------------------------------- 1 | !!org.jscrapy.core.config.JscrapyConfig 2 | taskBaseConfig: 3 | taskId: test-id 4 | taskName: test-task_name 5 | groupId: TASK.VIRTUAL.ID 6 | 7 | taskComponentConfigs: 8 | DEDUP_REDIS: !!org.jscrapy.ext.modulecfg.RedisDedupConfig 9 | host: ${redis.dedup.host} 10 | port: ${redis.port} 11 | -------------------------------------------------------------------------------- /jscrapy-core/src/main/java/org/jscrapy/core/dal/pgqueue/PgUrlQueueMapper.java: -------------------------------------------------------------------------------- 1 | package org.jscrapy.core.dal.pgqueue; 2 | 3 | import org.apache.ibatis.annotations.Mapper; 4 | import 
org.jscrapy.core.dal.UrlQueueMapper; 5 | 6 | /** 7 | * Created by cxu on 2016/8/1. 8 | */ 9 | @Mapper 10 | public interface PgUrlQueueMapper extends UrlQueueMapper { 11 | } 12 | -------------------------------------------------------------------------------- /jscrapy-ext/src/test/resources/RedisSchedulerTest.yaml: -------------------------------------------------------------------------------- 1 | !!org.jscrapy.core.config.JscrapyConfig 2 | taskBaseConfig: 3 | taskId: test-id-redis-sched 4 | taskName: test-task_name 5 | groupId: TASK.VIRTUAL.ID 6 | 7 | taskComponentConfigs: 8 | DEDUP_REDIS: !!org.jscrapy.ext.modulecfg.RedisSchedulerConfig 9 | host: ${redis.scheduler.host} 10 | port: ${redis.port} 11 | -------------------------------------------------------------------------------- /jscrapy-service/src/main/resources/banner.txt: -------------------------------------------------------------------------------- 1 | ___ 2 | |_ | 3 | | |___ ___ _ __ __ _ _ __ _ _ 4 | | / __|/ __| '__/ _` | '_ \| | | | 5 | /\__/ \__ | (__| | | (_| | |_) | |_| | 6 | \____/|___/\___|_| \__,_| .__/ \__, | 7 | | | __/ | 8 | |_| |___/ 9 | 10 | http://www.kammerl.de/ascii/AsciiSignature.php 11 | -------------------------------------------------------------------------------- /jscrapy-core/src/main/java/org/jscrapy/core/ConfigAble.java: -------------------------------------------------------------------------------- 1 | package org.jscrapy.core; 2 | 3 | import org.jscrapy.core.config.JscrapyConfig; 4 | 5 | /** 6 | * Created by cxu on 2016/7/26. 
7 | */ 8 | public interface ConfigAble { 9 | 10 | public abstract JscrapyConfig getJscrapyConfig(); 11 | 12 | public abstract void setJscrapyConfig(JscrapyConfig jscrapyConfig); 13 | } 14 | -------------------------------------------------------------------------------- /jscrapy-core/src/test/resources/MongoCacherTest.yaml: -------------------------------------------------------------------------------- 1 | !!org.jscrapy.core.config.JscrapyConfig 2 | taskBaseConfig: 3 | taskId: task.id 4 | taskName: task.name 5 | groupId: TASK.VIRTUAL.ID 6 | 7 | taskComponentConfigs: 8 | DEDUP_MONGO: !!org.jscrapy.core.config.modulecfg.MongoDedepConfig 9 | dbName: jscrapy_mongo_cacher 10 | host: ${mongo.cacher.host} 11 | port: ${mongo.cacher.port} 12 | -------------------------------------------------------------------------------- /jscrapy-core/src/test/resources/MongoDedupTest.yaml: -------------------------------------------------------------------------------- 1 | !!org.jscrapy.core.config.JscrapyConfig 2 | taskBaseConfig: 3 | taskId: task.id 4 | taskName: task.name 5 | groupId: TASK.VIRTUAL.ID 6 | 7 | taskComponentConfigs: 8 | DEDUP_MONGO: !!org.jscrapy.core.config.modulecfg.MongoDedepConfig 9 | dbName: jscrapy_test_dedup 10 | host: ${mongo.dedup.host} 11 | port: ${mongo.dedup.port} 12 | 13 | -------------------------------------------------------------------------------- /jscrapy-core/src/main/java/org/jscrapy/core/dedup/impl/H2Dedup.java: -------------------------------------------------------------------------------- 1 | package org.jscrapy.core.dedup.impl; 2 | 3 | import org.jscrapy.core.dedup.DeDup; 4 | import org.jscrapy.core.request.Request; 5 | 6 | /** 7 | * Created by cxu on 2018/2/5. 
8 | */ 9 | public class H2Dedup extends DeDup { 10 | @Override 11 | protected boolean isDup(Request request) { 12 | return false; 13 | } 14 | } 15 | -------------------------------------------------------------------------------- /jscrapy-core/src/main/java/org/jscrapy/core/dedup/impl/PgDedup.java: -------------------------------------------------------------------------------- 1 | package org.jscrapy.core.dedup.impl; 2 | 3 | import org.jscrapy.core.dedup.DeDup; 4 | import org.jscrapy.core.request.Request; 5 | 6 | /** 7 | * Created by cxu on 2018/2/5. 8 | */ 9 | public class PgDedup extends DeDup { 10 | @Override 11 | protected boolean isDup(Request request) { 12 | return false; 13 | } 14 | } 15 | -------------------------------------------------------------------------------- /jscrapy-core/src/main/java/org/jscrapy/core/memqueue/MemQueue.java: -------------------------------------------------------------------------------- 1 | package org.jscrapy.core.memqueue; 2 | 3 | import org.jscrapy.core.request.HttpRequest; 4 | 5 | import java.util.List; 6 | 7 | /** 8 | * Created by cxu on 2018/2/9. 9 | */ 10 | public abstract class MemQueue { 11 | 12 | public abstract int push(List requests); 13 | 14 | public abstract HttpRequest poll(); 15 | } 16 | -------------------------------------------------------------------------------- /jscrapy-core/src/main/java/org/jscrapy/core/plugin/Plugin.java: -------------------------------------------------------------------------------- 1 | package org.jscrapy.core.plugin; 2 | 3 | /** 4 | * Created by cxu on 2018/2/12. 
5 | */ 6 | public abstract class Plugin { 7 | 8 | /** 9 | * pluginChain节点接口。 10 | * 每个plugin需要继承这个类,完成一个任务 11 | * @param context 12 | * @return false: 如果中断本次链条。true:如果要继续下一个 13 | */ 14 | public abstract boolean doAction(T context); 15 | } 16 | -------------------------------------------------------------------------------- /jscrapy-core/src/main/java/org/jscrapy/core/pipline/impl/H2Pipline.java: -------------------------------------------------------------------------------- 1 | package org.jscrapy.core.pipline.impl; 2 | 3 | import org.jscrapy.core.data.DataItem; 4 | import org.jscrapy.core.pipline.Pipline; 5 | 6 | import java.util.List; 7 | 8 | /** 9 | * Created by cxu on 2018/2/5. 10 | */ 11 | public class H2Pipline extends Pipline { 12 | @Override 13 | public void save(List dataItems) { 14 | 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /jscrapy-core/src/main/java/org/jscrapy/core/pipline/impl/PgPipline.java: -------------------------------------------------------------------------------- 1 | package org.jscrapy.core.pipline.impl; 2 | 3 | import org.jscrapy.core.data.DataItem; 4 | import org.jscrapy.core.pipline.Pipline; 5 | 6 | import java.util.List; 7 | 8 | /** 9 | * Created by cxu on 2018/2/5. 10 | */ 11 | public class PgPipline extends Pipline { 12 | @Override 13 | public void save(List dataItems) { 14 | 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /jscrapy-core/src/main/java/org/jscrapy/core/downloader/Downloader.java: -------------------------------------------------------------------------------- 1 | package org.jscrapy.core.downloader; 2 | 3 | import org.jscrapy.core.JscrapyComponent; 4 | import org.jscrapy.core.page.Page; 5 | import org.jscrapy.core.request.HttpRequest; 6 | 7 | /** 8 | * Created by cxu on 2014/11/21. 
9 | */ 10 | public abstract class Downloader extends JscrapyComponent { 11 | 12 | public abstract Page download(HttpRequest request); 13 | } 14 | -------------------------------------------------------------------------------- /jscrapy-core/src/main/resources/spring/h2.xml: -------------------------------------------------------------------------------- 1 | 2 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /jscrapy-core/src/test/java/util/ResourcePathUtils.java: -------------------------------------------------------------------------------- 1 | package util; 2 | 3 | import java.io.File; 4 | 5 | /** 6 | * Created by cxu on 2015/10/3. 7 | */ 8 | public class ResourcePathUtils { 9 | public static String getResourceFileAbsPath(Class clazz, String fileName) { 10 | File cfgFile = new File(clazz.getResource(fileName).getFile()); 11 | String path = cfgFile.getAbsolutePath(); 12 | return path; 13 | } 14 | } 15 | -------------------------------------------------------------------------------- /jscrapy-ext/src/test/java/util/ResourcePathUtils.java: -------------------------------------------------------------------------------- 1 | package util; 2 | 3 | import java.io.File; 4 | 5 | /** 6 | * Created by cxu on 2015/10/3. 
7 | */ 8 | public class ResourcePathUtils { 9 | public static String getResourceFileAbsPath(Class clazz, String fileName) { 10 | File cfgFile = new File(clazz.getResource(fileName).getFile()); 11 | String path = cfgFile.getAbsolutePath(); 12 | return path; 13 | } 14 | } 15 | -------------------------------------------------------------------------------- /docs/缓存表.md: -------------------------------------------------------------------------------- 1 | ## H2缓存表 2 | 3 | ```sql 4 | CREATE TABLE IF NOT EXISTS ${table_name} 5 | ( 6 | id BIGSERIAL PRIMARY KEY, -- 唯一键 7 | page_id VARCHAR(64) , -- request.fp() 8 | gmt_created TIMESTAMP, -- 插入时间 9 | gmt_access TIMESTAMP, -- 被访问时间 10 | etag VARCHAR(64), -- etag 11 | page_content TEXT, -- html 12 | ); 13 | ``` 14 | 15 | 16 | - `${table_name} ` 使用taskName得到的值 17 | - `page_id` request.fp() 18 | 19 | 20 | 21 | 22 | 23 | ## Nosql 缓存 24 | 25 | 结构字段同h2 -------------------------------------------------------------------------------- /jscrapy-common/src/main/java/org/jscrapy/common/datetime/DatetimeUtil.java: -------------------------------------------------------------------------------- 1 | package org.jscrapy.common.datetime; 2 | 3 | import java.text.SimpleDateFormat; 4 | import java.util.Date; 5 | 6 | /** 7 | * Created by cxu on 2015/11/14. 8 | */ 9 | public class DatetimeUtil { 10 | 11 | public static String getTime(String format) { 12 | SimpleDateFormat sdf = new SimpleDateFormat(format); 13 | return sdf.format(new Date()); 14 | } 15 | } 16 | -------------------------------------------------------------------------------- /jscrapy-core/src/main/java/org/jscrapy/core/parser/Html.java: -------------------------------------------------------------------------------- 1 | package org.jscrapy.core.parser; 2 | 3 | import org.jscrapy.core.page.Page; 4 | import us.codecraft.webmagic.utils.UrlUtils; 5 | 6 | /** 7 | * Created by cxu on 2015/6/28. 
8 | */ 9 | public class Html extends us.codecraft.webmagic.selector.Html{ 10 | public Html(Page page) { 11 | super(UrlUtils.fixAllRelativeHrefs(page.getRawText(), page.getRequest().getUrl())); 12 | } 13 | } 14 | -------------------------------------------------------------------------------- /jscrapy-core/src/main/java/org/jscrapy/core/plugin/PluginOrder.java: -------------------------------------------------------------------------------- 1 | package org.jscrapy.core.plugin; 2 | 3 | import java.lang.annotation.*; 4 | 5 | /** 6 | * 在一条链条上的插件的执行顺序,序号越小越先执行 7 | * Created by cxu on 2018/2/11. 8 | */ 9 | @Target(ElementType.TYPE) 10 | @Retention(RetentionPolicy.RUNTIME) 11 | @Documented 12 | public @interface PluginOrder { 13 | 14 | /** 15 | * 插件顺序,序号越小在链路上处于最先被调用 16 | * @return 17 | */ 18 | public int value() default 0; 19 | } 20 | -------------------------------------------------------------------------------- /jscrapy-core/src/main/java/org/jscrapy/core/pipline/plugin/after/BeforePlugin.java: -------------------------------------------------------------------------------- 1 | package org.jscrapy.core.pipline.plugin.after; 2 | 3 | import org.jscrapy.core.plugin.Plugin; 4 | import org.jscrapy.core.plugin.PluginOrder; 5 | import org.jscrapy.core.plugin.OrderValue; 6 | 7 | /** 8 | * Created by cxu on 2018/2/11. 9 | */ 10 | @PluginOrder(OrderValue.AF_PIP_DEF_FIELD_ADD) 11 | public class BeforePlugin extends Plugin{ 12 | @Override 13 | public boolean doAction(Object context) { 14 | return false; 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /jscrapy-core/src/main/java/org/jscrapy/core/cacher/impl/PgCacher.java: -------------------------------------------------------------------------------- 1 | package org.jscrapy.core.cacher.impl; 2 | 3 | import org.jscrapy.core.cacher.Cacher; 4 | import org.jscrapy.core.page.Page; 5 | import org.jscrapy.core.request.HttpRequest; 6 | 7 | /** 8 | * Created by cxu on 2018/2/5. 
9 | */ 10 | public class PgCacher extends Cacher { 11 | @Override 12 | public Page loadPage(HttpRequest request) { 13 | return null; 14 | } 15 | 16 | @Override 17 | public void cachePage(Page page) { 18 | 19 | } 20 | } 21 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.class 2 | .idea 3 | .settings 4 | .project 5 | .classpath 6 | target 7 | 8 | # Mobile Tools for Java (J2ME) 9 | .mtj.tmp/ 10 | 11 | # Package Files # 12 | *.jar 13 | *.war 14 | *.ear 15 | 16 | # virtual machine crash logs, see http://www.java.com/en/download/help/error_hotspot.xml 17 | hs_err_pid* 18 | 19 | #intellig IDEA 20 | *.iml 21 | out 22 | #grails 23 | plugins 24 | 25 | *.log 26 | .gradle 27 | build 28 | gradle 29 | gradlew 30 | gradlew.bat 31 | *.bat 32 | null 33 | site/ 34 | *.log 35 | _book -------------------------------------------------------------------------------- /jscrapy-core/src/main/java/org/jscrapy/core/pipline/plugin/before/DefaultFieldsAddPlugin.java: -------------------------------------------------------------------------------- 1 | package org.jscrapy.core.pipline.plugin.before; 2 | 3 | import org.jscrapy.core.plugin.OrderValue; 4 | import org.jscrapy.core.plugin.Plugin; 5 | import org.jscrapy.core.plugin.PluginOrder; 6 | 7 | /** 8 | * Created by cxu on 2018/2/11. 
9 | */ 10 | @PluginOrder(OrderValue.AF_PIP_DEF_FIELD_ADD) 11 | public class DefaultFieldsAddPlugin extends Plugin{ 12 | @Override 13 | public boolean doAction(Object context) { 14 | return false; 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /jscrapy-core/src/main/java/org/jscrapy/core/memqueue/impl/MemFIFOQueue.java: -------------------------------------------------------------------------------- 1 | package org.jscrapy.core.memqueue.impl; 2 | 3 | import org.jscrapy.core.memqueue.MemQueue; 4 | import org.jscrapy.core.request.HttpRequest; 5 | 6 | import java.util.List; 7 | 8 | /** 9 | * Created by cxu on 2018/2/9. 10 | */ 11 | public class MemFIFOQueue extends MemQueue { 12 | @Override 13 | public int push(List requests) { 14 | return 0; 15 | } 16 | 17 | @Override 18 | public HttpRequest poll() { 19 | return null; 20 | } 21 | } 22 | -------------------------------------------------------------------------------- /jscrapy-core/src/main/java/org/jscrapy/core/memqueue/impl/MemFILOQueue.java: -------------------------------------------------------------------------------- 1 | package org.jscrapy.core.memqueue.impl; 2 | 3 | import org.jscrapy.core.memqueue.MemQueue; 4 | import org.jscrapy.core.request.HttpRequest; 5 | 6 | import java.util.List; 7 | 8 | /** 9 | * Created by cxu on 2018/2/9. 
10 | */ 11 | public class MemFILOQueue extends MemQueue { 12 | @Override 13 | public int push(List requests) { 14 | return 0; 15 | } 16 | 17 | @Override 18 | public HttpRequest poll() { 19 | return null; 20 | } 21 | } 22 | -------------------------------------------------------------------------------- /jscrapy-core/src/main/java/org/jscrapy/core/downloader/impl/OkHttpDownloaderImpl.java: -------------------------------------------------------------------------------- 1 | package org.jscrapy.core.downloader.impl; 2 | 3 | import org.jscrapy.core.downloader.DownloadResponse; 4 | import org.jscrapy.core.proxy.SpiderProxy; 5 | import org.jscrapy.core.request.HttpRequest; 6 | 7 | /** 8 | * Created by cxu on 2018/2/7. 9 | */ 10 | public class OkHttpDownloaderImpl { 11 | 12 | public OkHttpDownloaderImpl(SpiderProxy proxy){ 13 | 14 | } 15 | 16 | public DownloadResponse doDownload(HttpRequest request){ 17 | return null;//TODO 18 | } 19 | } 20 | -------------------------------------------------------------------------------- /jscrapy-core/src/main/java/org/jscrapy/core/JscrapyComponent.java: -------------------------------------------------------------------------------- 1 | package org.jscrapy.core; 2 | 3 | import org.jscrapy.core.config.JscrapyConfig; 4 | 5 | /** 6 | * Created by cxu on 2018/2/5. 
7 | */ 8 | public class JscrapyComponent implements ConfigAble { 9 | 10 | private JscrapyConfig JscrapyConfig; 11 | 12 | @Override 13 | public JscrapyConfig getJscrapyConfig() { 14 | return JscrapyConfig; 15 | } 16 | 17 | @Override 18 | public void setJscrapyConfig(JscrapyConfig jscrapyConfig) { 19 | JscrapyConfig = jscrapyConfig; 20 | } 21 | } 22 | -------------------------------------------------------------------------------- /docs/爬虫主要模块.md: -------------------------------------------------------------------------------- 1 | ## 爬虫组件 2 | 3 | | | 作用 | 子类 | 4 | | ----------- | --------------------- | ---- | 5 | | Spider | 控制爬虫的各个组件协调,是逻辑控制的核心。 | | 6 | | Downloader | 用于下载网页、图片、文件等。 | | 7 | | Processor | 调用脚本,完成解析。产出URL和数据内容。 | | 8 | | Pipeline | 存储数据到一个或者多个地方。 | | 9 | | UrlConsumer | 把URL从队列中取出。 | | 10 | | UrlProducer | 把新产生的URL放入到队列里。 | | 11 | | Cacher | 网页缓存 | | 12 | | Deduper | URL去重 | | 13 | | UrlAssmler | 从参数组装URL | | 14 | 15 | -------------------------------------------------------------------------------- /docs/集群部署组件.md: -------------------------------------------------------------------------------- 1 | # 集群部署模式详细 2 | 3 | Jscrapy在集群状态下需要组件如下: 4 | 5 | | *组件* | *作用* | 6 | | --------------- | --------------- | 7 | | postgreSql | 元数据管理 | 8 | | postgreSql | URL队列 | 9 | | ~~redis~~/kafka | admin与service交互 | 10 | | mongoDb | URL去重服务 | 11 | | mongoDb | 数据存储 | 12 | | mongoDb | 网页缓存 | 13 | | | | 14 | 15 | 因此集群模式下你需要部署的三方组件是: 16 | 17 | - postgreSql 18 | - ~~redis~~或者kafka 19 | - mongoDb 20 | 21 | 22 | 23 | > 不用redis做消息的原因是考虑到以后用kafka做url队列,用同一套组件完成。 24 | 25 | -------------------------------------------------------------------------------- /jscrapy-service/src/main/java/org/jscrapy/service/Application.java: -------------------------------------------------------------------------------- 1 | package org.jscrapy.service; 2 | 3 | import org.springframework.boot.SpringApplication; 4 | import org.springframework.boot.autoconfigure.SpringBootApplication; 5 | 6 | import 
java.io.IOException; 7 | 8 | /** 9 | * Created by cxu on 2015/10/2. 10 | */ 11 | 12 | @SpringBootApplication(scanBasePackages = {"org.jscrapy.core.bootcfg"}) 13 | public class Application { 14 | public static void main(String[]args) throws IOException, InterruptedException { 15 | SpringApplication.run(Application.class, args);//启动 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /jscrapy-core/src/main/java/org/jscrapy/core/cacher/Cacher.java: -------------------------------------------------------------------------------- 1 | package org.jscrapy.core.cacher; 2 | 3 | import org.jscrapy.core.JscrapyComponent; 4 | import org.jscrapy.core.page.Page; 5 | import org.jscrapy.core.request.HttpRequest; 6 | 7 | /** 8 | * 从缓存中读取网页 9 | * Created by cxu on 2015/7/12. 10 | */ 11 | public abstract class Cacher extends JscrapyComponent { 12 | public Cacher() { 13 | 14 | } 15 | 16 | /** 17 | * @param request 18 | * @return 命中则返回,否则null 19 | */ 20 | public abstract Page loadPage(HttpRequest request); 21 | 22 | public abstract void cachePage(Page page); 23 | 24 | } 25 | -------------------------------------------------------------------------------- /jscrapy-core/src/main/java/org/jscrapy/core/processor/impl/GroovyProcessor.java: -------------------------------------------------------------------------------- 1 | package org.jscrapy.core.processor.impl; 2 | 3 | import org.jscrapy.core.config.JscrapyConfig; 4 | import org.jscrapy.core.processor.Processor; 5 | 6 | /** 7 | * Created by cxu on 2014/11/21. 
8 | */ 9 | public class GroovyProcessor extends Processor { 10 | 11 | public GroovyProcessor(JscrapyConfig jscrapyConfig) { 12 | setJscrapyConfig(jscrapyConfig); 13 | } 14 | 15 | public void setJscrapyConfig(JscrapyConfig jscrapyConfig) { 16 | super.setJscrapyConfig(jscrapyConfig);//fix: was a self-call -> infinite recursion (StackOverflowError); delegate to inherited setter 17 | //TODO init this object 18 | } 19 | 20 | 21 | } 22 | -------------------------------------------------------------------------------- /jscrapy-core/src/main/java/org/jscrapy/core/dal/QueueLockMapper.java: -------------------------------------------------------------------------------- 1 | package org.jscrapy.core.dal; 2 | 3 | import org.apache.ibatis.annotations.Mapper; 4 | 5 | import java.util.List; 6 | 7 | /** 8 | * Created by cxu on 2018/2/12. 9 | */ 10 | @Mapper 11 | public interface QueueLockMapper { 12 | 13 | QueueLockDo selectForUpdate(String taskId); 14 | 15 | int deleteByTaskid(String taskId); 16 | 17 | int deleteByPrimaryKey(Long id); 18 | 19 | int insert(QueueLockDo record); 20 | 21 | QueueLockDo selectByPrimaryKey(Long id); 22 | 23 | List selectAll(); 24 | 25 | int updateByPrimaryKey(QueueLockDo record); 26 | } 27 | -------------------------------------------------------------------------------- /jscrapy-core/src/main/java/org/jscrapy/core/config/modulecfg/TaskComponentConfig.java: -------------------------------------------------------------------------------- 1 | package org.jscrapy.core.config.modulecfg; 2 | 3 | /** 4 | * 任务的主要组件配置 5 | * Created by cxu on 2016/7/26. 
6 | */ 7 | public abstract class TaskComponentConfig { 8 | /* 9 | private String spiderClass; //任务,控制流程 10 | private String downloaderClass;//下载 11 | private String processorClass;//用脚本、规则处理网页 12 | private String[] piplineClass;//存储 13 | private String urlConsumerClass;//从队列里取URL 14 | private String urlProducerClass;//将URL放入队列 15 | private String cacherClass;//缓存网页 16 | private String deduperClass;//URL去重 17 | */ 18 | 19 | 20 | } 21 | -------------------------------------------------------------------------------- /jscrapy-ext/src/main/java/org/jscrapy/ext/modulecfg/RedisConfig.java: -------------------------------------------------------------------------------- 1 | package org.jscrapy.ext.modulecfg; 2 | 3 | import org.jscrapy.core.config.modulecfg.TaskComponentConfig; 4 | 5 | /** 6 | * Created by cxu on 2017/1/18. 7 | */ 8 | public class RedisConfig extends TaskComponentConfig { 9 | private String host; 10 | private int port = 27017; 11 | 12 | public String getHost() { 13 | return host; 14 | } 15 | 16 | public void setHost(String host) { 17 | this.host = host; 18 | } 19 | 20 | public int getPort() { 21 | return port; 22 | } 23 | 24 | public void setPort(int port) { 25 | this.port = port; 26 | } 27 | } 28 | -------------------------------------------------------------------------------- /jscrapy-admin/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 4.0.0 3 | 4 | org.jscrapy 5 | jscrapy 6 | 1.0-SNAPSHOT 7 | 8 | org.jscrapy 9 | jscrapy-admin 10 | 1.0-SNAPSHOT 11 | jar 12 | jscrapy-admin 13 | 14 | 15 | -------------------------------------------------------------------------------- /docs/index.md: -------------------------------------------------------------------------------- 1 | jscrapy is an java implemention of scrapy, but more than scrapy. Enjoy it! 
2 | 3 | ## Special features 4 | 5 | - Near realtime scrap 6 | - OCR 7 | - Session keep 8 | 9 | ## All features 10 | 11 | * `mkdocs new [dir-name]` - Create a new project. 12 | * `mkdocs serve` - Start the live-reloading docs server. 13 | * `mkdocs build` - Build the documentation site. 14 | * `mkdocs help` - Print this help message. 15 | 16 | ## Project layout 17 | 18 | ```text 19 | jscrapy/ 20 | |----jscrapy-admin/ #管控 21 | |----jscrapy-service/ #核心服务 22 | |----jscrapy-contrib/ #核心扩展 23 | |----jscrapy-core/ 24 | |----jscrapy-common/ 25 | |----jscrapy-docs/ # 文档目录 26 | ``` 27 | 28 | -------------------------------------------------------------------------------- /jscrapy-core/src/main/java/org/jscrapy/core/producer/UrlProducer.java: -------------------------------------------------------------------------------- 1 | package org.jscrapy.core.producer; 2 | 3 | import org.jscrapy.core.JscrapyComponent; 4 | import org.jscrapy.core.request.RequestContext; 5 | 6 | import java.util.List; 7 | 8 | /** 9 | * Created by cxu on 2016/7/27. 10 | */ 11 | public abstract class UrlProducer extends JscrapyComponent { 12 | /** 13 | * 将Request插入到请求队列中 14 | * 15 | * @param requests 16 | * @return 插入队列的实际Request数目 17 | */ 18 | public abstract int push(List requests); 19 | 20 | /** 21 | * 更新某些字段 22 | * @param requests 23 | */ 24 | public abstract void update(List requests); 25 | } 26 | -------------------------------------------------------------------------------- /jscrapy-common/src/main/java/org/jscrapy/common/http/HttpHeaderConstant.java: -------------------------------------------------------------------------------- 1 | package org.jscrapy.common.http; 2 | 3 | /** 4 | * Created by cxu on 2015/9/30. 
5 | */ 6 | public class HttpHeaderConstant { 7 | public static final String USER_AGENT = "User-Agent"; 8 | public static final String COOKIE = "Cookie"; 9 | public static final String SET_COOKIE = "Set-Cookie"; 10 | public static final String AJAX = "X-Requested-With"; 11 | public static final String REFERER = "Referer"; 12 | public static final String ETAG = "Etag"; 13 | public static final String XMLHTTP_REQUEST = "XMLHttpRequest"; 14 | public static final String HTTP = "http"; 15 | public static final String HTTPS = "https"; 16 | 17 | } 18 | -------------------------------------------------------------------------------- /jscrapy-core/src/main/java/org/jscrapy/core/pipline/Pipline.java: -------------------------------------------------------------------------------- 1 | package org.jscrapy.core.pipline; 2 | 3 | import org.jscrapy.core.JscrapyComponent; 4 | import org.jscrapy.core.config.JscrapyConfig; 5 | import org.jscrapy.core.data.DataItem; 6 | 7 | import java.util.List; 8 | 9 | /** 10 | * Created by cxu on 2014/11/21. 
11 | */ 12 | public abstract class Pipline extends JscrapyComponent { 13 | 14 | public Pipline(JscrapyConfig JscrapyConfig) { 15 | setJscrapyConfig(JscrapyConfig); 16 | } 17 | 18 | public Pipline() { 19 | 20 | } 21 | 22 | /** 23 | * 保存解析之后的数据 24 | * 25 | * @param dataItems 要保存的数据 26 | */ 27 | public abstract void save(List dataItems); 28 | 29 | } 30 | -------------------------------------------------------------------------------- /jscrapy-core/src/main/resources/applicationContext.properties: -------------------------------------------------------------------------------- 1 | jdbc.h2.driverClassName=org.h2.Driver 2 | 3 | jdbc.h2.pageCacheUrl=jdbc:h2:tcp://localhost/~/.jscrapy/h2_page_cache/page_cache;mode=MySQL;DB_CLOSE_DELAY=-1;DB_CLOSE_ON_EXIT=FALSE;AUTO_SERVER=TRUE 4 | jdbc.h2.pageCacheUrl.username=sa 5 | jdbc.h2.pageCacheUrl.password= 6 | 7 | jdbc.h2.queueUrl=jdbc:h2:tcp://localhost/~/.jscrapy/h2_queue/url_queue;mode=MySQL;DB_CLOSE_DELAY=-1;DB_CLOSE_ON_EXIT=FALSE;AUTO_SERVER=TRUE 8 | jdbc.h2.queue.username=sa 9 | jdbc.h2.queue.password= 10 | 11 | jdbc.h2.queueLockUrl=jdbc:h2:tcp://localhost/~/.jscrapy/h2_queue_lock/url_queue;mode=MySQL;DB_CLOSE_DELAY=-1;DB_CLOSE_ON_EXIT=FALSE;AUTO_SERVER=TRUE 12 | jdbc.h2.queueLock.username=sa 13 | jdbc.h2.queueLock.password= -------------------------------------------------------------------------------- /jscrapy-common/src/test/java/log/TaskLogUtilTest.java: -------------------------------------------------------------------------------- 1 | package log; 2 | 3 | import org.jscrapy.common.log.MyLoggerFactory; 4 | import org.jscrapy.common.log.TaskLogUtil; 5 | import org.slf4j.Logger; 6 | import org.testng.annotations.Test; 7 | 8 | /** 9 | * Created by cxu on 2015/11/7. 
10 | */ 11 | public class TaskLogUtilTest { 12 | @Test 13 | public void test() { 14 | Logger lg = MyLoggerFactory.getLogger(TaskLogUtilTest.class); 15 | TaskLogUtil.log(lg, "error", "hello {}, my name {}", "world", "cxu"); 16 | TaskLogUtil.log(lg, "info", "hello {}, my name {}", "world", "cxu"); 17 | TaskLogUtil.log(lg, "debug", "hello {}, my name {}", "world", "cxu"); 18 | TaskLogUtil.log(lg, "warn", "hello {}, my name {}", "world", "cxu"); 19 | } 20 | } 21 | -------------------------------------------------------------------------------- /jscrapy-core/src/main/java/org/jscrapy/core/dal/h2queue/H2UrlQueueDo.java: -------------------------------------------------------------------------------- 1 | package org.jscrapy.core.dal.h2queue; 2 | 3 | import org.jscrapy.core.dal.UrlQueueDo; 4 | import org.jscrapy.core.request.UrlStatus; 5 | import org.jscrapy.core.request.UrlType; 6 | 7 | import java.util.Date; 8 | 9 | /** 10 | * Created by cxu on 2016/8/1. 11 | */ 12 | 13 | public class H2UrlQueueDo extends UrlQueueDo { 14 | public H2UrlQueueDo(Long id, String schedId, String url, UrlStatus urlStatus, Integer retryTimes, UrlType urlType, String siteId, Date gmtCreated, Date gmtAccess, String errorCode, String errorMsg) { 15 | super(id, schedId, url, urlStatus, retryTimes, urlType, siteId, gmtCreated, gmtAccess, errorCode, errorMsg); 16 | } 17 | 18 | public H2UrlQueueDo() { 19 | 20 | } 21 | } 22 | -------------------------------------------------------------------------------- /jscrapy-core/src/main/java/org/jscrapy/core/dal/pgqueue/PgUrlQueueDo.java: -------------------------------------------------------------------------------- 1 | package org.jscrapy.core.dal.pgqueue; 2 | 3 | import org.jscrapy.core.dal.UrlQueueDo; 4 | import org.jscrapy.core.request.UrlStatus; 5 | import org.jscrapy.core.request.UrlType; 6 | 7 | import java.util.Date; 8 | 9 | /** 10 | * Created by cxu on 2016/8/10. 
11 | */ 12 | public class PgUrlQueueDo extends UrlQueueDo { 13 | 14 | public PgUrlQueueDo(Long id, String schedId, String url, UrlStatus urlStatus, Integer retryTimes, UrlType urlType, String siteId, Date gmtCreated, Date gmtAccess, String errorCode, String errorMsg) { 15 | super(id, schedId, url, urlStatus, retryTimes, urlType, siteId, gmtCreated, gmtAccess, errorCode, errorMsg); 16 | } 17 | 18 | public PgUrlQueueDo() { 19 | 20 | } 21 | } 22 | -------------------------------------------------------------------------------- /jscrapy-core/src/test/java/dal/PgQueueTest.java: -------------------------------------------------------------------------------- 1 | package dal; 2 | 3 | import org.jscrapy.core.dal.UrlQueueMapper; 4 | import org.jscrapy.core.dal.pgqueue.PgUrlQueueMapper; 5 | import org.junit.runner.RunWith; 6 | import org.springframework.beans.factory.annotation.Autowired; 7 | import org.springframework.test.context.TestPropertySource; 8 | import org.springframework.test.context.junit4.SpringJUnit4ClassRunner; 9 | 10 | /** 11 | * Created by cxu on 2016/8/10. 
12 | */ 13 | @RunWith(SpringJUnit4ClassRunner.class) 14 | @TestPropertySource("classpath:db.properties") 15 | public class PgQueueTest extends QueueTest { 16 | @Autowired 17 | PgUrlQueueMapper pgUrlQueueMapper; 18 | 19 | @Override 20 | protected UrlQueueMapper getQueueMapper() { 21 | return pgUrlQueueMapper; 22 | } 23 | } 24 | -------------------------------------------------------------------------------- /jscrapy-core/src/test/java/dal/H2QueueTest.java: -------------------------------------------------------------------------------- 1 | package dal; 2 | 3 | import org.jscrapy.core.dal.UrlQueueMapper; 4 | import org.jscrapy.core.dal.h2queue.H2UrlQueueMapper; 5 | import org.junit.runner.RunWith; 6 | import org.springframework.beans.factory.annotation.Autowired; 7 | import org.springframework.test.context.TestPropertySource; 8 | import org.springframework.test.context.junit4.SpringJUnit4ClassRunner; 9 | 10 | /** 11 | * Created by cxu on 2016/8/5. 12 | */ 13 | @RunWith(SpringJUnit4ClassRunner.class) 14 | @TestPropertySource("classpath:db.properties") 15 | public class H2QueueTest extends QueueTest { 16 | 17 | @Autowired 18 | H2UrlQueueMapper h2UrlQueueMapper; 19 | 20 | @Override 21 | protected UrlQueueMapper getQueueMapper() { 22 | return h2UrlQueueMapper; 23 | } 24 | } 25 | -------------------------------------------------------------------------------- /jscrapy-core/src/main/java/org/jscrapy/core/exp/BaseExp.java: -------------------------------------------------------------------------------- 1 | package org.jscrapy.core.exp; 2 | 3 | /** 4 | * Created by cxu on 2018/2/12. 
5 | */ 6 | public class BaseExp extends RuntimeException { 7 | private String errCode; 8 | private String errMsg; 9 | 10 | public BaseExp() { 11 | 12 | } 13 | 14 | public BaseExp(String errCode, String errMsg) { 15 | this.errCode = errCode; 16 | this.errMsg = errMsg; 17 | } 18 | 19 | public String getErrCode() { 20 | return errCode; 21 | } 22 | 23 | public void setErrCode(String errCode) { 24 | this.errCode = errCode; 25 | } 26 | 27 | public String getErrMsg() { 28 | return errMsg; 29 | } 30 | 31 | public void setErrMsg(String errMsg) { 32 | this.errMsg = errMsg; 33 | } 34 | } 35 | -------------------------------------------------------------------------------- /docs/springboot-mybatis_use.md: -------------------------------------------------------------------------------- 1 | # mybatis-spring-boot 使用记录 2 | 3 | [mybatis-spring-boot](https://github.com/mybatis/spring-boot-starter)大多数介绍了 4 | 在单一的pom工程中使用例子。本节着重记录在多pom工程中使用中猜到的坑: 5 | 6 | - dal层在其他模块调用 7 | - 多数据源配置,H2, postgreSql 8 | 9 | 详细步骤记录如下: 10 | 11 | 1. 定义 XXMapper和XXDo java类。 12 | 2. 在classpath里定义 xxmapper.xml 13 | 3. 写一个mybatis-config.xml写上2中定义的xxmapper.xml的位置 14 | 4. 应用层:在application.properties中定义 `mybatis.config-locations=mybatis-config.xml`; datasource 15 | 的相关配置项。 16 | 5. 应用层:写一个spring-boot的 java config类, 在类上使用`@MapperScan('package.to.mapper')` 17 | 18 | 上述的第5步骤实在有点让我迷惑,找了半天才发现这样才能使用。原因不明,既然都可以自动配置了,为什么 19 | spring-boot不能多做一点帮助自动扫描到mapper? 20 | 21 | 其次我的应用是动态建表的,如果在mapper xml中的sql参数里使用 `#{}`的形式那么如果遇到有 22 | table做为参数的就直接报了sql语法错误,如果要动态操作表要改用 `${}`。需要注意的是对于`#{}`形式 23 | 的参数mybatis会自动进行sql注入检查,但是 `${}`形式的参数只会原样进行字符串替换。 24 | 25 | -------------------------------------------------------------------------------- /jscrapy-core/src/main/java/org/jscrapy/core/config/modulecfg/MongoDedepConfig.java: -------------------------------------------------------------------------------- 1 | package org.jscrapy.core.config.modulecfg; 2 | 3 | /** 4 | * Created by cxu on 2017/1/16. 
5 | */ 6 | public class MongoDedepConfig extends TaskComponentConfig { 7 | private String host; 8 | private int port; 9 | private String dbName; 10 | 11 | public String getHost() { 12 | return host; 13 | } 14 | 15 | public void setHost(String host) { 16 | this.host = host; 17 | } 18 | 19 | public int getPort() { 20 | return port; 21 | } 22 | 23 | public void setPort(int port) { 24 | this.port = port; 25 | } 26 | 27 | public String getDbName() { 28 | return dbName; 29 | } 30 | 31 | public void setDbName(String dbName) { 32 | this.dbName = dbName; 33 | } 34 | } 35 | -------------------------------------------------------------------------------- /jscrapy-core/src/main/java/org/jscrapy/core/config/modulecfg/H2QueueConfig.java: -------------------------------------------------------------------------------- 1 | package org.jscrapy.core.config.modulecfg; 2 | 3 | import java.util.concurrent.locks.ReentrantLock; 4 | 5 | /** 6 | * Created by cxu on 2017/1/25. 7 | */ 8 | public class H2QueueConfig extends TaskComponentConfig { 9 | 10 | private String queueName; 11 | 12 | //队列的互斥锁, 这个锁会造成一个问题是,如果在一个机器上部署多个Node,运行基于h2队列的任务胡出现多个节点不能互斥 13 | private ReentrantLock h2QueueLock = new ReentrantLock(); // not a fair lock 14 | 15 | public H2QueueConfig() { 16 | 17 | } 18 | 19 | public ReentrantLock getH2QueueLock() { 20 | return h2QueueLock; 21 | } 22 | 23 | public String getQueueName() { 24 | return queueName; 25 | } 26 | 27 | public void setQueueName(String queueName) { 28 | this.queueName = queueName; 29 | } 30 | } 31 | -------------------------------------------------------------------------------- /jscrapy-core/src/main/java/org/jscrapy/core/data/DataItem.java: -------------------------------------------------------------------------------- 1 | package org.jscrapy.core.data; 2 | 3 | import org.apache.commons.lang3.StringUtils; 4 | 5 | import java.util.HashMap; 6 | import java.util.Map; 7 | 8 | /** 9 | * Created by cxu on 2015/6/21. 
10 | */ 11 | public class DataItem { 12 | private Map dataItem; 13 | 14 | public DataItem() { 15 | dataItem = new HashMap<>(); 16 | } 17 | 18 | public DataItem put(String key, String value) { 19 | if (StringUtils.isNotBlank(value)) { 20 | dataItem.put(key.trim(), value.trim()); 21 | } 22 | return this; 23 | } 24 | 25 | public Map getDataItem() { 26 | return dataItem; 27 | } 28 | 29 | /** 30 | * 是否为空 31 | * @return 32 | */ 33 | public boolean isEmpty(){ 34 | return dataItem==null || dataItem.size()==0; 35 | } 36 | } 37 | -------------------------------------------------------------------------------- /jscrapy-service/src/main/resources/application.properties: -------------------------------------------------------------------------------- 1 | 2 | ##################################### H2 3 | #spring.h2queue.datasource.url=jdbc:h2queue:~/h2queue/test;mode=MySQL;DB_CLOSE_DELAY=-1;DB_CLOSE_ON_EXIT=FALSE;AUTO_SERVER=TRUE 4 | spring.h2.datasource.url=jdbc:h2:tcp://localhost/~/.jscrapy/h2queue/jscrapy_queue;mode=MySQL;DB_CLOSE_DELAY=-1;DB_CLOSE_ON_EXIT=FALSE;AUTO_SERVER=TRUE 5 | spring.h2.datasource.username=sa 6 | spring.h2.datasource.password= 7 | spring.h2.datasource.mapperpath=classpath*:mapper/h2queue.xml 8 | 9 | ##################################### Pg 10 | spring.postgresql.datasource.url=jdbc:postgresql://localhost:5432/jscrapy_queue 11 | spring.postgresql.datasource.username=postgres 12 | spring.postgresql.datasource.password= 13 | spring.postgresql.datasource.mapperpath=classpath*:mapper/pgqueue.xml 14 | 15 | #logging.level.root=WARN 16 | #logging.level.sample.mybatis.mapper=TRACE 17 | #spring.datasource.schema=import.sql -------------------------------------------------------------------------------- /jscrapy-core/src/main/java/org/jscrapy/core/proxy/WatchableSpiderProxy.java: -------------------------------------------------------------------------------- 1 | package org.jscrapy.core.proxy; 2 | 3 | import java.util.concurrent.atomic.AtomicLong; 4 | 5 | /** 6 | 
* 给SpiderProxy做监控 7 | * Created by cxu on 2015/9/29. 8 | */ 9 | public class WatchableSpiderProxy extends SpiderProxy { 10 | private static final long FAILED_THREAD_HOLE = 10;//允许的最大失败次数 11 | AtomicLong failedCount = new AtomicLong(0);//连续失败次数,如果中间有正常就恢复为0 12 | 13 | public WatchableSpiderProxy(ProxyType proxyType, String user, String password, String host, int port) { 14 | super(proxyType, user, password, host, port); 15 | } 16 | 17 | /** 18 | * 代理是否可用 19 | * @return 20 | */ 21 | public boolean isUsable() { 22 | return failedCount.longValue() <= FAILED_THREAD_HOLE; 23 | } 24 | 25 | public void incFailedCount(long failedCount) { 26 | this.failedCount.addAndGet(failedCount); 27 | } 28 | } 29 | -------------------------------------------------------------------------------- /jscrapy-core/src/main/java/org/jscrapy/core/request/Request.java: -------------------------------------------------------------------------------- 1 | package org.jscrapy.core.request; 2 | 3 | import java.io.Serializable; 4 | import java.util.Map; 5 | 6 | /** 7 | * 代表了一个Queue里的url请求 8 | * Created by cxu on 2014/11/21. 9 | */ 10 | public abstract class Request implements Serializable{ 11 | 12 | /** 13 | * 请求地址 14 | * @return 15 | */ 16 | public abstract String getUrl(); 17 | 18 | public abstract void setUrl(String url); 19 | 20 | /** 21 | * 获取url的请求方法:GET|POST|DELETE|TRACE|HEAD.. 
22 | * @return 23 | */ 24 | public abstract HttpRequestMethod getHttpMethod(); 25 | 26 | /** 27 | * 如果是POST请求,获取请求的参数 28 | * @return 29 | */ 30 | public abstract Map getParameters(); 31 | 32 | public abstract String asJson(); 33 | 34 | /** 35 | * 对象的一个md5标识,用于去重 36 | * @return 37 | */ 38 | public abstract String uniqId(); 39 | 40 | } 41 | -------------------------------------------------------------------------------- /docs/用户部署.md: -------------------------------------------------------------------------------- 1 | # 用户手册 2 | 3 | ## 用例 4 | ![用例](img/use_case.png) 5 | 6 | ## 程序包介绍 7 | ![java包介绍](img/software_package.png) 8 | 9 | jscrapy一共产生了4个[可执行war包]()。 10 | 11 | - jscrapy-admin.war 管控平台。 12 | - jscrapy-console.war 命令行模式客户端。 13 | - jscrapy-service.war 爬虫核心服务。 14 | - jscrapy-exec.war 特为单机状态打包的一键安装包,实际是把管控和核心服务打包在一起方便部署。 15 | 16 | ## 部署 17 | 爬虫部署可以简单分为单机部署和集群部署两种方式。如果你的应用,对可用性要求不是 18 | 那么严格,并且机器资源有限可以采用单机部署的方式。如果你要在生产环境使用,那么建议 19 | 部署为集群模式,能够发挥多台机器并行处理的能力。 20 | 21 | ### 单机模式 22 | 1. 单机模式下,最简单部署方法就是将 `jscrapy-exec.war`上传到服务器,然后执行 23 | ```shell 24 | $java -jar jscrapy-exec.war 25 | ``` 26 | 这样你就能通过这台机器的IP登录管理平台,然后进行操作。 27 | 28 | 2. 另外一种方式是只上传`jscrapy-service.war`到服务器,然后执行 29 | ```shell 30 | $java -jar jscrapy-service.war 31 | ``` 32 | 最后配合 `jscrapy-console.war`或者本机启动 `jscrapy-admin.war`对系统进行使用。 33 | 34 | ### 集群模式 35 | 36 | 1. 把`jscrapy-service.war`部署到多台机器上启动: `$java -jar jscrapy-service.war` 37 | 2. 
在多机启动`jscrapy-admin.war`: 过程中需要配置postgreSql地址。 38 | -------------------------------------------------------------------------------- /jscrapy-core/src/main/java/org/jscrapy/core/dedup/DeDup.java: -------------------------------------------------------------------------------- 1 | package org.jscrapy.core.dedup; 2 | 3 | import org.jscrapy.core.JscrapyComponent; 4 | import org.jscrapy.core.request.HttpRequest; 5 | import org.jscrapy.core.request.Request; 6 | 7 | import java.util.ArrayList; 8 | import java.util.List; 9 | 10 | /** 11 | * URL去重 12 | * Created by cxu on 2015/6/22. 13 | */ 14 | public abstract class DeDup extends JscrapyComponent { 15 | 16 | /** 17 | * 测试是否是重复的 18 | * @param request 19 | * @return 20 | */ 21 | protected abstract boolean isDup(Request request); 22 | 23 | /** 24 | * 返回非重复的 25 | * @param request 26 | * @return 27 | */ 28 | public List deDup(List request){ 29 | List req = new ArrayList<>(request.size()); 30 | for(HttpRequest url : request){ 31 | if(!this.isDup(url)){ 32 | req.add(url); 33 | } 34 | } 35 | return req; 36 | } 37 | } 38 | -------------------------------------------------------------------------------- /jscrapy-core/src/test/resources/log4j.properties: -------------------------------------------------------------------------------- 1 | log4j.log.root.dir=${user.home}/.jscrapy/logs 2 | ########################################################### 3 | # log4j property 4 | ########################################################### 5 | log4j.rootLogger = INFO,STOREFILE,CONSOLE 6 | log4j.appender.STOREFILE=org.apache.log4j.DailyRollingFileAppender 7 | log4j.appender.STOREFILE.File = ${log4j.log.root.dir}/jscrapy-dev.log 8 | log4j.appender.STOREFILE.layout=org.apache.log4j.PatternLayout 9 | log4j.appender.STOREFILE.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss} %c %p:[%r ms] %m %n 10 | 11 | #////////////////////////////////////////////////////////// 12 | # 13 | # LOG TO CONSOLE 14 | # 15 | 
#////////////////////////////////////////////////////////// 16 | log4j.appender.CONSOLE=org.apache.log4j.ConsoleAppender 17 | log4j.appender.CONSOLE.target=System.out 18 | log4j.appender.CONSOLE.layout=org.apache.log4j.PatternLayout 19 | log4j.appender.CONSOLE.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{2}: %m%n 20 | -------------------------------------------------------------------------------- /jscrapy-service/src/main/resources/log4j.properties: -------------------------------------------------------------------------------- 1 | log4j.log.root.dir=${user.home}/.jscrapy/logs 2 | ########################################################### 3 | # log4j property 4 | ########################################################### 5 | log4j.rootLogger = INFO,STOREFILE,CONSOLE 6 | log4j.appender.STOREFILE=org.apache.log4j.DailyRollingFileAppender 7 | log4j.appender.STOREFILE.File = ${log4j.log.root.dir}/jscrapy.log 8 | log4j.appender.STOREFILE.layout=org.apache.log4j.PatternLayout 9 | log4j.appender.STOREFILE.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss} %c %p:[%r ms] %m %n 10 | 11 | #////////////////////////////////////////////////////////// 12 | # 13 | # LOG TO CONSOLE 14 | # 15 | #////////////////////////////////////////////////////////// 16 | log4j.appender.CONSOLE=org.apache.log4j.ConsoleAppender 17 | log4j.appender.CONSOLE.target=System.out 18 | log4j.appender.CONSOLE.layout=org.apache.log4j.PatternLayout 19 | log4j.appender.CONSOLE.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{2}: %m%n 20 | -------------------------------------------------------------------------------- /jscrapy-common/src/main/java/org/jscrapy/common/file/FileLines.java: -------------------------------------------------------------------------------- 1 | package org.jscrapy.common.file; 2 | 3 | import java.util.ArrayList; 4 | import java.util.List; 5 | 6 | /** 7 | * Created by cxu on 2015/11/14. 
/**
 * Holds a batch of text lines read from a file plus the byte offset just
 * past the last line read, so a caller can resume reading incrementally
 * (see FileLineReader).
 * Created by cxu on 2015/11/14.
 */
public class FileLines {

    // Was raw List / StringBuffer; typed and switched to the
    // unsynchronized StringBuilder (no concurrent access here).
    private List<String> lines = new ArrayList<>();
    private long offset; // file offset just past the last line added

    /**
     * @return the lines read so far; note: the internal list, not a copy
     */
    public List<String> getLines() {
        return lines;
    }

    /**
     * Appends one line (without its trailing newline) to this batch.
     *
     * @param line the line text
     */
    public void addLine(String line) {
        this.lines.add(line);
    }

    public long getOffset() {
        return offset;
    }

    public void setOffset(long offset) {
        this.offset = offset;
    }

    /** @return number of lines currently held */
    public int getLinesCount() {
        return lines.size();
    }

    @Override
    public String toString() {
        // Rejoin with '\n'; a trailing newline is always appended,
        // matching the original StringBuffer-based formatting.
        StringBuilder buf = new StringBuilder();
        for (String s : lines) {
            buf.append(s).append('\n');
        }
        return buf.toString();
    }
}
/**
 * Base class for consumers that pull URL requests out of the central
 * (shared) queue and buffer them in a local in-memory queue.
 * Created by cxu on 2016/7/27.
 */
public abstract class UrlConsumer extends JscrapyComponent {

    // Local buffer for requests taken out of the central queue.
    // NOTE(review): raw types throughout — the element type is presumably
    // RequestContext (it is imported but otherwise unused); confirm against
    // the implementations before adding generics.
    protected MemQueue memQueue;

    /**
     * Takes requests out of the central queue.
     *
     * @param n how many requests to take per call
     * @return never null; an empty list when the queue has nothing
     */
    public abstract List poll(int n);

    /**
     * Removes the given requests from the central queue.
     *
     * @param requests requests to remove
     * @return number of requests actually removed
     */
    public abstract int delete(List requests);

    public void setMemQueue(MemQueue memQueue){
        this.memQueue = memQueue;
    }

    public MemQueue getMemQueue() {
        return memQueue;
    }
}
/**
 * Outcome of executing a piece of JavaScript: the produced value, a
 * success flag (defaults to {@code Boolean.FALSE}) and an optional
 * message describing the failure.
 * Created by cxu on 2016/2/23.
 */
public class JsExecuteResult {

    private Object result;                       // value produced by the script, if any
    private Boolean isSuccess = Boolean.FALSE;   // success flag; FALSE until set
    private String message;                      // failure reason or other diagnostics

    /** @return the value produced by the script, possibly null */
    public Object getResult() {
        return result;
    }

    /** @param result the value produced by the script */
    public void setResult(Object result) {
        this.result = result;
    }

    /** @return whether execution succeeded */
    public Boolean getIsSuccess() {
        return isSuccess;
    }

    /** @param isSuccess whether execution succeeded */
    public void setIsSuccess(Boolean isSuccess) {
        this.isSuccess = isSuccess;
    }

    /** @return diagnostic message, e.g. the failure reason */
    public String getMessage() {
        return message;
    }

    /** @param message diagnostic message to record */
    public void setMessage(String message) {
        this.message = message;
    }
}
## Pg
```sql
CREATE TABLE IF NOT EXISTS XXX
(
    id          SERIAL PRIMARY KEY, -- 唯一键(PostgreSQL 自增列用 SERIAL;IDENTITY 是 H2 语法)
    url         TEXT,
    retry_times SMALLINT
);
```
11 | */ 12 | @Mapper 13 | @Configuration 14 | public interface PageCacheMapper { 15 | 16 | /** 17 | * @param tableName 18 | */ 19 | public void createCacherTable(@Param("table_name") String tableName); 20 | 21 | /** 22 | * 批量缓存网页入队列 23 | * 24 | * @return 25 | */ 26 | public int batchInsert(@Param("table_name") String cacherTable, 27 | @Param("pages") List pages); 28 | 29 | /** 30 | * 31 | * @param cacherTable 32 | * @param page 33 | * @return 34 | */ 35 | public int insert(@Param("table_name") String cacherTable, 36 | @Param("page") PageCacheDo page); 37 | 38 | /** 39 | * 寻找缓存表里的网页 40 | * 41 | * @param cacherTable 42 | * @param pageId 43 | */ 44 | public PageCacheDo find(@Param("table_name") String cacherTable, 45 | @Param("page_id") String pageId); 46 | 47 | } -------------------------------------------------------------------------------- /jscrapy-core/src/test/resources/db.properties: -------------------------------------------------------------------------------- 1 | ##################################### h2 queue 2 | #spring.h2queue.queue.datasource.url=jdbc:h2queue:~/.jscrapy/h2queue/jscrapy_test_queue;mode=MySQL;DB_CLOSE_DELAY=-1;DB_CLOSE_ON_EXIT=FALSE;AUTO_SERVER=TRUE 3 | spring.h2.queue.datasource.url=jdbc:h2:tcp://localhost/~/.jscrapy/h2queue/jscrapy_test_queue;mode=MySQL;DB_CLOSE_DELAY=-1;DB_CLOSE_ON_EXIT=FALSE;AUTO_SERVER=TRUE 4 | spring.h2.queue.datasource.username=sa 5 | spring.h2.queue.datasource.password= 6 | spring.h2.queue.datasource.mapperpath=classpath*:mapper/h2queue.xml 7 | 8 | ##################################### h2queue h2cache 9 | spring.h2.cacher.datasource.url=jdbc:h2:tcp://localhost/~/.jscrapy/h2cache/jscrapy_test_cache;mode=MySQL;DB_CLOSE_DELAY=-1;DB_CLOSE_ON_EXIT=FALSE;AUTO_SERVER=TRUE 10 | spring.h2.cacher.datasource.username=sa 11 | spring.h2.cacher.datasource.password= 12 | spring.h2.cacher.datasource.mapperpath=classpath*:mapper/h2pagecache.xml 13 | 14 | ##################################### Pg 15 | 
spring.postgresql.queue.datasource.url=jdbc:postgresql://${postgresql.host}:${postgresql.port}/jscrapy_queue 16 | spring.postgresql.queue.datasource.username=postgres 17 | spring.postgresql.queue.datasource.password= 18 | spring.postgresql.queue.datasource.mapperpath=classpath*:mapper/pgqueue.xml 19 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Jscrapy 2 | [![Build Status](https://travis-ci.org/jscrapy/jscrapy.svg?branch=master)](https://travis-ci.org/jscrapy/jscrapy) 3 | 4 | ## 功能 5 | - 用户友好,一个管控界面解决全部问题 6 | - 动态上线,新站点抓取无需重新发布部署。 7 | - 天生分布式。支持集群动态扩、缩容、分组 8 | - 精准解析。解析脚本化,支持Groovy, Javascript, python 9 | - 支持的去重方式:redis, mongodb, ehcache, 内存 10 | - 支持的队列: redis, mongodb(可靠抓取), ehcache, 内存 11 | - 支持伪分布式:一台机器模拟集群。 12 | - 支持大集群虚拟分组功能:隔离故障,降低维护成本。 13 | - 模块化:方便地使用脚本在运行时控制proxy,http header, url 14 | - 直接对接maven库,让部署新任务自动化、规范化 15 | - 分布式自动调度:无论单机还是多机都提供可靠的调度。不重复,不遗漏。 16 | - 提供低资源占用的js动态渲染解决方案:抓取ajax内容从此简单。 17 | - 增量抓取功能:新闻,股票,竞品数据... 18 | - 多页面数据合并功能:一条完整数据横跨几个页面?没关系! 19 | - 分页抓取功能,毫无乱序。 20 | - 支持URL优先级。 21 | - 提供辅助功能,利用机器学习实现无解析化抓取(实验特性) 22 | - 提供多种数据持久化策略:无论您想直接保存在db,还是希望实时处理,想要的都可以满足。 23 | - 登录:你懂的。 24 | - 验证码破解服务:你懂的,不罗嗦。 25 | - 提供多种变幻莫测的代理策略:当然你只需要管控界面上动动鼠标。 26 | - 解析太头疼?看看解析工具里的武器。表格,列表只需一条语句。 27 | - OCR太深奥?我都给你包好了,绝不放过任何一条数据。 28 | - 自适应线程调节,让“弱网站”全自动化 29 | ## 使用手册TODO 30 | 31 | 32 | ## 特别感谢 33 | 34 | ```text 35 | If I have been able to see further, it was only because I stood on the shoulders of giants. 
/**
 * System-wide default configuration constants.
 * Created by cxu on 2015/9/18.
 */
public class SysDefaultConfig {
    public static final String APP_NAME = "jscrapy";
    public static final String FILE_PATH_SEPERATOR = File.separator;
    /** Default working directory: {@code ~/.jscrapy/} (with trailing separator). */
    public static final String DEFAULT_SPIDER_WORK_DIR = System.getProperty("user.home")
            + FILE_PATH_SEPERATOR
            + "." + APP_NAME
            + FILE_PATH_SEPERATOR;

    /** IP address of this machine; "?.?.?.?" when it cannot be resolved. */
    public static String HOST;
    public static int SCHEDULER_BATCH_SIZE = 1;      // default urls taken from the queue per round
    public static int THREAD_COUNT = 1;              // default worker threads per task
    public static int WAIT_URL_SLEEP_TIME_MS = 1000; // sleep while waiting for new urls

    static {
        // Resolve the local address once at class-load time. The original
        // split this across a catch block and a blank-check on HOST, which
        // only worked because of field-initialization ordering; assign
        // directly instead and fall back to a placeholder on failure so
        // class initialization never throws.
        try {
            HOST = InetAddress.getLocalHost().getHostAddress();
        } catch (UnknownHostException e) {
            HOST = "?.?.?.?";
            //TODO log error
        }
    }
}
/**
 * Immutable description of an outbound proxy: protocol type, optional
 * credentials (normalized to "" when blank), host and port.
 * Created by cxu on 2015/9/29.
 */
public class SpiderProxy {

    public enum ProxyType{
        SOCKS, SOCKS5, HTTP
    }

    private final ProxyType proxyType; // was raw Enum; typed for safety
    private final String userName;     // never null; "" when absent
    private final String password;     // never null; "" when absent
    private final String host;
    private final int port;

    /**
     * @param proxyType protocol spoken by the proxy
     * @param userName  account name; null/blank is normalized to ""
     * @param password  account password; null/blank is normalized to ""
     * @param host      proxy host name or IP
     * @param port      proxy port
     */
    public SpiderProxy(ProxyType proxyType, String userName, String password, String host, int port) {
        this.proxyType = proxyType;
        this.userName = isBlank(userName) ? "" : userName;
        this.password = isBlank(password) ? "" : password;
        this.host = host;
        this.port = port;
    }

    // Null/empty/whitespace-only check, mirroring StringUtils.isBlank so
    // this simple value class needs no commons-lang dependency.
    private static boolean isBlank(String s) {
        return s == null || s.trim().isEmpty();
    }

    /** @return the proxy protocol type (accessor was missing before) */
    public ProxyType getProxyType() {
        return proxyType;
    }

    public String getUserName() {
        return userName;
    }

    public String getPassword() {
        return password;
    }

    public String getHost() {
        return host;
    }

    public int getPort() {
        return port;
    }

    @Override
    public String toString() {
        return userName + ":" + password + "@" + host + ":" + port;
    }
}
9 | */ 10 | public class SnakYamlTest { 11 | 12 | @Test 13 | public void test() { 14 | // JscrapyConfig jscrapyConfig = new JscrapyConfig(); 15 | // TaskBaseConfig taskBaseConfig = new TaskBaseConfig(); 16 | // taskBaseConfig.setTaskName("test"); 17 | // taskBaseConfig.setTaskId("test-id"); 18 | // jscrapyConfig.setTaskBaseConfig(taskBaseConfig); 19 | // 20 | // TaskComponentConfig taskComponentConfig = new MongoDedepConfig(); 21 | // MongoDedepConfig m = (MongoDedepConfig)taskComponentConfig; 22 | // m.setDbName("mongpodb_name"); 23 | // m.setPort(20338); 24 | // jscrapyConfig.setTaskComponentConfig(ComponentName.DEDUP_H2, m); 25 | 26 | DumperOptions options = new DumperOptions(); 27 | options.setDefaultFlowStyle(DumperOptions.FlowStyle.BLOCK); 28 | options.setCanonical(false); 29 | options.setDefaultScalarStyle(DumperOptions.ScalarStyle.PLAIN); 30 | 31 | Yaml yaml = new Yaml(options); 32 | // String s = yaml.dump(jscrapyConfig); 33 | 34 | // System.out.print(s); 35 | } 36 | } 37 | -------------------------------------------------------------------------------- /jscrapy-core/src/main/java/org/jscrapy/core/util/Yaml2BeanUtil.java: -------------------------------------------------------------------------------- 1 | package org.jscrapy.core.util; 2 | 3 | import org.springframework.core.io.Resource; 4 | import org.yaml.snakeyaml.Yaml; 5 | 6 | import java.io.*; 7 | 8 | /** 9 | * yaml配置文件映射为java bean 10 | * Created by cxu on 2016/12/29. 
11 | */ 12 | public class Yaml2BeanUtil { 13 | /** 14 | * 15 | * @param clazz 16 | * @param file 17 | * @return 18 | */ 19 | public static Object loadAsBean(Class clazz, File file) throws FileNotFoundException { 20 | Yaml yaml = new Yaml(); 21 | Object o = yaml.loadAs(new FileInputStream(file), clazz); 22 | return o; 23 | } 24 | 25 | /** 26 | * 27 | * @param clazz 28 | * @param yamlContent 29 | * @return 30 | */ 31 | public static Object loadAsBean(Class clazz, String yamlContent) { 32 | Yaml yaml = new Yaml(); 33 | Object o = yaml.loadAs(yamlContent, clazz); 34 | return o; 35 | } 36 | 37 | /** 38 | * 39 | * @param clazz 40 | * @param resource 41 | * @return 42 | */ 43 | public static Object loadAsBean(Class clazz, Resource resource) throws IOException { 44 | Yaml yaml = new Yaml(); 45 | InputStream ins = resource.getInputStream(); 46 | Object o = yaml.loadAs(ins, clazz); 47 | return o; 48 | } 49 | } 50 | -------------------------------------------------------------------------------- /jscrapy-core/src/main/java/org/jscrapy/core/page/Page.java: -------------------------------------------------------------------------------- 1 | package org.jscrapy.core.page; 2 | 3 | import org.apache.commons.lang3.StringUtils; 4 | import org.jscrapy.core.request.HttpRequest; 5 | 6 | /** 7 | * Created by cxu on 2014/11/21. 
8 | */ 9 | public class Page { 10 | 11 | private boolean isFromCache; 12 | private HttpRequest request; 13 | private String rawText; 14 | 15 | public boolean isFromCache() { 16 | return isFromCache; 17 | } 18 | 19 | public void setIsFromCache(boolean isFromCache) { 20 | this.isFromCache = isFromCache; 21 | } 22 | 23 | public Page() { 24 | 25 | } 26 | 27 | public Page(String rawText){ 28 | this.rawText = rawText; 29 | } 30 | 31 | public String getRawText() { 32 | return rawText; 33 | } 34 | 35 | public void setRawText(String rawText) { 36 | this.rawText = rawText; 37 | } 38 | 39 | public HttpRequest getRequest(){ 40 | return this.request; 41 | } 42 | 43 | public void setRequest(HttpRequest request) { 44 | this.request = request; 45 | } 46 | 47 | public long sizeInKb() { 48 | if (StringUtils.isBlank(rawText)) { 49 | return 0; 50 | } 51 | else{ 52 | int len = rawText.length(); 53 | return Math.round(len / 1024); 54 | } 55 | } 56 | } 57 | -------------------------------------------------------------------------------- /jscrapy-core/src/test/java/downloader/HttpDownloaderTest.java: -------------------------------------------------------------------------------- 1 | package downloader; 2 | 3 | import org.jscrapy.core.config.JscrapyConfig; 4 | import org.jscrapy.core.downloader.Downloader; 5 | import org.jscrapy.core.downloader.impl.HttpDownloader; 6 | import org.jscrapy.core.exception.MySpiderFetalException; 7 | import org.jscrapy.core.page.Page; 8 | import org.jscrapy.core.parser.Html; 9 | import org.jscrapy.core.request.HttpRequest; 10 | import org.testng.annotations.Test; 11 | import util.ResourcePathUtils; 12 | 13 | import static org.testng.Assert.assertEquals; 14 | import static org.testng.Assert.assertNotNull; 15 | 16 | /** 17 | * Created by cxu on 2015/11/7. 
/**
 * Smoke test for {@link HttpDownloader}.
 *
 * NOTE(review): this test hits a live external site (www.oschina.net) and
 * asserts on its current page title, so it is network-dependent and will
 * break whenever the site changes its title — consider a local HTTP stub.
 * Created by cxu on 2015/11/7.
 */
public class HttpDownloaderTest {

    @Test
    public void testDownloadCanWork() throws MySpiderFetalException {
        // NOTE(review): 'path' is computed but never used below.
        String path = ResourcePathUtils.getResourceFileAbsPath(HttpDownloaderTest.class, "/H2CacherTest.yaml");
        JscrapyConfig JscrapyConfig = null;
        JscrapyConfig = new JscrapyConfig();

        String url = "http://www.oschina.net";
        HttpRequest request = new HttpRequest(url);
        Downloader dl = new HttpDownloader(JscrapyConfig);
        // Download the page and parse its <title> to prove the full
        // download-and-parse path works end to end.
        Page pg = dl.download(request);
        assertNotNull(pg);
        Html html = new Html(pg);
        String title = html.$("title").xpath("//title/text()").get();
        assertEquals("开源中国 - 找到您想要的开源项目,分享和交流", title);
    }
}
/**
 * Tests that {@link MyLoggerFactory#getModuleLogger} actually creates a
 * per-day log file under ~/.jscrapy/logs/ and that messages reach it.
 * Created by cxu on 2015/10/31.
 */
public class MyLoggerFactoryTest {
    @Test
    public void testLog() throws IOException {
        String logPath = getLogPath();
        Logger logger = MyLoggerFactory.getModuleLogger(MyLoggerFactoryTest.class.getSimpleName(), logPath);
        // Write a burst of messages so the appender flushes to disk.
        int i = 1000;
        while (i-- > 0) {
            logger.info("hahah");
        }
        String file = getLogFilePath();
        File f = new File(file);
        assertTrue(f.exists());
        // Clean up so repeated runs start from a fresh state.
        FileUtils.deleteQuietly(f);
    }

    // Expected log file: <logPath>/<yyyyMMdd>.log — must match the naming
    // used inside MyLoggerFactory.getModuleLogger.
    private String getLogFilePath() {
        String logPath = getLogPath();
        String file = logPath + DatetimeUtil.getTime("yyyyMMdd") + ".log";
        return file;
    }

    // Log directory used for the test: ~/.jscrapy/logs/ with trailing separator.
    private String getLogPath() {
        String workDir = System.getProperty("user.home");
        String path = workDir + File.separator + ".jscrapy" + File.separator + "logs" + File.separator;
        return path;
    }
}
9 | */ 10 | public class FileLineReader { 11 | /** 12 | * 从offset起读取lines行 13 | * @param filePath 14 | * @param offset 15 | * @param lines 16 | * @return 17 | */ 18 | public static FileLines readLines(String filePath, long offset, int lines) throws IOException { 19 | FileLines fileLines = new FileLines(); 20 | 21 | File f = new File(filePath) ; // 指定要操作的文件 22 | if (!f.exists()) { 23 | fileLines.setOffset(offset); 24 | return fileLines; 25 | } 26 | else if(f.length()<=offset){//游标大于文件 27 | fileLines.setOffset(offset); 28 | return fileLines; 29 | } 30 | RandomAccessFile rdf = null;// 声明RandomAccessFile类的对象 31 | rdf = new RandomAccessFile(f, "r"); 32 | 33 | rdf.seek(offset); 34 | for (int i = 0; i < lines; i++) { 35 | String str = rdf.readLine(); 36 | if (str != null) { 37 | fileLines.addLine(str); 38 | fileLines.setOffset(rdf.getFilePointer()); 39 | } 40 | else{ 41 | break; 42 | } 43 | } 44 | rdf.close(); 45 | return fileLines; 46 | } 47 | } 48 | -------------------------------------------------------------------------------- /docs/单机部署组件.md: -------------------------------------------------------------------------------- 1 | # 单机部署模式详细 2 | 3 | 单机模式下的Jscrapy需要的组件如下: 4 | 5 | | *组件* | *作用* | 6 | | ---------------------------------------- | --------------- | 7 | | [H2](http://www.h2database.com) | 元数据管理 | 8 | | H2 | URL队列 | 9 | | bigqueue | admin与service交互 | 10 | | H2([MVStore](http://www.h2database.com/html/mvstore.html)) | Url去重 | 11 | | 磁盘json文件 | 数据存储 | 12 | | 磁盘文件 | 网页缓存 | 13 | 14 | 15 | 16 | 最简单情况下你只需要启动应用程序, H2,bigqueue,orientdb都是java的嵌入式组件, 17 | 在打包好的应用程序里已经存在了。 18 | 19 | 某些情况下,虽然你部署了单机模式,但是为了容易处理数据还是希望能把数据直接保存 20 | 到一个更强大的Db里,这个时候你可以部署一台mongoDb。 21 | 22 | > 有一点需要记住,单机还是集群在Jscrapy的设计中只是一个逻辑上的概念,他们共享 23 | > 代码。一个单机的程序可以无缝切换为集群上的一个节点。集群上的一个节点也可以随时 24 | > 退出集群当做单机程序来使用。 25 | 26 | > 数据存储和网页缓存本来希望用一个嵌入式的nosql, 基于如下考量: 27 | > 28 | > 1. 嵌入式nosql和分布式环境下的nosql保持接口和形式上的高度一致,设计简单易理解 29 | > 2. 
/**
 * Factory for slf4j loggers and per-task log4j file loggers.
 * Created by cxu on 2015/10/27.
 */
public class MyLoggerFactory {
    /** Plain slf4j logger for the given class. */
    public static Logger getLogger(Class clazz) {
        Logger logger = org.slf4j.LoggerFactory.getLogger(clazz);
        return logger;
    }

    /**
     * Builds (or rebuilds) a dedicated log4j logger named {@code taskFp}
     * writing to {@code <logPath>/<yyyyMMdd>.log}.
     *
     * NOTE(review): the date is baked into the file name at call time and a
     * RollingFileAppender (not a daily-rolling one) is attached, so a
     * long-running task keeps writing to the day it started — confirm that
     * is intended. Each call replaces all existing appenders on the logger.
     *
     * @param taskFp  logger name (task fingerprint)
     * @param logPath log directory; a trailing separator is appended if missing
     * @return a DEBUG-level, non-additive logger writing to the task file
     */
    public static org.apache.log4j.Logger getModuleLogger(String taskFp, String logPath)
    {
        if(!logPath.endsWith(File.separator)){
            logPath = logPath + File.separator;
        }
        logPath = logPath + DatetimeUtil.getTime("yyyyMMdd") + ".log";

        org.apache.log4j.Logger logger = org.apache.log4j.Logger.getLogger(taskFp);
        logger.removeAllAppenders();
        logger.setLevel(Level.DEBUG);
        logger.setAdditivity(false);// do not propagate to parent appenders (other files)
        FileAppender appender = new RollingFileAppender();
        Layout layout = new PatternLayout("%d %p [%c] - %m%n");
        appender.setFile(logPath);
        appender.setAppend(true);
        appender.setLayout(layout);
        appender.setEncoding("UTF-8");
        appender.activateOptions();
        logger.addAppender(appender);
        return logger;
    }

}
org.jscrapy.core.request.HttpRequest; 4 | import org.jscrapy.core.request.HttpRequestMethod; 5 | import org.testng.annotations.DataProvider; 6 | import org.testng.annotations.Test; 7 | 8 | import java.io.IOException; 9 | import java.util.Map; 10 | 11 | import static org.testng.Assert.assertEquals; 12 | 13 | /** 14 | * Created by cxu on 2015/5/21. 15 | */ 16 | public class FetchRequestTest { 17 | @DataProvider(name = "queue_request_provider") 18 | public Object[][] rangeData() { 19 | 20 | return new Object[][] { 21 | {"{\"url\":\"http://oxf1.com\",\"http_method\":\"POST\",\"post_parms\":{\"site\":\"oxf1\", \"person\":\"cxu\"}}", "http://oxf1.com", HttpRequestMethod.POST}, 22 | {"{\"url\":\"http://oxf1.com/test.html\",\"http_method\":\"GET\"}", "http://oxf1.com/test.html", HttpRequestMethod.GET}, 23 | {"{}", null, null }, 24 | }; 25 | } 26 | 27 | @Test(dataProvider = "queue_request_provider") 28 | public void testBuild(String jsonStr, String url , HttpRequestMethod method) throws IOException { 29 | HttpRequest req = HttpRequest.build(jsonStr); 30 | assert req!=null : "jackson parse result can not empty!"; 31 | 32 | assertEquals(req.getUrl(), url); 33 | assertEquals(req.getHttpMethod(), method); 34 | if(req.getParameters()!=null) { 35 | Map map = req.getParameters(); 36 | assertEquals("oxf1", map.get("site")); 37 | assertEquals("cxu", map.get("person")); 38 | } 39 | } 40 | 41 | 42 | 43 | 44 | } 45 | -------------------------------------------------------------------------------- /jscrapy-core/src/main/java/org/jscrapy/core/downloader/impl/HttpDownloader.java: -------------------------------------------------------------------------------- 1 | package org.jscrapy.core.downloader.impl; 2 | 3 | import org.jscrapy.core.config.JscrapyConfig; 4 | import org.jscrapy.core.downloader.DownloadResponse; 5 | import org.jscrapy.core.downloader.Downloader; 6 | import org.jscrapy.core.page.Page; 7 | import org.jscrapy.core.request.HttpRequest; 8 | 9 | import java.io.IOException; 10 
/**
 * Downloader that delegates the HTTP round-trip to OkHttp and wraps the
 * response bytes into a {@link Page}.
 *
 * NOTE(review): {@code okHttpDownloader} is never assigned anywhere in this
 * class — the constructor only stores the config — so download() would NPE
 * unless the field is injected elsewhere; confirm how it gets initialized.
 * Created by cxu on 2014/11/21.
 */
public class HttpDownloader extends Downloader {

    private OkHttpDownloaderImpl okHttpDownloader;

    public HttpDownloader(JscrapyConfig JscrapyConfig) {
        setJscrapyConfig(JscrapyConfig);
    }

    /**
     * Downloads the given request and wraps the response into a Page.
     *
     * @param request the request to fetch
     * @return the downloaded page, or null on I/O / encoding failure
     *         (errors are only printed, not rethrown)
     */
    @Override
    public Page download(HttpRequest request) {

        try {
            DownloadResponse response = okHttpDownloader.doDownload(request);
            return getPage(request, response);
        } catch (UnsupportedEncodingException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        }
        return null;//TODO better to throw an exception here instead of returning null
    }

    /**
     * Decodes the response bytes with the response's charset and builds a
     * non-cached Page bound to the originating request.
     *
     * @param request  the request that produced the response
     * @param response the raw download response
     * @return the decoded page
     * @throws UnsupportedEncodingException if the response charset is unknown
     */
    private Page getPage(HttpRequest request, DownloadResponse response) throws UnsupportedEncodingException {
        Page page = new Page();
        String utf8Content = new String(response.getContent(), response.getCharset());
        page.setRawText(utf8Content);
        page.setIsFromCache(false);
        page.setRequest(request);
        return page;
    }
}
args) { 22 | ArrayList objs = new ArrayList(); 23 | for (Object o : args) { 24 | objs.add(o); 25 | } 26 | String msg = MessageFormatter.arrayFormat(msgPattern, objs.toArray()).getMessage(); 27 | 28 | log(logger, methodName, msg); 29 | } 30 | 31 | /** 32 | * 33 | * @param logger 34 | * @param methodName 35 | * @param msg 36 | */ 37 | public static void log(Object logger, String methodName, String msg) { 38 | //动态调用 39 | if (logger == null) { 40 | return; 41 | } 42 | Method method = null; 43 | try { 44 | method = logger.getClass().getMethod(methodName, new Class[]{String.class}); 45 | } catch (NoSuchMethodException e) { 46 | 47 | } 48 | 49 | if (method != null) { 50 | try { 51 | method.invoke(logger, msg); 52 | } catch (IllegalAccessException e) { 53 | e.printStackTrace(); 54 | } catch (InvocationTargetException e) { 55 | e.printStackTrace(); 56 | } 57 | } 58 | } 59 | } 60 | -------------------------------------------------------------------------------- /docs/技术选型.md: -------------------------------------------------------------------------------- 1 | ## URL管理组件 2 | | | 单机 | 集群 | 内存 | 磁盘 | URL优先级 | 不丢数据/错误恢复 | 缺点 | 3 | | --------- | ---: | :--- | ---- | ---- | --------- | --------- | -------------------- | 4 | | mapDb | Y | | Y | Y | FIFO/LIFO | Y | | 5 | | h2 | Y | | Y | Y | 数字优先级 | Y | | 6 | | bigQueue | Y | | | Y | FIFO | Y | | 7 | | mongoDb | Y | Y | | Y | 数字优先级 | Y | findAndModify每次只能取一条 | 8 | | kafka | Y | Y | | Y | FIFO | Y | | 9 | | pg | Y | Y | | Y | 数字优先级 | Y | | 10 | | redis | Y | Y | Y | | FIFO | | redis崩溃时所有任务不能恢复 | 11 | | JDK queue | Y | | Y | | FIFO/LIFO | | 程序退出,不能断点继续 | 12 | | | | | | | | | | 13 | 14 | ### 核心URL管理组件 15 | 16 | 1. ~~mapDb:单机方式,小数量URL队列。~~ h2完全可以囊括mapDb的功能。 17 | 2. h2:单机方式,小数量URL,要求URL严格不丢失。 18 | 3. 
postgreSql:集群方式,大URL量级,严格要求数据不丢失。 19 | 20 | 21 | 22 | > 这里没有选用mongoDb的主要原因是,mongoDb只支持每次出队列一条url,在任务量非常大的情况下对机器性能会有非常大的压力。 23 | 24 | 25 | 26 | ## URL去重组件 27 | 28 | | | 单机 | 集群 | 内存 | 磁盘 | 错误恢复 | 29 | | ------- | ---- | ---- | ---- | ---- | ---- | 30 | | mapDb | Y | | | Y | | 31 | | JDK Set | Y | | Y | | | 32 | | redis | Y | Y | Y | | | 33 | | mongoDb | Y | Y | | Y | Y | 34 | | H2 | Y | Y | Y | Y | Y | 35 | 36 | ### 核心URL去重组件 37 | 38 | 1. H2(MVStore):单机,去重。MVStore支持内存和磁盘两种方式,所以对比mapDb就没有必要再引入mapDb了。 39 | 2. mongoDb:集群方式,去重。 -------------------------------------------------------------------------------- /jscrapy-core/src/main/java/org/jscrapy/core/dal/QueueLockDo.java: -------------------------------------------------------------------------------- 1 | package org.jscrapy.core.dal; 2 | 3 | import java.util.Date; 4 | 5 | /** 6 | * Created by cxu on 2018/2/12. 7 | */ 8 | public class QueueLockDo { 9 | private Long id; 10 | private Date gmtCreate; 11 | private Date gmtModified; 12 | private String taskId; 13 | private String description; 14 | 15 | /** 16 | * 17 | * @param id 18 | * @param gmtCreate 19 | * @param gmtModified 20 | * @param taskId 21 | * @param description 22 | */ 23 | public QueueLockDo(Long id, Date gmtCreate, Date gmtModified, String taskId, String description) { 24 | this.id = id; 25 | this.gmtCreate = gmtCreate; 26 | this.gmtModified = gmtModified; 27 | this.taskId = taskId; 28 | this.description = description; 29 | } 30 | 31 | public Long getId() { 32 | return id; 33 | } 34 | 35 | public void setId(Long id) { 36 | this.id = id; 37 | } 38 | 39 | public Date getGmtCreate() { 40 | return gmtCreate; 41 | } 42 | 43 | public void setGmtCreate(Date gmtCreate) { 44 | this.gmtCreate = gmtCreate; 45 | } 46 | 47 | public Date getGmtModified() { 48 | return gmtModified; 49 | } 50 | 51 | public void setGmtModified(Date gmtModified) { 52 | this.gmtModified = gmtModified; 53 | } 54 | 55 | public String getTaskId() { 56 | return taskId; 57 | } 58 | 59 | public void 
setTaskId(String taskId) { 60 | this.taskId = taskId; 61 | } 62 | 63 | public String getDescription() { 64 | return description; 65 | } 66 | 67 | public void setDescription(String description) { 68 | this.description = description; 69 | } 70 | } 71 | -------------------------------------------------------------------------------- /jscrapy-core/src/main/java/org/jscrapy/core/dal/UrlQueueMapper.java: -------------------------------------------------------------------------------- 1 | package org.jscrapy.core.dal; 2 | 3 | import org.apache.ibatis.annotations.Param; 4 | import org.jscrapy.core.request.UrlStatus; 5 | 6 | import java.util.List; 7 | 8 | /** 9 | * Created by cxu on 2016/8/5. 10 | */ 11 | public interface UrlQueueMapper { 12 | /** 13 | * @param queueName 14 | */ 15 | public int createNewQueue(@Param("queue_name") String queueName); 16 | 17 | /** 18 | * 删除队列 19 | * 20 | * @param queueName 21 | */ 22 | public void dropQueue(@Param("queue_name") String queueName); 23 | 24 | /** 25 | * 批量插入队列 26 | * 27 | * @return 28 | */ 29 | public int batchInsert(@Param("queue_name") String queueName, 30 | @Param("urls") List urls); 31 | 32 | /** 33 | * @return 34 | */ 35 | public List selectUrlByStatus(@Param("queue_name") String tableName, 36 | @Param("url_status") UrlStatus urlStatus, 37 | @Param("limit") int limit); 38 | 39 | /** 40 | * 批量更新队列 41 | */ 42 | public int batchUpdate(@Param("queue_name") String queueName, 43 | @Param("urls") List urls); 44 | 45 | /** 46 | * 批量更新队列 47 | */ 48 | public int batchUpdateUrlStatus(@Param("queue_name") String queueName, 49 | @Param("url_status") UrlStatus urlStatus, 50 | @Param("urls") List urls); 51 | 52 | /** 53 | * 批量删除 54 | */ 55 | public int batchDelete(@Param("queue_name") String queueName, 56 | @Param("urls") List urls); 57 | 58 | } 59 | -------------------------------------------------------------------------------- /jscrapy-core/src/test/java/dedup/DeDupTest.java: 
-------------------------------------------------------------------------------- 1 | package dedup; 2 | 3 | import org.jscrapy.core.dedup.DeDup; 4 | import org.jscrapy.core.exception.MySpiderFetalException; 5 | import org.jscrapy.core.exception.MySpiderRecoverableException; 6 | import org.jscrapy.core.request.HttpRequest; 7 | import org.jscrapy.core.request.HttpRequestMethod; 8 | import org.testng.annotations.DataProvider; 9 | import org.testng.annotations.Test; 10 | 11 | import java.io.IOException; 12 | import java.util.ArrayList; 13 | import java.util.List; 14 | 15 | import static org.testng.Assert.assertEquals; 16 | 17 | /** 18 | * Created by cxu on 2015/9/19. 19 | */ 20 | public class DeDupTest { 21 | private HttpRequest rq = new HttpRequest("http://url1", HttpRequestMethod.DELETE, null); 22 | 23 | @DataProvider(name = "dp") 24 | public Object[][] dataProvider() throws IOException, MySpiderFetalException { 25 | return new DeDup[][]{ 26 | {}, 27 | }; 28 | } 29 | 30 | 31 | @Test(dataProvider = "dp") 32 | public void test(DeDup dedup) throws MySpiderRecoverableException { 33 | 34 | List req1 = new ArrayList<>(); 35 | req1.add(rq); 36 | req1 = dedup.deDup(req1); 37 | assertEquals(1, req1.size()); 38 | 39 | //先测试写入原来一样的,返回非空 40 | HttpRequest rq1 = rq; 41 | HttpRequest rq2 = new HttpRequest("http://url2", HttpRequestMethod.DELETE, null); 42 | HttpRequest rq3 = new HttpRequest("http://url3", HttpRequestMethod.DELETE, null); 43 | 44 | List req = new ArrayList<>(); 45 | req.add(rq1); 46 | req.add(rq2); 47 | req.add(rq3); 48 | 49 | req = dedup.deDup(req); 50 | assertEquals(2, req.size()); 51 | assertEquals(0, dedup.deDup(req).size()); 52 | 53 | } 54 | 55 | } 56 | -------------------------------------------------------------------------------- /jscrapy-ext/src/main/java/org/jscrapy/ext/dedup/RedisDedup.java: -------------------------------------------------------------------------------- 1 | package org.jscrapy.ext.dedup; 2 | 3 | import 
org.jscrapy.core.config.JscrapyConfig; 4 | import org.jscrapy.core.dedup.DeDup; 5 | import org.jscrapy.core.request.Request; 6 | import org.jscrapy.ext.modulecfg.RedisDedupConfig; 7 | import redis.clients.jedis.Jedis; 8 | import redis.clients.jedis.JedisPool; 9 | import redis.clients.jedis.JedisPoolConfig; 10 | 11 | /** 12 | * redis 实现的去重 13 | * Created by cxu on 2015/6/22. 14 | */ 15 | public class RedisDedup extends DeDup { 16 | 17 | private static final String DEDUP_SET_PREFIX = "jscrapy_dedup_set_"; 18 | private JedisPool pool; 19 | 20 | public RedisDedup() { 21 | 22 | } 23 | 24 | public void setJscrapyConfig(JscrapyConfig jscrapyConfig) { 25 | super.setJscrapyConfig(jscrapyConfig); 26 | RedisDedupConfig redisDedepConfig = null;//(RedisDedupConfig) jscrapyConfig.get(ComponentName.DEDUP_REDIS); 27 | String redisHost = redisDedepConfig.getHost(); 28 | this.pool = new JedisPool(new JedisPoolConfig(), redisHost); 29 | } 30 | 31 | /** 32 | * @param request 33 | * @return 已经存在返回true, 否则false 34 | */ 35 | @Override 36 | protected boolean isDup(Request request) { 37 | 38 | try (Jedis jedis = pool.getResource()) { 39 | boolean isDuplicate = jedis.sismember(getDedupSetKey(), request.uniqId()); 40 | if (!isDuplicate) { 41 | jedis.sadd(getDedupSetKey(), request.uniqId()); 42 | } 43 | return isDuplicate; 44 | } 45 | } 46 | 47 | /** 48 | * 去重的redis 集合(Set)的key 49 | * 50 | * @return 51 | */ 52 | private String getDedupSetKey() { 53 | String dedupSetKey = DEDUP_SET_PREFIX + getJscrapyConfig().getTaskFp(); 54 | 55 | return dedupSetKey; 56 | } 57 | } 58 | -------------------------------------------------------------------------------- /jscrapy-core/src/main/java/org/jscrapy/core/downloader/DownloadResponse.java: -------------------------------------------------------------------------------- 1 | package org.jscrapy.core.downloader; 2 | 3 | import org.apache.commons.collections4.MultiValuedMap; 4 | import org.apache.commons.collections4.multimap.ArrayListValuedHashMap; 5 | 6 | 
import java.util.Collection; 7 | 8 | /** 9 | * Created by cxu on 2015/9/30. 10 | */ 11 | public class DownloadResponse { 12 | private MultiValuedMap headers = new ArrayListValuedHashMap(); 13 | private boolean success = true; 14 | private int statusCode; 15 | private String charset; 16 | private byte[] content; 17 | 18 | public boolean isSuccess() { 19 | return success; 20 | } 21 | 22 | public void setSuccess(boolean success) { 23 | this.success = success; 24 | } 25 | 26 | public int getStatusCode() { 27 | return statusCode; 28 | } 29 | 30 | public void setStatusCode(int statusCode) { 31 | this.statusCode = statusCode; 32 | } 33 | 34 | public String getCharset() { 35 | return charset; 36 | } 37 | 38 | public void setCharset(String charset) { 39 | this.charset = charset; 40 | } 41 | 42 | public byte[] getContent() { 43 | return content; 44 | } 45 | 46 | public void setContent(byte[] content) { 47 | this.content = content; 48 | } 49 | 50 | public void addHeader(String key, String value) { 51 | headers.put(key, value); 52 | } 53 | 54 | public Collection getHeaders(String key) { 55 | Collection values = headers.get(key); 56 | return values; 57 | } 58 | 59 | public String getHeader(String key) { 60 | String value = null; 61 | 62 | Collection values = getHeaders(key); 63 | if (values != null && values.size() > 0) { 64 | value = values.iterator().next(); 65 | } 66 | 67 | return value; 68 | } 69 | } 70 | -------------------------------------------------------------------------------- /jscrapy-core/src/main/java/org/jscrapy/core/dal/PageCacheDo.java: -------------------------------------------------------------------------------- 1 | package org.jscrapy.core.dal; 2 | 3 | import java.util.Date; 4 | 5 | /** 6 | * Created by cxu on 2017/1/8. 
7 | */ 8 | public class PageCacheDo { 9 | private Long id;// record id 10 | private String pageId; //去重ID 11 | private Date gmtCreated; 12 | private Date gmtAccess;//服务器返回的最后修改时间 13 | private String etag;//服务器返回的etag 14 | private String pageContent;//网页内容 15 | 16 | public PageCacheDo() { 17 | 18 | } 19 | 20 | public PageCacheDo(Long id, String pageId, Date gmtCreated, Date gmtAccess, String etag, String pageContent) { 21 | this.id = id; 22 | this.pageId = pageId; 23 | this.gmtCreated = gmtCreated; 24 | this.gmtAccess = gmtAccess; 25 | this.etag = etag; 26 | this.pageContent = pageContent; 27 | } 28 | 29 | public long getId() { 30 | return id; 31 | } 32 | 33 | public void setId(long id) { 34 | this.id = id; 35 | } 36 | 37 | public String getPageId() { 38 | return pageId; 39 | } 40 | 41 | public void setPageId(String pageId) { 42 | this.pageId = pageId; 43 | } 44 | 45 | public Date getGmtCreated() { 46 | return gmtCreated; 47 | } 48 | 49 | public void setGmtCreated(Date gmtCreated) { 50 | this.gmtCreated = gmtCreated; 51 | } 52 | 53 | public Date getGmtAccess() { 54 | return gmtAccess; 55 | } 56 | 57 | public void setGmtAccess(Date gmtAccess) { 58 | this.gmtAccess = gmtAccess; 59 | } 60 | 61 | public String getEtag() { 62 | return etag; 63 | } 64 | 65 | public void setEtag(String etag) { 66 | this.etag = etag; 67 | } 68 | 69 | public String getPageContent() { 70 | return pageContent; 71 | } 72 | 73 | public void setPageContent(String pageContent) { 74 | this.pageContent = pageContent; 75 | } 76 | } 77 | -------------------------------------------------------------------------------- /jscrapy-core/src/main/java/org/jscrapy/core/spider/Spider.java: -------------------------------------------------------------------------------- 1 | package org.jscrapy.core.spider; 2 | 3 | import org.jscrapy.core.JscrapyComponent; 4 | import org.jscrapy.core.cacher.Cacher; 5 | import org.jscrapy.core.comsumer.UrlConsumer; 6 | import org.jscrapy.core.config.JscrapyConfig; 7 | import 
org.jscrapy.core.dedup.DeDup; 8 | import org.jscrapy.core.downloader.Downloader; 9 | import org.jscrapy.core.pipline.Pipline; 10 | import org.jscrapy.core.processor.Processor; 11 | import org.jscrapy.core.producer.UrlProducer; 12 | import org.slf4j.Logger; 13 | import org.slf4j.LoggerFactory; 14 | import org.springframework.beans.factory.annotation.Autowired; 15 | 16 | /** 17 | * 一个线程要做的事情 18 | * Created by cxu on 2015/6/20. 19 | */ 20 | public abstract class Spider extends JscrapyComponent implements Runnable { 21 | final static Logger logger = LoggerFactory.getLogger(Spider.class); 22 | 23 | @Autowired 24 | protected DeDup dedup; 25 | @Autowired 26 | protected Cacher cacher; 27 | @Autowired 28 | protected Downloader downloader; 29 | /** 30 | * 这里为什么没有采用webmagic一样的存到多个存储里? 31 | * 1,作者认为一份数据存多个地方是多此一举,不是爬虫的核心功能,核心应该尽量 32 | * 简单易理解维护,因为爬虫面临的情况太多了。 33 | * 2,存储多个数据源可以利用其他工具进行同步。 34 | * 3,与其存储到多个地方,不如开发存储成功之后发消息的方式更实用 35 | */ 36 | @Autowired 37 | protected Pipline pipline; 38 | @Autowired 39 | protected Processor processor; 40 | @Autowired 41 | protected UrlProducer urlProducer; 42 | @Autowired 43 | protected UrlConsumer urlConsumer; 44 | 45 | public Spider(JscrapyConfig jscrapyConfig) { 46 | setJscrapyConfig(jscrapyConfig); 47 | } 48 | 49 | @Override 50 | public void run() { 51 | 52 | while (!Thread.currentThread().isInterrupted() && canRun()) { 53 | 54 | mainLoop(); 55 | 56 | } 57 | } 58 | 59 | private boolean canRun(){ 60 | return true;//TODO 61 | } 62 | 63 | protected abstract void mainLoop(); 64 | } 65 | -------------------------------------------------------------------------------- /jscrapy-common/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 4.0.0 3 | 4 | org.jscrapy 5 | jscrapy 6 | 1.0-SNAPSHOT 7 | 8 | org.jscrapy 9 | jscrapy-common 10 | 1.0-SNAPSHOT 11 | jar 12 | jscrapy-common 13 | 14 | 15 | commons-io 16 | commons-io 17 | ${commons-io.version} 18 | 19 | 20 | org.apache.commons 21 | commons-lang3 22 | 
${commons-lang3.version} 23 | 24 | 25 | org.yaml 26 | snakeyaml 27 | ${snakyaml.version} 28 | 29 | 30 | org.jodd 31 | jodd-http 32 | ${jodd.version} 33 | 34 | 35 | log4j 36 | log4j 37 | ${log4j.version} 38 | 39 | 40 | org.apache.commons 41 | commons-collections4 42 | ${commons-collections.version} 43 | 44 | 45 | 46 | -------------------------------------------------------------------------------- /jscrapy-core/src/test/java/util/Yaml2BeanUtilTest.java: -------------------------------------------------------------------------------- 1 | package util; 2 | 3 | import org.apache.commons.io.FileUtils; 4 | import org.jscrapy.core.config.JscrapyConfig; 5 | import org.jscrapy.core.util.Yaml2BeanUtil; 6 | import org.springframework.core.io.ClassPathResource; 7 | import org.springframework.core.io.Resource; 8 | import org.testng.annotations.Test; 9 | 10 | import java.io.File; 11 | import java.io.FileNotFoundException; 12 | import java.io.IOException; 13 | 14 | import static org.testng.Assert.assertEquals; 15 | import static org.testng.AssertJUnit.assertNotNull; 16 | 17 | /** 18 | * Created by cxu on 2016/12/29. 
19 | */ 20 | public class Yaml2BeanUtilTest { 21 | 22 | @Test 23 | public void testString2Bean() throws IOException { 24 | String path = ResourcePathUtils.getResourceFileAbsPath(Yaml2BeanUtil.class, "/yaml2beanTest.yaml"); 25 | String yamlString = FileUtils.readFileToString(new File(path)); 26 | JscrapyConfig config = (JscrapyConfig) Yaml2BeanUtil.loadAsBean(JscrapyConfig.class, yamlString); 27 | assertNotNull(config); 28 | assertEquals(config.getTaskId(), "task.id"); 29 | 30 | } 31 | 32 | @Test 33 | public void testFile2Bean() throws FileNotFoundException { 34 | String path = ResourcePathUtils.getResourceFileAbsPath(Yaml2BeanUtil.class, "/yaml2beanTest.yaml"); 35 | File f = new File(path); 36 | JscrapyConfig config = (JscrapyConfig) Yaml2BeanUtil.loadAsBean(JscrapyConfig.class, f); 37 | assertNotNull(config); 38 | assertEquals(config.getTaskId(), "task.id"); 39 | } 40 | 41 | @Test 42 | public void testResource2Bean() throws IOException { 43 | Resource resource = new ClassPathResource("yaml2beanTest.yaml"); 44 | JscrapyConfig config = (JscrapyConfig) Yaml2BeanUtil.loadAsBean(JscrapyConfig.class, resource); 45 | assertNotNull(config); 46 | assertEquals(config.getTaskId(), "task.id"); 47 | } 48 | 49 | 50 | } 51 | -------------------------------------------------------------------------------- /jscrapy-core/src/main/java/org/jscrapy/core/status/TaskStatus.java: -------------------------------------------------------------------------------- 1 | package org.jscrapy.core.status; 2 | 3 | import java.util.concurrent.atomic.AtomicLong; 4 | 5 | /** 6 | * Created by cxu on 2015/9/30. 
7 | */ 8 | public class TaskStatus { 9 | public enum Status{ 10 | RUN, PAUSE, CANCEL 11 | } 12 | 13 | private AtomicLong cacheHitUrlCnt = new AtomicLong(0); //缓存命中的个数 14 | private AtomicLong netUrlCnt = new AtomicLong(0);//网络抓取的个数 15 | private AtomicLong failedUrlCnt = new AtomicLong(0);//失败抓取的次数 16 | private AtomicLong pageSizeKb = new AtomicLong(0);//页面大小 17 | private AtomicLong dataItemCnt = new AtomicLong(0);//解析出的数据条数 18 | 19 | public AtomicLong getCacheHitUrlCnt() { 20 | return cacheHitUrlCnt; 21 | } 22 | 23 | public void setCacheHitUrlCnt(AtomicLong cacheHitUrlCnt) { 24 | this.cacheHitUrlCnt = cacheHitUrlCnt; 25 | } 26 | 27 | public AtomicLong getNetUrlCnt() { 28 | return netUrlCnt; 29 | } 30 | 31 | public void setNetUrlCnt(AtomicLong netUrlCnt) { 32 | this.netUrlCnt = netUrlCnt; 33 | } 34 | 35 | public AtomicLong getFailedUrlCnt() { 36 | return failedUrlCnt; 37 | } 38 | 39 | public void setFailedUrlCnt(AtomicLong failedUrlCnt) { 40 | this.failedUrlCnt = failedUrlCnt; 41 | } 42 | 43 | public AtomicLong getPageSizeKb() { 44 | return pageSizeKb; 45 | } 46 | 47 | public void setPageSizeKb(AtomicLong pageSizeKb) { 48 | this.pageSizeKb = pageSizeKb; 49 | } 50 | 51 | public AtomicLong getDataItemCnt() { 52 | return dataItemCnt; 53 | } 54 | 55 | public void setDataItemCnt(AtomicLong dataItemCnt) { 56 | this.dataItemCnt = dataItemCnt; 57 | } 58 | 59 | @Override 60 | public String toString() { 61 | return "TaskStatus{" + 62 | "cacheHitUrlCnt=" + cacheHitUrlCnt + 63 | ", netUrlCnt=" + netUrlCnt + 64 | ", failedUrlCnt=" + failedUrlCnt + 65 | ", pageSizeKb=" + pageSizeKb + 66 | ", dataItemCnt=" + dataItemCnt + 67 | '}'; 68 | } 69 | } 70 | -------------------------------------------------------------------------------- /jscrapy-ext/src/main/java/org/jscrapy/ext/cacher/MongoCacher.java: -------------------------------------------------------------------------------- 1 | package org.jscrapy.ext.cacher; 2 | 3 | import com.mongodb.*; 4 | import 
org.jscrapy.core.cacher.Cacher; 5 | import org.jscrapy.core.config.JscrapyConfig; 6 | import org.jscrapy.core.page.Page; 7 | import org.jscrapy.core.request.HttpRequest; 8 | import org.slf4j.Logger; 9 | import org.slf4j.LoggerFactory; 10 | 11 | /** 12 | * Created by cxu on 2015/7/12. 13 | */ 14 | public class MongoCacher extends Cacher { 15 | final static Logger logger = LoggerFactory.getLogger(MongoCacher.class); 16 | private static final String DB_PRIMARY_KEY = "id"; 17 | private static final String DB_CACHE_FIELD_NAME = "page"; 18 | protected DB db = null; 19 | protected DBCollection collection = null; 20 | private Mongo mongo = null; 21 | 22 | public MongoCacher(JscrapyConfig jscrapyConfig) { 23 | setJscrapyConfig(jscrapyConfig); 24 | String dbHost = jscrapyConfig.getMongoCacheHost(); 25 | int dbPort = jscrapyConfig.getMongoCachePort(); 26 | String dbName = jscrapyConfig.getMongoCacheDbName(); 27 | String tableName = jscrapyConfig.getMongoCacheTableName(); 28 | this.mongo = new MongoClient(dbHost, dbPort); 29 | this.db = mongo.getDB(dbName); 30 | this.collection = db.getCollection(tableName); 31 | } 32 | 33 | @Override 34 | public Page loadPage(HttpRequest request) { 35 | BasicDBObject query = new BasicDBObject(); 36 | query.append(DB_PRIMARY_KEY, request.uniqId()); 37 | BasicDBObject obj = (BasicDBObject) collection.findOne(query); 38 | if (obj != null) { 39 | Page pg = new Page(obj.getString(DB_CACHE_FIELD_NAME)); 40 | pg.setIsFromCache(true); 41 | return pg; 42 | } 43 | 44 | return null; 45 | } 46 | 47 | @Override 48 | public void cachePage(Page page) { 49 | DBObject pageDoc = new BasicDBObject(); 50 | pageDoc.put(DB_PRIMARY_KEY, page.getRequest().uniqId()); 51 | pageDoc.put(DB_CACHE_FIELD_NAME, page.getRawText()); 52 | this.collection.insert(pageDoc); 53 | } 54 | } 55 | -------------------------------------------------------------------------------- /jscrapy-ext/src/main/java/org/jscrapy/ext/dedup/MongoDedup.java: 
-------------------------------------------------------------------------------- 1 | package org.jscrapy.ext.dedup; 2 | 3 | import com.mongodb.*; 4 | import org.jscrapy.core.config.JscrapyConfig; 5 | import org.jscrapy.core.config.modulecfg.MongoDedepConfig; 6 | import org.jscrapy.core.dedup.DeDup; 7 | import org.jscrapy.core.request.Request; 8 | 9 | /** 10 | * Created by cxu on 2015/7/12. 11 | */ 12 | public class MongoDedup extends DeDup { 13 | private static final String DB_PRIMARY_KEY = "_id"; 14 | private static final String DB_CACHE_FIELD_NAME = "is_page_exit"; 15 | protected DB db = null; 16 | protected DBCollection collection = null; 17 | private Mongo mongo = null; 18 | 19 | public MongoDedup(JscrapyConfig jscrapyConfig) { 20 | setJscrapyConfig(jscrapyConfig); 21 | } 22 | 23 | public MongoDedup() { 24 | 25 | } 26 | 27 | /** 28 | * 29 | * @param jscrapyConfig 30 | */ 31 | public void setJscrapyConfig(JscrapyConfig jscrapyConfig) { 32 | super.setJscrapyConfig(jscrapyConfig); 33 | MongoDedepConfig mongoDedupConfig = null;//(MongoDedepConfig)jscrapyConfig.get(ComponentName.DEDUP_MONGO); 34 | String dbHost = mongoDedupConfig.getHost(); 35 | int dbPort = mongoDedupConfig.getPort(); 36 | String dbName = mongoDedupConfig.getDbName(); 37 | String tableName = jscrapyConfig.getTaskFp(); 38 | 39 | this.mongo = new MongoClient(dbHost, dbPort); 40 | this.db = mongo.getDB(dbName); 41 | this.collection = db.getCollection(tableName); 42 | } 43 | 44 | @Override 45 | protected boolean isDup(Request request) { 46 | String id = request.uniqId(); 47 | BasicDBObject query = new BasicDBObject(); 48 | query.append(DB_PRIMARY_KEY, id); 49 | BasicDBObject obj = (BasicDBObject) collection.findOne(query); 50 | if (obj == null) { 51 | DBObject pageDoc = new BasicDBObject(); 52 | pageDoc.put(DB_PRIMARY_KEY, id); 53 | pageDoc.put(DB_CACHE_FIELD_NAME, 1); 54 | this.collection.insert(pageDoc); 55 | } 56 | return obj != null; 57 | } 58 | } 59 | 
-------------------------------------------------------------------------------- /jscrapy-ext/src/main/java/org/jscrapy/ext/scheduler/RedisScheduler.java: -------------------------------------------------------------------------------- 1 | package org.jscrapy.ext.scheduler; 2 | 3 | import com.alibaba.fastjson.JSONException; 4 | import org.jscrapy.core.JscrapyComponent; 5 | import org.jscrapy.core.config.JscrapyConfig; 6 | import org.jscrapy.core.request.HttpRequest; 7 | import org.jscrapy.core.request.Request; 8 | import org.jscrapy.ext.modulecfg.RedisSchedulerConfig; 9 | import org.slf4j.Logger; 10 | import org.slf4j.LoggerFactory; 11 | import redis.clients.jedis.Jedis; 12 | import redis.clients.jedis.JedisPool; 13 | import redis.clients.jedis.JedisPoolConfig; 14 | 15 | import java.util.ArrayList; 16 | import java.util.List; 17 | 18 | /** 19 | * Created by cxu on 2015/6/21. 20 | */ 21 | public class RedisScheduler extends JscrapyComponent { 22 | final static Logger logger = LoggerFactory.getLogger(RedisScheduler.class); 23 | private JedisPool pool; 24 | 25 | public RedisScheduler(JscrapyConfig jscrapyConfig) { 26 | setJscrapyConfig(jscrapyConfig); 27 | RedisSchedulerConfig redisDedupConfig = null;//RedisSchedulerConfig) jscrapyConfig.get(ComponentName.DEDUP_REDIS); 28 | String redisHost = redisDedupConfig.getHost(); 29 | this.pool = new JedisPool(new JedisPoolConfig(), redisHost); 30 | } 31 | 32 | 33 | public int push(List requests) { 34 | 35 | try (Jedis jedis = this.pool.getResource()) { 36 | for (Request req : requests) { 37 | jedis.rpush(this.getQueueName(), req.asJson()); 38 | } 39 | } 40 | 41 | return requests.size(); 42 | } 43 | 44 | 45 | public List poll(int n) { 46 | List req = new ArrayList(); 47 | try (Jedis jedis = this.pool.getResource()) { 48 | for (int i = 0; i < n; i++) { 49 | String reqJson = jedis.lpop(this.getQueueName()); 50 | HttpRequest request = HttpRequest.build(reqJson); 51 | req.add(request); 52 | } 53 | 54 | } catch (JSONException e) { 
55 | 56 | } 57 | return req; 58 | } 59 | 60 | 61 | 62 | /** 63 | * request队列的名字 64 | * 65 | * @return 66 | */ 67 | private String getQueueName() { 68 | return "jscrapy_queue_" + this.getJscrapyConfig().getTaskId(); 69 | } 70 | } 71 | -------------------------------------------------------------------------------- /jscrapy-core/src/main/java/org/jscrapy/core/plugin/PluginChain.java: -------------------------------------------------------------------------------- 1 | package org.jscrapy.core.plugin; 2 | 3 | import cn.hutool.core.util.ClassUtil; 4 | import org.springframework.beans.factory.annotation.Autowired; 5 | import org.springframework.context.ApplicationContext; 6 | 7 | import java.util.*; 8 | 9 | /** 10 | * Created by cxu on 2018/2/11. 11 | */ 12 | public class PluginChain { 13 | //切点前方法链 14 | protected List beforePluginChain; 15 | //切点后方法 16 | protected List afterPluginChain; 17 | 18 | @Autowired 19 | private static ApplicationContext applicationContext; 20 | 21 | protected PluginChain(String basePkgName){ 22 | initBeforePluginChain(basePkgName); 23 | initAfterPluginChain(basePkgName); 24 | } 25 | 26 | /** 27 | * 28 | * @param basePkgName 29 | */ 30 | private void initBeforePluginChain(String basePkgName) { 31 | String fullPkgName = basePkgName + ".before"; 32 | beforePluginChain = scanOrderedPlugin(fullPkgName); 33 | } 34 | 35 | /** 36 | * 37 | * @param basePkgName 38 | */ 39 | private void initAfterPluginChain(String basePkgName) { 40 | String fullPkgName = basePkgName + ".after"; 41 | afterPluginChain = scanOrderedPlugin(fullPkgName); 42 | } 43 | 44 | /** 45 | * 扫描pkgName里的全部非抽象类,并按照PluginChainOrder从小到大排列 46 | * @param pkgName 47 | * @return 48 | */ 49 | private List scanOrderedPlugin(String pkgName) { 50 | Map orderedPluginMap = new TreeMap(); 51 | Set> plugins = ClassUtil.scanPackage(pkgName); 52 | for (Class c : plugins) {//探测每一个Plugin在Chain中的顺序,方便后面排序 53 | PluginOrder pluginOrder = (PluginOrder)c.getAnnotation(PluginOrder.class); 54 | int order = 
pluginOrder.value(); 55 | String className = c.getName();//full package name 56 | Plugin plugin = (Plugin)applicationContext.getBean(className); 57 | orderedPluginMap.put(order, plugin); 58 | } 59 | 60 | /** 61 | * 对Plugin进行排序 62 | */ 63 | List resultMethodList = new ArrayList(); 64 | for (Integer order : orderedPluginMap.keySet()) { 65 | resultMethodList.add(orderedPluginMap.get(order)); 66 | } 67 | 68 | return resultMethodList; 69 | } 70 | } 71 | -------------------------------------------------------------------------------- /jscrapy-core/src/main/java/org/jscrapy/core/cacher/impl/H2Cacher.java: -------------------------------------------------------------------------------- 1 | package org.jscrapy.core.cacher.impl; 2 | 3 | import org.jscrapy.core.cacher.Cacher; 4 | import org.jscrapy.core.config.JscrapyConfig; 5 | import org.jscrapy.core.dal.PageCacheDo; 6 | import org.jscrapy.core.dal.PageCacheMapper; 7 | import org.jscrapy.core.page.Page; 8 | import org.jscrapy.core.request.HttpRequest; 9 | import org.slf4j.Logger; 10 | import org.slf4j.LoggerFactory; 11 | import org.springframework.beans.factory.annotation.Autowired; 12 | import org.springframework.context.annotation.Configuration; 13 | 14 | /** 15 | * Created by cxu on 2015/7/12. 
16 | */ 17 | @Configuration 18 | public class H2Cacher extends Cacher { 19 | final static Logger logger = LoggerFactory.getLogger(H2Cacher.class); 20 | 21 | @Autowired 22 | private PageCacheMapper pageCacheMapper; 23 | 24 | /** 25 | * 构造函数 26 | * 27 | * @param jscrapyConfig 28 | */ 29 | public H2Cacher(JscrapyConfig jscrapyConfig) { 30 | setJscrapyConfig(jscrapyConfig); 31 | initCache(); 32 | } 33 | 34 | public H2Cacher() { 35 | 36 | } 37 | 38 | public void setJscrapyConfig(JscrapyConfig jscrapyConfig) { 39 | super.setJscrapyConfig(jscrapyConfig); 40 | initCache(); 41 | } 42 | 43 | @Override 44 | public Page loadPage(HttpRequest request) { 45 | String taskName = getJscrapyConfig().getTaskName(); 46 | String pageId = request.uniqId(); 47 | PageCacheDo pageCacheDo = pageCacheMapper.find(taskName, pageId); 48 | Page page = new Page(); 49 | if (pageCacheDo != null) { 50 | page.setIsFromCache(true); 51 | page.setRawText(pageCacheDo.getPageContent()); 52 | page.setRequest(request); 53 | } 54 | 55 | return page; 56 | } 57 | 58 | @Override 59 | public void cachePage(Page page) { 60 | String tableName = getJscrapyConfig().getTaskName(); 61 | PageCacheDo pageCacheDo = new PageCacheDo(); 62 | pageCacheDo.setPageId(page.getRequest().uniqId()); 63 | pageCacheDo.setPageContent(page.getRawText()); 64 | 65 | pageCacheMapper.insert(tableName, pageCacheDo); 66 | } 67 | 68 | /** 69 | * 为任务创建缓存表 70 | */ 71 | private void initCache() { 72 | String taskName = getJscrapyConfig().getTaskName(); 73 | pageCacheMapper.createCacherTable(taskName); 74 | } 75 | } 76 | -------------------------------------------------------------------------------- /jscrapy-core/src/test/java/cacher/CacherTest.java: -------------------------------------------------------------------------------- 1 | package cacher; 2 | 3 | import org.jscrapy.core.cacher.Cacher; 4 | import org.jscrapy.core.cacher.impl.H2Cacher; 5 | import org.jscrapy.core.config.JscrapyConfig; 6 | import 
org.jscrapy.core.exception.MySpiderFetalException; 7 | import org.jscrapy.core.page.Page; 8 | import org.jscrapy.core.request.HttpRequest; 9 | import org.jscrapy.core.request.HttpRequestMethod; 10 | import org.junit.BeforeClass; 11 | import org.junit.Test; 12 | import org.junit.runner.RunWith; 13 | import org.springframework.beans.factory.annotation.Autowired; 14 | import org.springframework.test.context.ContextConfiguration; 15 | import org.springframework.test.context.junit4.SpringJUnit4ClassRunner; 16 | import util.ResourcePathUtils; 17 | 18 | import java.io.IOException; 19 | 20 | import static org.testng.Assert.assertNotNull; 21 | 22 | /** 23 | * Created by cxu on 2015/9/19. 24 | */ 25 | @RunWith(SpringJUnit4ClassRunner.class) 26 | @ContextConfiguration({"classpath:applicationContext.xml"}) 27 | public class CacherTest { 28 | @Autowired 29 | private H2Cacher h2Cacher; 30 | 31 | private static Page page; 32 | private static HttpRequest request; 33 | 34 | @BeforeClass 35 | public static void setup() { 36 | request = new HttpRequest("http://jscrapy.org/test", HttpRequestMethod.DELETE, null); 37 | page = new Page("this is html content, hahaha!"); 38 | page.setRequest(request); 39 | } 40 | 41 | @Test 42 | public void testSave() throws IOException, MySpiderFetalException { 43 | Cacher[] cachers = new Cacher[]{initH2Cacher()}; 44 | for (Cacher c : cachers) { 45 | doTest(c); 46 | } 47 | } 48 | 49 | /** 50 | * @param cacher 51 | * @throws MySpiderFetalException 52 | */ 53 | private void doTest(Cacher cacher) throws MySpiderFetalException { 54 | cacher.cachePage(page); 55 | Page pg = cacher.loadPage(request); 56 | assertNotNull(pg); 57 | } 58 | 59 | /** 60 | * @return 61 | */ 62 | private Cacher initH2Cacher() throws IOException { 63 | String path = ResourcePathUtils.getResourceFileAbsPath(CacherTest.class, "/H2CacherTest.yaml"); 64 | JscrapyConfig jscrapyConfig = null;//(JscrapyConfig) Yaml2BeanUtil.loadAsBean(JscrapyConfig.class, new File(path)); 65 | 
h2Cacher.setJscrapyConfig(jscrapyConfig); 66 | return h2Cacher; 67 | } 68 | } 69 | -------------------------------------------------------------------------------- /jscrapy-core/src/main/java/org/jscrapy/core/producer/impl/H2UrlProducer.java: -------------------------------------------------------------------------------- 1 | package org.jscrapy.core.producer.impl; 2 | 3 | import org.jscrapy.core.config.JscrapyConfig; 4 | import org.jscrapy.core.dal.UrlQueueDo; 5 | import org.jscrapy.core.dal.h2queue.H2UrlQueueDo; 6 | import org.jscrapy.core.dal.h2queue.H2UrlQueueMapper; 7 | import org.jscrapy.core.producer.UrlProducer; 8 | import org.jscrapy.core.request.RequestContext; 9 | import org.jscrapy.core.request.UrlStatus; 10 | 11 | import java.util.ArrayList; 12 | import java.util.List; 13 | 14 | /** 15 | * Created by cxu on 2017/1/19. 16 | */ 17 | 18 | public class H2UrlProducer extends UrlProducer { 19 | 20 | private H2UrlQueueMapper h2UrlQueueMapper; 21 | 22 | @Override 23 | public int push(List requests) { 24 | int inserted = 0; 25 | if (requests != null) { 26 | String tableName = getJscrapyConfig().getTaskFp(); 27 | List newUrl = getNew(requests); 28 | h2UrlQueueMapper.batchInsert(tableName, newUrl); 29 | 30 | inserted = requests.size(); 31 | }else{ 32 | inserted = 0; 33 | } 34 | 35 | return inserted; 36 | } 37 | 38 | @Override 39 | public void update(List requests) { 40 | 41 | } 42 | 43 | /** 44 | * 入队列的时候有的url是因为错误原因再次被放入到队列里, 45 | * 这个函数只要找出来新产生的 46 | * @param requests 47 | * @return 48 | */ 49 | private List getNew(List requests) { 50 | List newReq = new ArrayList<>(); 51 | for (RequestContext requestContext : requests) { 52 | UrlQueueDo urlQueueDo = new H2UrlQueueDo(); 53 | Long id = requestContext.getId(); 54 | if (id == null) { 55 | urlQueueDo.setUrl(requestContext.getFullUrl()); 56 | urlQueueDo.setUrlStatus(UrlStatus.NEW); 57 | urlQueueDo.setRetryTimes(0); 58 | urlQueueDo.setUrlType(requestContext.getUrlType()); 59 | 
urlQueueDo.setSiteId(requestContext.getSiteId()); 60 | newReq.add(urlQueueDo); 61 | } 62 | } 63 | 64 | return newReq; 65 | } 66 | 67 | @Override 68 | public void setJscrapyConfig(JscrapyConfig jscrapyConfig) { 69 | super.setJscrapyConfig(jscrapyConfig); 70 | h2UrlQueueMapper.createNewQueue(jscrapyConfig.getTaskFp()); 71 | } 72 | } 73 | -------------------------------------------------------------------------------- /jscrapy-core/src/main/resources/mapper/h2pagecache.xml: -------------------------------------------------------------------------------- 1 | 2 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | CREATE TABLE IF NOT EXISTS ${table_name} 19 | ( 20 | id BIGSERIAL PRIMARY KEY, -- 唯一键 21 | page_id VARCHAR(64) , -- request.fp() 22 | gmt_created TIMESTAMP, -- 插入时间 23 | gmt_access TIMESTAMP, -- 被访问时间 24 | etag VARCHAR(64), -- etag 25 | page_content TEXT, -- html 26 | ); 27 | 28 | 29 | 30 | INSERT INTO ${table_name} (page_id, gmt_created, gmt_access, etag, page_content) 31 | VALUES 32 | 33 | ( 34 | #{page.pageId, jdbcType=VARCHAR}, 35 | NOW(), 36 | NOW(), 37 | #{page.etag,jdbcType=VARCHAR}, 38 | #{page.pageContent,jdbcType=VARCHAR}, 39 | ) 40 | 41 | 42 | 43 | 44 | INSERT INTO ${table_name} (page_id, gmt_created, gmt_access, etag, page_content) 45 | VALUES 46 | ( 47 | #{page.pageId, jdbcType=VARCHAR}, 48 | NOW(), 49 | NOW(), 50 | #{page.etag,jdbcType=VARCHAR}, 51 | #{page.pageContent,jdbcType=VARCHAR}, 52 | ) 53 | 54 | 55 | 58 | 59 | 60 | -------------------------------------------------------------------------------- /jscrapy-ext/src/test/java/pipline/LocalFilePiplineTest.java: -------------------------------------------------------------------------------- 1 | package pipline; 2 | 3 | import org.apache.commons.io.FileUtils; 4 | import org.jscrapy.core.config.JscrapyConfig; 5 | import org.jscrapy.core.data.DataItem; 6 | import org.jscrapy.core.exception.MySpiderFetalException; 7 | import org.jscrapy.core.exception.MySpiderRecoverableException; 8 | 
import org.jscrapy.core.pipline.Pipline; 9 | import org.jscrapy.core.util.Yaml2BeanUtil; 10 | import org.jscrapy.ext.pipline.LocalFilePipline; 11 | import org.springframework.core.io.ClassPathResource; 12 | import org.springframework.core.io.Resource; 13 | import org.testng.annotations.AfterClass; 14 | import org.testng.annotations.BeforeClass; 15 | import org.testng.annotations.Test; 16 | import util.ResourcePathUtils; 17 | 18 | import java.io.File; 19 | import java.io.IOException; 20 | import java.util.ArrayList; 21 | import java.util.List; 22 | 23 | /** 24 | * Created by cxu on 2015/6/21. 25 | */ 26 | public class LocalFilePiplineTest { 27 | private JscrapyConfig jscrapyConfig; 28 | 29 | @BeforeClass 30 | public void setup() throws IOException, MySpiderFetalException { 31 | String path = ResourcePathUtils.getResourceFileAbsPath(LocalFilePiplineTest.class, "/LocalFilePiplineTest.yaml"); 32 | Resource resource = new ClassPathResource("LocalFilePiplineTest.yaml"); 33 | jscrapyConfig = (JscrapyConfig) Yaml2BeanUtil.loadAsBean(JscrapyConfig.class, resource); 34 | } 35 | 36 | @AfterClass 37 | public void tearDown() throws IOException { 38 | /*删除文件*/ 39 | String tempDir = jscrapyConfig.getTaskWorkDir(); 40 | FileUtils.forceDeleteOnExit(new File(tempDir)); 41 | } 42 | 43 | @Test 44 | public void testSingleThread() throws IOException, InterruptedException, MySpiderRecoverableException, MySpiderFetalException { 45 | Pipline pipline = new LocalFilePipline(this.jscrapyConfig); 46 | DataItem dt = new DataItem(); 47 | dt.put("a", "123") 48 | .put("b", "456"); 49 | List dataItems = new ArrayList<>(); 50 | dataItems.add(dt); 51 | 52 | for(int i=0; i<100; i++){ 53 | pipline.save(dataItems); 54 | } 55 | 56 | Thread.sleep(1000); 57 | 58 | // try { 59 | // String dataSavePath = JscrapyConfig.loadString(ConfigKeys.RT_EXT_RT_LOCAL_FILE_PIPLINE_DATA_FILE); 60 | // List lines = FileUtils.readLines(new File(dataSavePath)); 61 | // assertEquals(100, lines.size()); 62 | // }finally { 63 | 
// pipline.close(); 64 | // } 65 | } 66 | } 67 | -------------------------------------------------------------------------------- /jscrapy-core/src/test/java/dal/QueueTest.java: -------------------------------------------------------------------------------- 1 | package dal; 2 | 3 | import org.jscrapy.core.dal.UrlQueueDo; 4 | import org.jscrapy.core.dal.UrlQueueMapper; 5 | import org.jscrapy.core.dal.h2queue.H2UrlQueueDo; 6 | import org.jscrapy.core.request.UrlStatus; 7 | import org.junit.After; 8 | import org.junit.Before; 9 | import org.junit.Test; 10 | 11 | import java.util.ArrayList; 12 | import java.util.List; 13 | 14 | import static org.junit.Assert.assertNotEquals; 15 | import static org.junit.Assert.assertTrue; 16 | import static org.testng.AssertJUnit.assertEquals; 17 | 18 | /** 19 | * Created by cxu on 2016/8/10. 20 | */ 21 | public abstract class QueueTest { 22 | 23 | private final static String QUEUE_NAME = "jscrapy_queue_name"; 24 | private final static int QUEUE_SIZE = 5; 25 | 26 | /** 27 | * 建表、插入5条数据 28 | */ 29 | @Before 30 | public void setup() { 31 | getQueueMapper().dropQueue(QUEUE_NAME); 32 | getQueueMapper().createNewQueue(QUEUE_NAME); 33 | 34 | List urls = new ArrayList<>(); 35 | for (long i = 0; i < QUEUE_SIZE; i++) { 36 | H2UrlQueueDo dt = new H2UrlQueueDo(); 37 | dt.setUrl("http://url" + i + ".com"); 38 | dt.setRetryTimes(1); 39 | dt.setUrlStatus(UrlStatus.NEW); 40 | urls.add(dt); 41 | } 42 | getQueueMapper().batchInsert(QUEUE_NAME, urls); 43 | } 44 | 45 | @Test 46 | public void testUpdate() { 47 | int itemSelected = 1; 48 | List item = getQueueMapper().selectUrlByStatus(QUEUE_NAME, UrlStatus.NEW, itemSelected); 49 | assertEquals(itemSelected, item.size()); 50 | 51 | String urlNewValue = "com.jscrapy.www"; 52 | item.get(0).setUrl(urlNewValue); 53 | getQueueMapper().batchUpdate(QUEUE_NAME, item); 54 | 55 | boolean result = false; 56 | item = getQueueMapper().selectUrlByStatus(QUEUE_NAME, UrlStatus.NEW, Integer.MAX_VALUE); 57 | for 
(UrlQueueDo itm : item) { 58 | if (urlNewValue.equalsIgnoreCase(itm.getUrl())) { 59 | result = true; 60 | break; 61 | } 62 | } 63 | 64 | assertTrue(result); 65 | } 66 | 67 | @Test 68 | public void testSelect() { 69 | List all = getQueueMapper().selectUrlByStatus(QUEUE_NAME, UrlStatus.NEW, Integer.MAX_VALUE); 70 | assertNotEquals(0, all.size()); 71 | } 72 | 73 | @After 74 | public void tearDown() { 75 | 76 | List all = getQueueMapper().selectUrlByStatus(QUEUE_NAME, UrlStatus.NEW, Integer.MAX_VALUE); 77 | assertEquals(QUEUE_SIZE, all.size()); 78 | 79 | getQueueMapper().dropQueue(QUEUE_NAME); 80 | } 81 | 82 | protected abstract UrlQueueMapper getQueueMapper(); 83 | } 84 | -------------------------------------------------------------------------------- /jscrapy-ext/src/main/java/org/jscrapy/ext/pipline/LocalFilePipline.java: -------------------------------------------------------------------------------- 1 | package org.jscrapy.ext.pipline; 2 | 3 | import com.alibaba.fastjson.JSON; 4 | import org.apache.commons.io.FileUtils; 5 | import org.apache.commons.io.FilenameUtils; 6 | import org.jscrapy.core.config.JscrapyConfig; 7 | import org.jscrapy.core.config.SysDefaultConfig; 8 | import org.jscrapy.core.data.DataItem; 9 | import org.jscrapy.core.exp.ExceptionCode; 10 | import org.jscrapy.core.exception.MySpiderFetalException; 11 | import org.jscrapy.core.pipline.Pipline; 12 | import org.slf4j.Logger; 13 | import org.slf4j.LoggerFactory; 14 | 15 | import java.io.File; 16 | import java.io.IOException; 17 | import java.nio.charset.StandardCharsets; 18 | import java.util.List; 19 | 20 | /** 21 | * Created by cxu on 2015/6/21. 
22 | */ 23 | public class LocalFilePipline extends Pipline { 24 | private final static Logger logger = LoggerFactory.getLogger(LocalFilePipline.class); 25 | private String dataFilePath;//物理的数据文件位置path+file 26 | 27 | /** 28 | * @param jscrapyConfig 29 | * @throws IOException 30 | */ 31 | public LocalFilePipline(JscrapyConfig jscrapyConfig) throws MySpiderFetalException { 32 | 33 | super(jscrapyConfig); 34 | String taskWorkDir = jscrapyConfig.getTaskWorkDir(); 35 | 36 | this.dataFilePath = taskWorkDir + "pipline" + SysDefaultConfig.FILE_PATH_SEPERATOR + jscrapyConfig.getTaskName() + ".json";//完整的目录+文件名字。解析之后的数据保存的位置 37 | String baseDir = FilenameUtils.getFullPath(dataFilePath); 38 | try { 39 | FileUtils.forceMkdir(new File(baseDir)); 40 | } catch (IOException e) { 41 | 42 | MySpiderFetalException exp = new MySpiderFetalException(ExceptionCode.LOCAL_PIPLINE_MK_DIR_ERROR); 43 | exp.setErrorMessage(e.getLocalizedMessage()); 44 | throw exp; 45 | } 46 | } 47 | 48 | @Override 49 | public void save(List dataItems) throws MySpiderFetalException { 50 | if (dataItems != null && dataItems.size()>0) { 51 | for (DataItem dataItem : dataItems) { 52 | try { 53 | File dataFile = new File(dataFilePath); 54 | String data = JSON.toJSONString(dataItem.getDataItem()); 55 | synchronized (super.getJscrapyConfig()) {//任务级别的锁,只锁住同一个任务的多个线程 56 | FileUtils.writeStringToFile(dataFile, data + "\n", StandardCharsets.UTF_8.name(), true); 57 | } 58 | } catch (IOException e) { 59 | 60 | MySpiderFetalException exp = new MySpiderFetalException(ExceptionCode.LOCAL_PIPLINE_WRITE_FILE_ERROR); 61 | exp.setErrorMessage(e.getLocalizedMessage()); 62 | throw exp; 63 | } 64 | } 65 | } 66 | } 67 | } 68 | -------------------------------------------------------------------------------- /jscrapy-core/src/main/resources/mapper/queue_lock.xml: -------------------------------------------------------------------------------- 1 | 2 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | delete from 
jscrapy_queue_lock 20 | where id = #{id,jdbcType=BIGINT} 21 | 22 | 23 | 24 | insert into jscrapy_queue_lock (id, gmt_create, gmt_modified, 25 | task_id, description 26 | ) 27 | values (#{id,jdbcType=BIGINT}, #{gmtCreate,jdbcType=TIMESTAMP}, #{gmtModified,jdbcType=TIMESTAMP}, 28 | #{taskId,jdbcType=VARCHAR}, #{description,jdbcType=VARCHAR} 29 | ) 30 | 31 | 32 | 33 | update jscrapy_queue_lock 34 | set gmt_create = #{gmtCreate,jdbcType=TIMESTAMP}, 35 | gmt_modified = #{gmtModified,jdbcType=TIMESTAMP}, 36 | task_id = #{taskId,jdbcType=VARCHAR}, 37 | description = #{description,jdbcType=VARCHAR} 38 | where id = #{id,jdbcType=BIGINT} 39 | 40 | 41 | 46 | 47 | 51 | 52 | 56 | 57 | 58 | delete from jscrapy_queue_lock 59 | where task_id = #{taskId,jdbcType=VARCHAR} 60 | 61 | 62 | 63 | -------------------------------------------------------------------------------- /jscrapy-service/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 4 | 4.0.0 5 | 6 | org.jscrapy 7 | jscrapy 8 | 1.0-SNAPSHOT 9 | 10 | org.jscrapy 11 | jscrapy-service 12 | 1.0-SNAPSHOT 13 | war 14 | jscrapy-service 15 | 16 | 17 | org.jscrapy 18 | jscrapy-core 19 | ${project.version} 20 | 21 | 22 | org.yaml 23 | snakeyaml 24 | ${snakyaml.version} 25 | 26 | 27 | org.springframework.boot 28 | spring-boot-starter-web 29 | 1.5.10.RELEASE 30 | 31 | 32 | 33 | 34 | 35 | org.apache.maven.plugins 36 | maven-war-plugin 37 | ${maven-war-plugin.version} 38 | 39 | false 40 | 41 | 42 | 43 | org.springframework.boot 44 | spring-boot-maven-plugin 45 | ${spring-boot-plugin.version} 46 | 47 | 48 | 1 49 | 50 | jscrapy-service-${project.version} 51 | exec 52 | org.jscrapy.service.Application 53 | WAR 54 | 55 | 56 | repackage 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | -------------------------------------------------------------------------------- /jscrapy-core/src/main/java/org/jscrapy/core/util/ClassLoadUtil.java: 
-------------------------------------------------------------------------------- 1 | package org.jscrapy.core.util; 2 | 3 | import org.jscrapy.core.cacher.Cacher; 4 | import org.jscrapy.core.config.JscrapyConfig; 5 | import org.jscrapy.core.dedup.DeDup; 6 | import org.jscrapy.core.downloader.Downloader; 7 | import org.jscrapy.core.pipline.Pipline; 8 | import org.jscrapy.core.processor.Processor; 9 | import org.slf4j.Logger; 10 | import org.slf4j.LoggerFactory; 11 | import org.slf4j.helpers.MessageFormatter; 12 | 13 | import java.lang.reflect.Constructor; 14 | 15 | /** 16 | * Created by cxu on 2015/10/2. 17 | */ 18 | public class ClassLoadUtil { 19 | final static Logger logger = LoggerFactory.getLogger(ClassLoadUtil.class); 20 | 21 | 22 | public static DeDup loadDedup(String className, JscrapyConfig arg) { 23 | Object o = loadClass(className, arg); 24 | if (o != null) { 25 | return (DeDup) o; 26 | } else { 27 | return null; 28 | } 29 | } 30 | 31 | public static Downloader loadDownloader(String className, JscrapyConfig arg) { 32 | Object o = loadClass(className, arg); 33 | if (o != null) { 34 | return (Downloader) o; 35 | } else { 36 | return null; 37 | } 38 | } 39 | 40 | public static Processor loadProcessor(String className, JscrapyConfig arg) { 41 | Object o = loadClass(className, arg); 42 | if (o != null) { 43 | return (Processor) o; 44 | } else { 45 | return null; 46 | } 47 | } 48 | 49 | public static Cacher loadCacher(String className, JscrapyConfig arg) { 50 | Object o = loadClass(className, arg); 51 | if (o != null) { 52 | return (Cacher) o; 53 | } else { 54 | return null; 55 | } 56 | } 57 | 58 | public static Pipline loadPipline(String className, JscrapyConfig arg) { 59 | Object o = loadClass(className, arg); 60 | if (o != null) { 61 | return (Pipline) o; 62 | } else { 63 | return null; 64 | } 65 | } 66 | 67 | private static Object loadClass(String className, JscrapyConfig arg) { 68 | Object o = null; 69 | try { 70 | Class c = Class.forName(className); 71 | 
Constructor constructor = c.getConstructor(new Class[]{JscrapyConfig.class}); 72 | o = constructor.newInstance(arg); 73 | } catch (Exception e) { 74 | logger.error("构造{}时出错{}", className, e); 75 | String errorMessage = MessageFormatter.format("构造对象{}时出错", className).getMessage(); 76 | // MySpiderFetalException exp = new MySpiderFetalException(ExceptionCode.CLASS_LOAD_ERROR); 77 | // exp.setErrorMessage(errorMessage); 78 | // throw exp; 79 | } 80 | 81 | return o; 82 | } 83 | 84 | } 85 | -------------------------------------------------------------------------------- /jscrapy-core/src/test/java/urlproducer/UrlProducerTest.java: -------------------------------------------------------------------------------- 1 | package urlproducer; 2 | 3 | import dedup.DeDupTest; 4 | import org.jscrapy.core.config.JscrapyConfig; 5 | import org.jscrapy.core.dal.h2queue.H2UrlQueueDo; 6 | import org.jscrapy.core.producer.UrlProducer; 7 | import org.jscrapy.core.request.HttpRequest; 8 | import org.jscrapy.core.request.RequestContext; 9 | import org.jscrapy.core.util.Yaml2BeanUtil; 10 | import org.junit.Test; 11 | import org.junit.runner.RunWith; 12 | import org.springframework.beans.factory.annotation.Autowired; 13 | import org.springframework.beans.factory.annotation.Qualifier; 14 | import org.springframework.test.context.TestPropertySource; 15 | import org.springframework.test.context.junit4.SpringJUnit4ClassRunner; 16 | import util.ResourcePathUtils; 17 | 18 | import java.io.File; 19 | import java.io.FileNotFoundException; 20 | import java.util.ArrayList; 21 | import java.util.List; 22 | 23 | import static org.testng.Assert.*; 24 | 25 | /** 26 | * Created by cxu on 2017/1/21. 
27 | */ 28 | @RunWith(SpringJUnit4ClassRunner.class) 29 | @TestPropertySource("classpath:db.properties") 30 | public class UrlProducerTest { 31 | @Autowired 32 | @Qualifier("h2UrlProducer") 33 | private UrlProducer urlProducer; 34 | 35 | @Test 36 | public void test() { 37 | JscrapyConfig jscrapyConfig = getConfig(); 38 | 39 | UrlProducer[] producers = dataProvider(); 40 | for (UrlProducer urlProducer : producers) { 41 | //插入1个,看个数增加1个 42 | assertNotNull(urlProducer); 43 | urlProducer.setJscrapyConfig(jscrapyConfig); 44 | List requestContexts = rendUrl(); 45 | int insertCount = urlProducer.push(requestContexts); 46 | 47 | assertEquals(1, insertCount); 48 | } 49 | } 50 | 51 | private UrlProducer[] dataProvider() { 52 | return new UrlProducer[]{ 53 | urlProducer, 54 | }; 55 | } 56 | 57 | /** 58 | * 生成模拟数据 59 | * @return 60 | */ 61 | private List rendUrl() { 62 | List requestContexts = new ArrayList<>(); 63 | RequestContext req = new RequestContext(new HttpRequest("http://jscrapy.org"), new H2UrlQueueDo()); 64 | req.setRetryTimes(1); 65 | requestContexts.add(req); 66 | return requestContexts; 67 | } 68 | 69 | private JscrapyConfig getConfig() { 70 | String path = ResourcePathUtils.getResourceFileAbsPath(DeDupTest.class, "/H2UrlConsumerTest.yaml"); 71 | JscrapyConfig jscrapyConfig = null; 72 | try { 73 | jscrapyConfig = (JscrapyConfig) Yaml2BeanUtil.loadAsBean(JscrapyConfig.class, new File(path)); 74 | } catch (FileNotFoundException e) { 75 | e.printStackTrace(); 76 | fail(""); 77 | } 78 | return jscrapyConfig; 79 | } 80 | 81 | } 82 | -------------------------------------------------------------------------------- /jscrapy-core/src/main/resources/mapper/pgqueue.xml: -------------------------------------------------------------------------------- 1 | 2 | 5 | 6 | 7 | 8 | 9 | CREATE TABLE IF NOT EXISTS ${queue_name} 10 | ( 11 | id BIGSERIAL PRIMARY KEY, -- 唯一键 12 | sched_id INT, -- 调度的唯一性标识,每次都不一样即可,可以取当前任务启动时间戳 13 | url TEXT, -- url 14 | url_status CHAR(5), -- 
种子状态[NEW:新进入, OUT:出队列, ERR:出错] 15 | retry_times SMALLINT, -- 重试次数 16 | url_type CHAR(10), -- 种子类型[SEED:种子(Seed), LIST:列表(List), DETAIL: 详情页(Detail)] 17 | site_id VARCHAR(64), -- 站点ID 18 | gmt_created TIMESTAMP, -- url插入时间 19 | gmt_access TIMESTAMP, -- 被访问时间 20 | error_code CHAR(20), -- 错误编码 21 | error_msg TEXT -- 错误详细信息 22 | ); 23 | 24 | 25 | 26 | DROP TABLE IF EXISTS ${queue_name} 27 | 28 | 29 | 30 | INSERT INTO ${queue_name} (url, url_status, retry_times, url_type, site_id, gmt_created, gmt_access, error_code, error_msg) 31 | VALUES 32 | 33 | ( 34 | #{url.url,jdbcType=VARCHAR}, 35 | #{url.urlStatus,jdbcType=VARCHAR}, 36 | #{url.retryTimes,jdbcType=BIGINT}, 37 | #{url.urlType,jdbcType=VARCHAR}, 38 | #{url.siteId,jdbcType=VARCHAR}, 39 | NOW(), 40 | NOW(), 41 | #{url.errorCode,jdbcType=VARCHAR}, 42 | #{url.errorMsg,jdbcType=VARCHAR} 43 | ) 44 | 45 | 46 | 47 | 50 | 51 | 52 | INSERT INTO ${queue_name} (id, url, gmt_access) 53 | VALUES 54 | 55 | ( 56 | #{u.id, jdbcType=BIGINT}, 57 | #{u.url, jdbcType=VARCHAR}, 58 | NOW() 59 | ) 60 | 61 | ON CONFLICT(id) DO UPDATE SET url=EXCLUDED.url 62 | 63 | 64 | 65 | DELETE FROM ${queue_name} 66 | WHERE id IN 67 | 68 | #{urlQueueDo.id,jdbcType=BIGINT} 69 | 70 | 71 | 72 | 73 | -------------------------------------------------------------------------------- /jscrapy-ext/src/test/java/dedup/DeDupExtTest.java: -------------------------------------------------------------------------------- 1 | package dedup; 2 | 3 | import org.jscrapy.ext.dedup.RedisDedup; 4 | import org.jscrapy.core.config.JscrapyConfig; 5 | import org.jscrapy.core.dedup.DeDup; 6 | import org.jscrapy.core.exception.MySpiderFetalException; 7 | import org.jscrapy.core.exception.MySpiderRecoverableException; 8 | import org.jscrapy.core.request.HttpRequestMethod; 9 | import org.jscrapy.core.request.HttpRequest; 10 | import org.jscrapy.core.util.Yaml2BeanUtil; 11 | import org.testng.annotations.DataProvider; 12 | import org.testng.annotations.Test; 13 | import 
util.ResourcePathUtils; 14 | 15 | import java.io.File; 16 | import java.io.IOException; 17 | import java.util.ArrayList; 18 | import java.util.List; 19 | 20 | import static org.testng.Assert.assertEquals; 21 | 22 | /** 23 | * Created by cxu on 2015/12/19. 24 | */ 25 | public class DeDupExtTest { 26 | private HttpRequest rq = new HttpRequest("http://url1", HttpRequestMethod.DELETE, null); 27 | 28 | @DataProvider(name = "dp") 29 | public DeDup[][] dataProvider() throws IOException, MySpiderFetalException { 30 | return new DeDup[][]{ 31 | {initRedisDedup()}, 32 | }; 33 | } 34 | 35 | @Test(dataProvider = "dp") 36 | public void test(DeDup dedup) throws MySpiderRecoverableException { 37 | 38 | List req1 = new ArrayList<>(); 39 | req1.add(rq); 40 | req1 = dedup.deDup(req1); 41 | assertEquals(1, req1.size()); 42 | 43 | //先测试写入原来一样的,返回非空 44 | HttpRequest rq1 = rq; 45 | HttpRequest rq2 = new HttpRequest("http://url2", HttpRequestMethod.DELETE, null); 46 | HttpRequest rq3 = new HttpRequest("http://url3", HttpRequestMethod.DELETE, null); 47 | 48 | List req = new ArrayList<>(); 49 | req.add(rq1); 50 | req.add(rq2); 51 | req.add(rq3); 52 | 53 | req = dedup.deDup(req); 54 | assertEquals(2, req.size()); 55 | assertEquals(0, dedup.deDup(req).size()); 56 | 57 | teardown(dedup); 58 | } 59 | 60 | /** 61 | * @return 62 | */ 63 | private DeDup initRedisDedup() throws IOException, MySpiderFetalException { 64 | String path = ResourcePathUtils.getResourceFileAbsPath(DeDupExtTest.class, "/RedisDedupTest.yaml"); 65 | JscrapyConfig jscrapyConfig = (JscrapyConfig) Yaml2BeanUtil.loadAsBean(JscrapyConfig.class, new File(path)); 66 | DeDup dp = new RedisDedup(); 67 | dp.setJscrapyConfig(jscrapyConfig); 68 | return dp; 69 | } 70 | 71 | /** 72 | * 73 | */ 74 | public void teardown(DeDup dedup) { 75 | // String dedepSetName = "jscrapy_dedup_set_" + dedup.getJscrapyConfig().getTaskFp(); 76 | // 77 | // RedisDedupConfig dedupConfig = 
(RedisDedupConfig)dedup.getJscrapyConfig().get(ComponentName.DEDUP_REDIS); 78 | // String redisHost = dedupConfig.getHost(); 79 | // JedisPool pool = new JedisPool(new JedisPoolConfig(), redisHost); 80 | // Jedis jedis = pool.getResource(); 81 | // jedis.del(dedepSetName); 82 | } 83 | } 84 | -------------------------------------------------------------------------------- /jscrapy-ext/src/main/java/org/jscrapy/ext/fetcher/HttpCharsetDetector.java: -------------------------------------------------------------------------------- 1 | package org.jscrapy.ext.fetcher; 2 | 3 | import org.apache.commons.lang3.StringUtils; 4 | import org.apache.http.HttpResponse; 5 | import org.jsoup.Jsoup; 6 | import org.jsoup.nodes.Document; 7 | import org.jsoup.nodes.Element; 8 | import org.jsoup.select.Elements; 9 | import org.slf4j.Logger; 10 | import org.slf4j.LoggerFactory; 11 | import us.codecraft.webmagic.utils.UrlUtils;//TODO 去掉webmagic的依赖 12 | 13 | import java.io.IOException; 14 | import java.io.UnsupportedEncodingException; 15 | import java.nio.charset.Charset; 16 | 17 | /** 18 | * 探测网页返回的编码 19 | */ 20 | public class HttpCharsetDetector { 21 | 22 | final static Logger logger = LoggerFactory.getLogger(HttpCharsetDetector.class); 23 | 24 | public static String detectEncode(HttpResponse httpResponse, byte[] contentBytes) 25 | throws IOException { 26 | String contentType = httpResponse.getEntity().getContentType().getValue(); 27 | String charset = detectEncode(contentType, contentBytes); 28 | if (StringUtils.isBlank(charset)) {//TODO webmagic的bug?会返回null, https://baidu.com的时候 29 | charset = Charset.defaultCharset().name(); 30 | } 31 | 32 | return charset; 33 | } 34 | 35 | private static String detectEncode(String contentTypeValue, byte[] contentBytes) throws UnsupportedEncodingException { 36 | String charset; 37 | // charset 38 | // 1、encoding in http header Content-Type 39 | charset = UrlUtils.getCharset(contentTypeValue); 40 | if 
(org.apache.commons.lang.StringUtils.isNotBlank(charset)) { 41 | logger.debug("Auto get charset: {}", charset); 42 | return charset; 43 | } 44 | // use default charset to decode first time 45 | Charset defaultCharset = Charset.defaultCharset(); 46 | String content = new String(contentBytes, defaultCharset.name()); 47 | // 2、charset in meta 48 | if (org.apache.commons.lang.StringUtils.isNotEmpty(content)) { 49 | Document document = Jsoup.parse(content); 50 | Elements links = document.select("meta"); 51 | for (Element link : links) { 52 | // 2.1、html4.01 53 | String metaContent = link.attr("content"); 54 | String metaCharset = link.attr("charset"); 55 | if (metaContent.indexOf("charset") != -1) { 56 | metaContent = metaContent.substring(metaContent.indexOf("charset"), metaContent.length()); 57 | charset = metaContent.split("=")[1]; 58 | break; 59 | } 60 | // 2.2、html5 61 | else if (org.apache.commons.lang.StringUtils.isNotEmpty(metaCharset)) { 62 | charset = metaCharset; 63 | break; 64 | } 65 | } 66 | } 67 | logger.debug("Auto get charset: {}", charset); 68 | // 3、todo use tools as cpdetector for content decode 69 | return charset; 70 | } 71 | } 72 | 73 | 74 | -------------------------------------------------------------------------------- /jscrapy-core/src/main/java/org/jscrapy/core/task/Task.java: -------------------------------------------------------------------------------- 1 | package org.jscrapy.core.task; 2 | 3 | import org.jscrapy.core.JscrapyComponent; 4 | import org.jscrapy.core.config.JscrapyConfig; 5 | import org.slf4j.Logger; 6 | import org.slf4j.LoggerFactory; 7 | 8 | import java.util.concurrent.ExecutorService; 9 | import java.util.concurrent.Executors; 10 | 11 | /** 12 | * 对线程池的抽象 13 | * Created by cxu on 2015/6/20. 
14 | */ 15 | public class Task extends JscrapyComponent { 16 | final static Logger logger = LoggerFactory.getLogger(Task.class); 17 | private ExecutorService threads; 18 | 19 | public Task(JscrapyConfig jscrapyConfig) { 20 | setJscrapyConfig(jscrapyConfig); 21 | int threadCount = jscrapyConfig.getThreadCount(); 22 | threads = Executors.newFixedThreadPool(threadCount); 23 | } 24 | 25 | public void cancel() { 26 | 27 | } 28 | 29 | public void run() { 30 | // JscrapyConfig cfg = getJscrapyConfig(); 31 | // cfg.setTaskStatus(TaskStatus.Status.RUN); 32 | // 33 | // String schedulerClass = cfg.getSchedulerClassName(); 34 | // String dedupClass = cfg.getDedupClassName(); 35 | // String downloaderClass = cfg.getDownloaderClassName(); 36 | // 37 | // String processorClass = cfg.getProcessorClassName(); 38 | // String cacherClass = cfg.getCacherClassName(); 39 | // String[] piplineClass = cfg.getPiplineClassName().split(","); 40 | // 41 | // Scheduler scheduler = ClassLoadUtil.loadScheduler(schedulerClass, cfg); 42 | // cfg.setSchedulerObject(scheduler); 43 | // 44 | // String logPath = cfg.getTaskLogDir(); 45 | // org.apache.log4j.Logger logger = MyLoggerFactory.getModuleLogger(cfg.getTaskFp(), logPath); 46 | // cfg.setTaskLogger(logger); 47 | // int threadCount = getJscrapyConfig().getThreadCount(); 48 | // for (int i = 0; i < threadCount; i++) { 49 | // Spider sp = new Spider(getJscrapyConfig()); 50 | // DeDup dedup = ClassLoadUtil.loadDedup(dedupClass, cfg); 51 | // Downloader downloader = ClassLoadUtil.loadDownloader(downloaderClass, cfg); 52 | // Processor processor = ClassLoadUtil.loadProcessor(processorClass, cfg); 53 | // Cacher cacher = ClassLoadUtil.loadCacher(cacherClass, cfg); 54 | // sp.setDedup(dedup); 55 | // sp.setDownloader(downloader); 56 | // sp.setProcessor(processor); 57 | // sp.setCacher(cacher); 58 | // for (String p : piplineClass) { 59 | // Pipline pipline = ClassLoadUtil.loadPipline(p, cfg); 60 | // sp.addPipline(pipline); 61 | // } 62 | // 63 | // 
threads.submit(sp); 64 | // } 65 | } 66 | 67 | public void pause() { 68 | // getJscrapyConfig().setTaskStatus(TaskStatus.Status.PAUSE); 69 | // threads.shutdown(); 70 | } 71 | 72 | @Override 73 | public String toString() { 74 | JscrapyConfig cfg = getJscrapyConfig(); 75 | String taskName = cfg.getTaskName(); 76 | String taskFp = cfg.getTaskFp(); 77 | return "Task{" + 78 | "tasiName=" + taskName + 79 | "taskFp=" + taskFp + 80 | '}'; 81 | } 82 | } 83 | -------------------------------------------------------------------------------- /jscrapy-common/src/main/java/org/jscrapy/common/js/JsExecuteUtil.java: -------------------------------------------------------------------------------- 1 | package org.jscrapy.common.js; 2 | 3 | import org.apache.commons.lang3.StringUtils; 4 | 5 | import javax.script.Invocable; 6 | import javax.script.ScriptEngine; 7 | import javax.script.ScriptEngineManager; 8 | import javax.script.ScriptException; 9 | 10 | /** 11 | * Created by cxu on 2016/2/23. 12 | */ 13 | public class JsExecuteUtil { 14 | 15 | private static final String JS_ENGINE_NAME = "javascript"; 16 | 17 | private ScriptEngine jsEngine; 18 | 19 | public JsExecuteUtil() { 20 | jsEngine = new ScriptEngineManager().getEngineByName(JS_ENGINE_NAME); 21 | } 22 | 23 | /** 24 | * 执行脚本 25 | * 26 | * @param script 27 | * @throws ScriptException 28 | * @throws NoSuchMethodException 29 | */ 30 | public void executeScript(String script) throws ScriptException, NoSuchMethodException{ 31 | this.doExecute(script, null); 32 | } 33 | 34 | /** 35 | * 执行脚本方法 36 | * 37 | * @param script 脚本内容 38 | * @param funName 方法名称 39 | * @param args 参数列表 40 | * @return 脚本返回对象 41 | * @throws ScriptException 42 | * @throws NoSuchMethodException 43 | */ 44 | public JsExecuteResult executeFunction(String script, String funName, Object... 
args) throws ScriptException, NoSuchMethodException { 45 | JsExecuteResult result = this.doExecute(script, funName, args); 46 | return result; 47 | } 48 | 49 | /** 50 | * 获取参数 51 | * @param arg 52 | * @return 53 | */ 54 | public JsExecuteResult getParameter(String arg){ 55 | JsExecuteResult result = new JsExecuteResult(); 56 | if (StringUtils.isBlank(arg) && null == jsEngine) { 57 | return result; 58 | } 59 | result.setResult(String.valueOf(jsEngine.get(arg))); 60 | result.setIsSuccess(Boolean.TRUE); 61 | return result; 62 | } 63 | 64 | /** 65 | * 执行脚本 66 | * @param script 67 | * @param funName 68 | * @param args 69 | * @return 70 | * @throws ScriptException 71 | * @throws NoSuchMethodException 72 | */ 73 | private JsExecuteResult doExecute(String script, String funName, Object... args) throws ScriptException, NoSuchMethodException{ 74 | JsExecuteResult result = new JsExecuteResult(); 75 | 76 | if(null == jsEngine){ 77 | result.setMessage("您的JDK不支持javascript引擎"); 78 | return result; 79 | } 80 | if(StringUtils.isBlank(script)){ 81 | result.setMessage("javascript source code is empty"); 82 | return result; 83 | } 84 | 85 | if (StringUtils.isBlank(funName) && (null == args || args.length == 0)) {//只有script 86 | result.setResult(jsEngine.eval(script)); 87 | result.setIsSuccess(Boolean.TRUE); 88 | } 89 | 90 | if (StringUtils.isNotBlank(funName)) { 91 | jsEngine.eval(script); 92 | Invocable inv = (Invocable) jsEngine; 93 | result.setResult(inv.invokeFunction(funName, args)); 94 | result.setIsSuccess(Boolean.TRUE); 95 | } 96 | 97 | return result; 98 | } 99 | 100 | } 101 | -------------------------------------------------------------------------------- /jscrapy-core/src/main/java/org/jscrapy/core/TaskManager.java: -------------------------------------------------------------------------------- 1 | package org.jscrapy.core; 2 | 3 | import org.jscrapy.core.config.JscrapyConfig; 4 | import org.jscrapy.core.task.Task; 5 | import org.slf4j.Logger; 6 | import 
org.slf4j.LoggerFactory; 7 | 8 | import java.util.Map; 9 | import java.util.concurrent.ConcurrentHashMap; 10 | 11 | /** 12 | * 管理一个节点上的所有Task,存放Task的状态(status) 13 | * Created by cxu on 2015/6/20. 14 | */ 15 | public class TaskManager { 16 | final static Logger logger = LoggerFactory.getLogger(TaskManager.class); 17 | 18 | private static TaskManager TASK_MANAGER = new TaskManager(); 19 | 20 | private Map tasks = new ConcurrentHashMap(); 21 | 22 | public static TaskManager instance() { 23 | return TASK_MANAGER; 24 | } 25 | 26 | public void runTask(JscrapyConfig JscrapyConfig) { 27 | String taskId = JscrapyConfig.getTaskId(); 28 | Task task = tasks.get(taskId); 29 | if (task != null) { 30 | logger.warn("task {} 已经存在,不能再次调度启动", task); 31 | } else { 32 | task = new Task(JscrapyConfig); 33 | tasks.put(taskId, task); 34 | task.run(); 35 | } 36 | } 37 | 38 | public void cancelTask(JscrapyConfig JscrapyConfig) { 39 | String taskId = JscrapyConfig.getTaskId(); 40 | Task task = tasks.get(taskId); 41 | if (task == null) { 42 | logger.warn("task {} 不存在,不能取消", task); 43 | } else { 44 | task.cancel(); 45 | tasks.remove(taskId); 46 | } 47 | } 48 | 49 | public void pauseTask(JscrapyConfig JscrapyConfig) { 50 | String taskId = JscrapyConfig.getTaskId(); 51 | Task task = tasks.get(taskId); 52 | if (task == null) { 53 | logger.warn("task {} 不存在,不能暂停", task); 54 | } else { 55 | task.pause(); 56 | tasks.remove(taskId); 57 | } 58 | } 59 | 60 | // public List getTaskStatus() { 61 | // List status = new ArrayList<>(); 62 | // Set taskIds = tasks.keySet(); 63 | // for (String taskId : taskIds) { 64 | // Task task = tasks.get(taskId); 65 | // if (task != null) { 66 | // TaskStatus sts = task.getJscrapyConfig().getTaskStatusObject(); 67 | // status.add(sts); 68 | // } 69 | // } 70 | // 71 | // return status; 72 | // } 73 | 74 | // public TaskStatus getTaskStatus(String taskId) { 75 | // TaskStatus status = null; 76 | // 77 | // Task task = tasks.get(taskId); 78 | // if (task != null) { 79 | 
// status = task.getJscrapyConfig().getTaskStatusObject(); 80 | // } 81 | // 82 | // return status; 83 | // } 84 | 85 | // /** 86 | // * 没找到返回null 87 | // * @param taskId 88 | // * @return 89 | // */ 90 | // public String getTaskLogFilePath(String taskId) { 91 | // String logFilePath = null; 92 | // Task task = tasks.get(taskId); 93 | // if (task != null) { 94 | // logFilePath = task.getJscrapyConfig().getTaskLogDir() + File.separator + DatetimeUtil.getTime("yyyyMMdd") + ".log"; 95 | // } 96 | // 97 | // return logFilePath; 98 | // } 99 | } 100 | -------------------------------------------------------------------------------- /jscrapy-core/src/test/java/urlconsumer/UrlConsumerTest.java: -------------------------------------------------------------------------------- 1 | package urlconsumer; 2 | 3 | import dedup.DeDupTest; 4 | import org.jscrapy.core.comsumer.UrlConsumer; 5 | import org.jscrapy.core.config.JscrapyConfig; 6 | import org.jscrapy.core.dal.h2queue.H2UrlQueueDo; 7 | import org.jscrapy.core.producer.UrlProducer; 8 | import org.jscrapy.core.request.HttpRequest; 9 | import org.jscrapy.core.request.RequestContext; 10 | import org.jscrapy.core.request.UrlStatus; 11 | import org.jscrapy.core.util.Yaml2BeanUtil; 12 | import org.junit.Test; 13 | import org.springframework.beans.factory.annotation.Autowired; 14 | import org.springframework.beans.factory.annotation.Qualifier; 15 | import org.springframework.test.context.TestPropertySource; 16 | import util.ResourcePathUtils; 17 | 18 | import java.io.File; 19 | import java.io.FileNotFoundException; 20 | import java.util.ArrayList; 21 | import java.util.List; 22 | 23 | import static org.testng.Assert.*; 24 | 25 | /** 26 | * Created by cxu on 2017/1/25. 
27 | */ 28 | 29 | @TestPropertySource("classpath:db.properties") 30 | public class UrlConsumerTest { 31 | @Autowired 32 | @Qualifier("h2UrlConsumer") 33 | private UrlConsumer urlConsumer; 34 | 35 | @Autowired 36 | @Qualifier("h2UrlProducer") 37 | private UrlProducer urlProducer; 38 | 39 | @Test 40 | public void test() { 41 | JscrapyConfig jscrapyConfig = getConfig(); 42 | assertNotNull(urlProducer); 43 | urlProducer.setJscrapyConfig(jscrapyConfig); 44 | 45 | List requestContexts = rendUrl(); 46 | int insertCount = urlProducer.push(requestContexts); 47 | assertEquals(1, insertCount); 48 | 49 | UrlConsumer[] consumers = dataProvider(); 50 | for (UrlConsumer consumer : consumers) { 51 | consumer.setJscrapyConfig(jscrapyConfig); 52 | List dequeueRequestContexts = urlConsumer.poll(1); 53 | for (RequestContext requestContext : dequeueRequestContexts) { 54 | UrlStatus status = requestContext.getUrlStatus(); 55 | assertEquals(status, UrlStatus.NEW); 56 | assertEquals(requestContext.getRetryTimes(), 0); 57 | } 58 | 59 | urlConsumer.delete(dequeueRequestContexts); 60 | } 61 | } 62 | 63 | public UrlConsumer[] dataProvider() { 64 | return new UrlConsumer[]{ 65 | urlConsumer, 66 | }; 67 | } 68 | 69 | /** 70 | * 生成模拟数据 71 | * @return 72 | */ 73 | private List rendUrl() { 74 | List requestContexts = new ArrayList<>(); 75 | RequestContext req = new RequestContext(new HttpRequest("http://jscrapy.org"), new H2UrlQueueDo()); 76 | req.setRetryTimes(1); 77 | req.setUrlStatus(UrlStatus.NEW); 78 | requestContexts.add(req); 79 | return requestContexts; 80 | } 81 | 82 | private JscrapyConfig getConfig() { 83 | String path = ResourcePathUtils.getResourceFileAbsPath(DeDupTest.class, "/H2UrlConsumerTest.yaml"); 84 | JscrapyConfig jscrapyConfig = null; 85 | try { 86 | jscrapyConfig = (JscrapyConfig) Yaml2BeanUtil.loadAsBean(JscrapyConfig.class, new File(path)); 87 | } catch (FileNotFoundException e) { 88 | e.printStackTrace(); 89 | fail(""); 90 | } 91 | return jscrapyConfig; 92 | } 93 | } 94 | 
-------------------------------------------------------------------------------- /jscrapy-ext/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 4 | 4.0.0 5 | 6 | org.jscrapy 7 | jscrapy 8 | 1.0-SNAPSHOT 9 | 10 | org.jscrapy 11 | jscrapy-ext 12 | 1.0-SNAPSHOT 13 | jar 14 | jscrapy-ext 15 | 16 | 17 | org.jscrapy 18 | jscrapy-core 19 | ${project.version} 20 | 21 | 22 | org.mongodb 23 | mongo-java-driver 24 | ${mongo-java-driver.version} 25 | 26 | 27 | org.mapdb 28 | mapdb 29 | ${mapdb.version} 30 | 31 | 32 | com.leansoft 33 | bigqueue 34 | ${bigqueue.version} 35 | 36 | 37 | us.codecraft 38 | webmagic-core 39 | ${webmagic.version} 40 | 41 | 42 | commons-io 43 | commons-io 44 | 45 | 46 | org.slf4j 47 | slf4j-api 48 | 49 | 50 | com.google.guava 51 | guava 52 | 53 | 54 | commons-logging 55 | commons-logging 56 | 57 | 58 | 59 | 60 | redis.clients 61 | jedis 62 | ${jedis.version} 63 | 64 | 65 | org.mongodb 66 | mongo-java-driver 67 | ${mongo-java-driver.version} 68 | 69 | 70 | 71 | 72 | 73 | src/main/resources 74 | true 75 | 76 | 77 | 78 | 79 | src/test/resources 80 | true 81 | 82 | 83 | 84 | 85 | -------------------------------------------------------------------------------- /jscrapy-core/src/main/java/org/jscrapy/core/dal/UrlQueueDo.java: -------------------------------------------------------------------------------- 1 | package org.jscrapy.core.dal; 2 | 3 | import org.jscrapy.core.request.UrlStatus; 4 | import org.jscrapy.core.request.UrlType; 5 | 6 | import java.util.Date; 7 | 8 | /** 9 | * Created by cxu on 2016/8/10. 
 */

/**
 * Row object for the URL queue table: one URL to crawl together with its
 * scheduling and state columns (column meanings per the h2queue mapper DDL).
 * Mutable POJO mapped by the MyBatis UrlQueueMapper implementations.
 */
public class UrlQueueDo {
    private Long id;             // primary key (IDENTITY column in the H2 DDL)
    private String url;          // the URL text
    private String schedId;      // id of the scheduling run that enqueued it
    private UrlStatus urlStatus; // queue state of this URL (e.g. NEW / OUT_QUEUE)
    private Integer retryTimes;  // fetch retry counter
    private UrlType urlType;     // seed / list / detail (per the DDL comment)
    private String siteId;       // owning site id
    private Date gmtCreated;     // time the row was inserted
    private Date gmtAccess;      // time the row was last accessed
    private String errorCode;    // last error code, if any
    private String errorMsg;     // last error detail, if any

    /**
     * All-columns constructor.
     * Note the parameter order: schedId comes before url.
     */
    public UrlQueueDo(Long id, String schedId, String url, UrlStatus urlStatus, Integer retryTimes, UrlType urlType, String siteId, Date gmtCreated, Date gmtAccess, String errorCode, String errorMsg) {
        this.id = id;
        this.url = url;
        this.schedId = schedId;
        this.urlStatus = urlStatus;
        this.retryTimes = retryTimes;
        this.urlType = urlType;
        this.siteId = siteId;
        this.gmtCreated = gmtCreated;
        this.gmtAccess = gmtAccess;
        this.errorCode = errorCode;
        this.errorMsg = errorMsg;
    }

    /** No-args constructor. */
    public UrlQueueDo() {

    }


    public void setId(Long id) {
        this.id = id;
    }

    public Long getId() {
        return id;
    }

    // NOTE(review): duplicate of the setId(Long) overload above — consider
    // removing one of the two setters.
    public void setId(long id) {
        this.id = id;
    }

    public String getUrl() {
        return url;
    }

    public void setUrl(String url) {
        this.url = url;
    }

    public String getSchedId() {
        return schedId;
    }

    public void setSchedId(String schedId) {
        this.schedId = schedId;
    }

    public UrlStatus getUrlStatus() {
        return urlStatus;
    }

    public void setUrlStatus(UrlStatus urlStatus) {
        this.urlStatus = urlStatus;
    }

    public Integer getRetryTimes() {
        return retryTimes;
    }

    public void setRetryTimes(Integer retryTimes) {
        this.retryTimes = retryTimes;
    }

    public UrlType getUrlType() {
        return urlType;
    }

    public void setUrlType(UrlType urlType) {
        this.urlType = urlType;
    }

    public String getSiteId() {
        return siteId;
    }

public void setSiteId(String siteId) { 100 | this.siteId = siteId; 101 | } 102 | 103 | public Date getGmtCreated() { 104 | return gmtCreated; 105 | } 106 | 107 | public void setGmtCreated(Date gmtCreated) { 108 | this.gmtCreated = gmtCreated; 109 | } 110 | 111 | public Date getGmtAccess() { 112 | return gmtAccess; 113 | } 114 | 115 | public void setGmtAccess(Date gmtAccess) { 116 | this.gmtAccess = gmtAccess; 117 | } 118 | 119 | public String getErrorCode() { 120 | return errorCode; 121 | } 122 | 123 | public void setErrorCode(String errorCode) { 124 | this.errorCode = errorCode; 125 | } 126 | 127 | public String getErrorMsg() { 128 | return errorMsg; 129 | } 130 | 131 | public void setErrorMsg(String errorMsg) { 132 | this.errorMsg = errorMsg; 133 | } 134 | } 135 | -------------------------------------------------------------------------------- /jscrapy-core/src/main/java/org/jscrapy/core/data/ProcessResult.java: -------------------------------------------------------------------------------- 1 | package org.jscrapy.core.data; 2 | 3 | import org.apache.commons.lang3.StringUtils; 4 | import org.jscrapy.common.datetime.DatetimeUtil; 5 | import org.jscrapy.core.request.HttpRequest; 6 | import org.jscrapy.core.request.HttpRequestMethod; 7 | 8 | import java.util.LinkedList; 9 | import java.util.List; 10 | 11 | /** 12 | * Created by cxu on 2015/10/1. 
 */

/**
 * Result of processing one downloaded page: the extracted data items plus
 * the follow-up links/requests discovered on the page. Built incrementally
 * via the add* methods, which all return {@code this} for chaining.
 */
public class ProcessResult {
    // Bookkeeping fields stamped onto every stored DataItem (see addOptionField).
    private static final String FIELD_CREATE_TIME = "_create_time";
    private static final String FIELD_URL = "_from_url";
    private static final String SCHEDULER_VERSION = "_scheduler_version";

    private HttpRequest request;      // request this result was produced from
    private String schedulerVersion;  // stamped into each item as _scheduler_version
    private List data;                // extracted data items
    private List links;               // follow-up requests to enqueue

    public ProcessResult() {
        data = new LinkedList<>();
        links = new LinkedList<>();
    }

    /** Adds one data item (null/empty items ignored); stamps bookkeeping fields. */
    public ProcessResult addDataItem(DataItem item) {
        if(item!=null && !item.isEmpty()) {
            item = addOptionField(item);
            data.add(item);
        }

        return this;
    }

    /** Adds every non-empty item from the list; stamps bookkeeping fields. */
    public ProcessResult addDataItem(List items) {
        if (items != null && items.size() > 0) {
            for (DataItem item : items) {
                if (!item.isEmpty()) {
                    item = addOptionField(item);
                    data.add(item);
                }
            }
        }
        return this;
    }

    /** Adds one follow-up link as a GET request (blank links ignored). */
    public ProcessResult addLinks(String link) {
        if (StringUtils.isNotBlank(link)) {
            HttpRequest req = new HttpRequest(link, HttpRequestMethod.GET, null);
            links.add(req);
        }

        return this;
    }

    /**
     * Adds each link in the list as a GET request.
     * NOTE(review): unlike the single-link overload, blank entries are not
     * filtered here — confirm that is intended.
     */
    public ProcessResult addLinks(List links) {
        if (links != null) {
            for (String s : links) {
                HttpRequest req = new HttpRequest(s, HttpRequestMethod.GET, null);
                this.links.add(req);
            }
        }
        return this;
    }

    /** Adds one ready-made follow-up request (null ignored). */
    public ProcessResult addRequest(HttpRequest link) {
        if (link != null) {
            links.add(link);
        }
        return this;
    }

    /** Adds all requests from the list (null/empty list ignored). */
    public ProcessResult addRequest(List links) {
        if (links != null && links.size() > 0) {
            this.links.addAll(links);
        }
        return this;
    }

    public List getData() {
        return data;
    }

    public List getLinks() {
        return links;
    }

    /** Current time formatted as yyyy-MM-dd HH:mm:ss. */
    private String getTimeStrNow() {
        return DatetimeUtil.getTime("yyyy-MM-dd HH:mm:ss");
    }

    public void setRequest(HttpRequest request)
{ 96 | this.request = request; 97 | } 98 | 99 | public String getSchedulerVersion() { 100 | return schedulerVersion; 101 | } 102 | 103 | public void setSchedulerVersion(String schedulerVersion) { 104 | this.schedulerVersion = schedulerVersion; 105 | } 106 | 107 | private DataItem addOptionField(DataItem item) { 108 | item.put(FIELD_CREATE_TIME, getTimeStrNow()); 109 | item.put(FIELD_URL, request.asJson()); 110 | item.put(SCHEDULER_VERSION, schedulerVersion); 111 | return item; 112 | } 113 | } 114 | -------------------------------------------------------------------------------- /jscrapy-core/src/main/java/org/jscrapy/core/request/RequestContext.java: -------------------------------------------------------------------------------- 1 | package org.jscrapy.core.request; 2 | 3 | import org.jscrapy.core.dal.UrlQueueDo; 4 | 5 | import java.util.Date; 6 | import java.util.Map; 7 | 8 | /** 9 | * Created by cxu on 2017/1/19. 10 | */ 11 | public class RequestContext { 12 | private HttpRequest request; 13 | private UrlQueueDo urlQueueDo; 14 | 15 | public RequestContext() { 16 | request = new HttpRequest(); 17 | urlQueueDo = new UrlQueueDo(); 18 | } 19 | 20 | public RequestContext(UrlQueueDo urlQueueDo) { 21 | this.urlQueueDo = urlQueueDo; 22 | request = HttpRequest.build(urlQueueDo.getUrl()); 23 | } 24 | 25 | public RequestContext(HttpRequest request, UrlQueueDo urlQueueDo) { 26 | this.request = request; 27 | this.urlQueueDo = urlQueueDo; 28 | } 29 | 30 | public String getUrl() { 31 | return request.getUrl(); 32 | } 33 | 34 | public String getFullUrl() { 35 | return request.asJson(); 36 | } 37 | 38 | public HttpRequestMethod getHttpMethod() { 39 | return request.getHttpMethod(); 40 | } 41 | 42 | public Map getParameters() { 43 | return request.getParameters(); 44 | } 45 | 46 | public String asJson() { 47 | return request.asJson(); 48 | } 49 | 50 | public String fp() { 51 | return request.uniqId(); 52 | } 53 | public void setUrl(String url) { 54 | request.setUrl(url); 55 | } 
56 | 57 | 58 | 59 | public Long getId() { 60 | return urlQueueDo.getId(); 61 | } 62 | 63 | public void setId(long id) { 64 | urlQueueDo.setId(id); 65 | } 66 | 67 | public void setUrlStatus(UrlStatus urlStatus) { 68 | urlQueueDo.setUrlStatus(urlStatus); 69 | } 70 | 71 | public UrlStatus getUrlStatus() { 72 | return urlQueueDo.getUrlStatus(); 73 | } 74 | 75 | public int getRetryTimes() { 76 | return urlQueueDo.getRetryTimes(); 77 | } 78 | 79 | public void setRetryTimes(Integer retryTimes) { 80 | urlQueueDo.setRetryTimes(retryTimes); 81 | } 82 | 83 | public UrlType getUrlType() { 84 | return urlQueueDo.getUrlType(); 85 | } 86 | 87 | public void setUrlType(UrlType urlType) { 88 | urlQueueDo.setUrlType(urlType); 89 | } 90 | 91 | public String getSiteId() { 92 | return urlQueueDo.getSiteId(); 93 | } 94 | 95 | public void setSiteId(String siteId) { 96 | urlQueueDo.setSiteId(siteId); 97 | } 98 | 99 | public Date getGmtCreated() { 100 | return urlQueueDo.getGmtCreated(); 101 | } 102 | 103 | public void setGmtCreated(Date gmtCreated) { 104 | urlQueueDo.setGmtCreated(gmtCreated); 105 | } 106 | 107 | public Date getGmtAccess() { 108 | return urlQueueDo.getGmtAccess(); 109 | } 110 | 111 | public void setGmtAccess(Date gmtAccess) { 112 | urlQueueDo.setGmtAccess(gmtAccess); 113 | } 114 | 115 | public String getErrorCode() { 116 | return urlQueueDo.getErrorCode(); 117 | } 118 | 119 | public void setErrorCode(String errorCode) { 120 | urlQueueDo.setErrorCode(errorCode); 121 | } 122 | 123 | public String getErrorMsg() { 124 | return urlQueueDo.getErrorMsg(); 125 | } 126 | 127 | public void setErrorMsg(String errorMsg) { 128 | urlQueueDo.setErrorMsg(errorMsg); 129 | } 130 | 131 | public HttpRequest toHttpRequest() { 132 | //TODO 133 | return null; 134 | } 135 | 136 | public UrlQueueDo toUrlQueueDo() { 137 | //TODO 138 | return null; 139 | } 140 | } 141 | -------------------------------------------------------------------------------- 
/jscrapy-core/src/main/java/org/jscrapy/core/spider/impl/GenericSpider.java: -------------------------------------------------------------------------------- 1 | package org.jscrapy.core.spider.impl; 2 | 3 | import org.jscrapy.core.config.JscrapyConfig; 4 | import org.jscrapy.core.data.ProcessResult; 5 | import org.jscrapy.core.page.Page; 6 | import org.jscrapy.core.request.HttpRequest; 7 | import org.jscrapy.core.request.RequestContext; 8 | import org.jscrapy.core.spider.Spider; 9 | import org.slf4j.Logger; 10 | import org.slf4j.LoggerFactory; 11 | 12 | import java.util.List; 13 | import java.util.concurrent.TimeUnit; 14 | 15 | /** 16 | * Created by cxu on 2018/2/8. 17 | */ 18 | public class GenericSpider extends Spider{ 19 | final static Logger logger = LoggerFactory.getLogger(Spider.class); 20 | 21 | public GenericSpider(JscrapyConfig jscrapyConfig) { 22 | super(jscrapyConfig); 23 | } 24 | 25 | /** 26 | * 27 | */ 28 | @Override 29 | protected void mainLoop() { 30 | 31 | int n = poolBigQueue(); 32 | if (n != 0) {//集中式队列里有URL 33 | processMemQueue(); 34 | }else{ //没URL的情况下考虑睡眠,但不考虑停止,是否停止听从admin里的调度模块发送的指令 35 | int sleepTimeMs = 100; 36 | try { 37 | TimeUnit.MILLISECONDS.sleep(sleepTimeMs); 38 | } catch (InterruptedException e) { 39 | logger.info("等待新的URL过程中发生InterruptedException"); 40 | //TODO 41 | } 42 | logger.info("睡眠{}秒,等待新的URL", sleepTimeMs); 43 | //睡然后等待,可能队列里的URL被处理完了 TODO 44 | } 45 | } 46 | 47 | /** 48 | * 49 | * @return 50 | */ 51 | private int poolBigQueue() { 52 | int fetchSize = getJscrapyConfig().getUrlFetchSize(); 53 | 54 | //从集中的队列取URL 55 | List requestContexts = urlConsumer.poll(fetchSize); 56 | return requestContexts.size(); 57 | } 58 | 59 | /** 60 | * 61 | */ 62 | private void processMemQueue() { 63 | HttpRequest request = null; 64 | while ((request = getRequest()) != null) {//处理内存里的request,直到结束 65 | processOneRequest(request); 66 | } 67 | } 68 | 69 | /** 70 | * 71 | * @return 72 | */ 73 | private HttpRequest getRequest() { 74 | return 
urlConsumer.getMemQueue().poll(); 75 | } 76 | 77 | /** 78 | * 79 | * @param httpRequest 80 | */ 81 | private void processOneRequest(HttpRequest httpRequest) { 82 | Page pg = null; 83 | try { 84 | pg = cacher.loadPage(httpRequest); 85 | } catch (Throwable e) { 86 | logger.error("读取缓存页面文件失败{}", e); 87 | //TODO 这个错误可以跳过,如果缓存没有直接网上下载也可以 88 | } 89 | if (pg == null) {//缓存么有命中或者缓存出错 90 | pg = downloader.download(httpRequest); 91 | if (pg != null) { 92 | logger.debug("网络下载成功{}", pg); 93 | } else { 94 | logger.info("网络下载失败{}", pg); 95 | } 96 | 97 | } else {//缓存命中了 98 | logger.debug("命中缓存{}", httpRequest); 99 | } 100 | if (pg == null) { 101 | logger.error("页面未命中缓存且下载失败 page=null"); 102 | //TODO 这里出错时候考虑一下是否要调整速率 103 | return; 104 | } 105 | ProcessResult result = processor.process(pg); 106 | //处理链接 107 | List newLinks = result.getLinks(); 108 | urlProducer.push(null); 109 | 110 | 111 | //存储数据 112 | try{ 113 | pipline.save(result.getData()); 114 | }catch (Throwable e) { 115 | logger.error("保存文件时出错 {}", e); 116 | //TODO 数据保存出错统计 117 | } 118 | 119 | if (!pg.isFromCache()) {//sleep 120 | try { 121 | TimeUnit.MILLISECONDS.sleep(1000);//TODO 参数化 122 | } catch (InterruptedException e) { 123 | logger.info("等待新的URL过程中发生InterruptedException"); 124 | return; 125 | } 126 | } 127 | } 128 | } 129 | -------------------------------------------------------------------------------- /jscrapy-core/src/main/java/org/jscrapy/core/comsumer/impl/H2UrlConsumer.java: -------------------------------------------------------------------------------- 1 | package org.jscrapy.core.comsumer.impl; 2 | 3 | import org.jscrapy.core.comsumer.UrlConsumer; 4 | import org.jscrapy.core.config.JscrapyConfig; 5 | import org.jscrapy.core.config.modulecfg.H2QueueConfig; 6 | import org.jscrapy.core.dal.UrlQueueDo; 7 | import org.jscrapy.core.dal.UrlQueueMapper; 8 | import org.jscrapy.core.request.RequestContext; 9 | import org.jscrapy.core.request.UrlStatus; 10 | 11 | import java.util.ArrayList; 12 | import 
java.util.List;
import java.util.concurrent.locks.ReentrantLock;

/**
 * URL consumer backed by the per-task H2 queue table.
 * Created by cxu on 2016/7/30.
 */
public class H2UrlConsumer extends UrlConsumer {

    private UrlQueueMapper urlQueueMapper;

    /**
     * Takes up to n requests out of the queue, under the task queue lock.
     *
     * @param n how many requests to take per call
     * @return the dequeued requests wrapped as RequestContext; null if an
     *         exception fired before the result was assigned
     */
    @Override
    public List poll(int n) {
        List requestContexts = null;
        // BUG(review): h2ComponentConfig is hard-coded to null (the real
        // lookup is commented out), so getH2QueueLock() on the next line
        // throws NullPointerException on every call — restore the config
        // lookup before this class is used.
        H2QueueConfig h2ComponentConfig = null;//(H2QueueConfig)getJscrapyConfig().get(ComponentName.QUEUE_H2);
        ReentrantLock taskQueueLock = h2ComponentConfig.getH2QueueLock();
        try {
            taskQueueLock.lock(); //lock
            List urls = selectUrl(n);
            requestContexts = toRequestContext(urls);
        } catch (Exception e) {
            e.printStackTrace();
        }
        finally {
            taskQueueLock.unlock();
        }
        return requestContexts;
    }

    /**
     * Marks the given requests as finished in the queue table.
     * NOTE(review): despite the name this performs a batchUpdate (not a
     * DELETE) and always returns 0 — confirm the contract with callers.
     *
     * @param requests requests to remove from the queue
     * @return always 0
     */
    @Override
    public int delete(List requests) {
        List urlQueueDos = toUrlQueueDo(requests);

        if (urlQueueDos.size() > 0) {
            String queueName = getJscrapyConfig().getTaskFp();
            urlQueueMapper.batchUpdate(queueName, urlQueueDos);
        }

        return 0;
    }

    /**
     * Selects up to n URLs in status NEW; if fewer were found, tops up with
     * URLs already in OUT_QUEUE, then marks every selected row as OUT_QUEUE
     * (updating gmt_access/url_status).
     * NOTE(review): the top-up query asks for n more rows (not n - found),
     * so more than n URLs can be returned — confirm intended.
     *
     * @param n maximum number of URLs wanted
     * @return the selected queue rows
     */
    private List selectUrl(int n) {
        String queueName = getJscrapyConfig().getTaskFp();
        List urls = urlQueueMapper.selectUrlByStatus(queueName,
                UrlStatus.NEW,
                n);

        int m = n - urls.size();
        if (m > 0) {//top up to n with URLs already marked as dequeued
            List urlsOutQueue = urlQueueMapper.selectUrlByStatus(queueName,
                    UrlStatus.OUT_QUEUE, n);
            urls.addAll(urlsOutQueue);
        }
        // Update {gmt_access,url_status}
        if (urls.size() > 0) {
            urlQueueMapper.batchUpdateUrlStatus(queueName, UrlStatus.OUT_QUEUE, urls);
        }

        return urls;
    }

    /**
     * Wraps queue rows into RequestContext objects, one per UrlQueueDo.
     *
     * @return the wrapped contexts
     */
    private List toRequestContext(List urlQueueDos) {
        List requestContexts = new ArrayList<>();
for (UrlQueueDo urlQueueDo : urlQueueDos) { 95 | requestContexts.add(new RequestContext(urlQueueDo)); 96 | } 97 | 98 | return requestContexts; 99 | } 100 | 101 | /** 102 | * 103 | * @param requestContexts 104 | * @return 105 | */ 106 | private List toUrlQueueDo(List requestContexts) { 107 | List urlQueueDos = new ArrayList<>(); 108 | for (RequestContext rcx : requestContexts) { 109 | urlQueueDos.add(rcx.toUrlQueueDo()); 110 | } 111 | 112 | return urlQueueDos; 113 | } 114 | 115 | /** 116 | * 117 | * @param jscrapyConfig 118 | */ 119 | public void setJscrapyConfig(JscrapyConfig jscrapyConfig) { 120 | super.setJscrapyConfig(jscrapyConfig); 121 | urlQueueMapper.createNewQueue(jscrapyConfig.getTaskFp()); 122 | } 123 | } 124 | -------------------------------------------------------------------------------- /jscrapy-core/src/main/java/org/jscrapy/core/request/HttpRequest.java: -------------------------------------------------------------------------------- 1 | package org.jscrapy.core.request; 2 | 3 | import com.alibaba.fastjson.JSON; 4 | import com.alibaba.fastjson.JSONException; 5 | import com.alibaba.fastjson.JSONObject; 6 | import com.alibaba.fastjson.annotation.JSONField; 7 | import org.apache.commons.codec.digest.DigestUtils; 8 | import org.apache.commons.lang3.StringUtils; 9 | import org.jscrapy.common.http.HttpHeaderConstant; 10 | import org.jscrapy.core.proxy.WatchableSpiderProxy; 11 | import org.slf4j.Logger; 12 | import org.slf4j.LoggerFactory; 13 | 14 | import java.util.*; 15 | 16 | /** 17 | * Created by cxu on 2014/11/21. 
18 | */ 19 | public class HttpRequest extends Request{ 20 | final static Logger logger = LoggerFactory.getLogger(HttpRequest.class); 21 | private String url;//请求的url 22 | private HttpRequestMethod httpMethod;//请求的http方法,GET|POST等 23 | private Map parameters;//如果是post请求,这里存放请求参数 24 | 25 | private WatchableSpiderProxy proxy; 26 | private Map header = new HashMap(); 27 | private List acceptCode = new ArrayList();; 28 | private String charset;//站点的编码 29 | 30 | /** 31 | * 构造函数 32 | * @param url 33 | * @param httpMethod 34 | * @param parameters 35 | */ 36 | public HttpRequest(String url, HttpRequestMethod httpMethod, Map parameters) { 37 | this.url = url; 38 | this.httpMethod = httpMethod; 39 | this.parameters = parameters; 40 | 41 | header.put(HttpHeaderConstant.USER_AGENT, "myspider@github"); 42 | acceptCode.add(200); 43 | } 44 | 45 | public HttpRequest(String url) { 46 | this.url = url; 47 | this.httpMethod = HttpRequestMethod.GET; 48 | } 49 | 50 | public HttpRequest(){} 51 | 52 | @Override 53 | public String getUrl() { 54 | return this.url; 55 | } 56 | 57 | @Override 58 | public HttpRequestMethod getHttpMethod() { 59 | return this.httpMethod; 60 | } 61 | 62 | @Override 63 | public Map getParameters() { 64 | return this.parameters; 65 | } 66 | 67 | @Override 68 | public String asJson() { 69 | JSONObject jsonObject = new JSONObject(true); 70 | jsonObject.put("url", this.url); 71 | jsonObject.put("http_method", this.httpMethod.name()); 72 | 73 | /*Map里的key一定要排序之后,因为去重用的是md5*/ 74 | if (parameters != null && !parameters.isEmpty()) { 75 | Map jsonParam = new TreeMap(); 76 | Set> entrySet = this.parameters.entrySet(); 77 | for (Map.Entry entry : entrySet) { 78 | jsonParam.put(entry.getKey(), entry.getValue()); 79 | } 80 | JSONObject params = new JSONObject(jsonParam); 81 | jsonObject.put("parameters", params); 82 | } 83 | 84 | return jsonObject.toJSONString(); 85 | } 86 | 87 | @Override 88 | public String uniqId() { 89 | String s = this.asJson(); 90 | return 
DigestUtils.sha1Hex(s); 91 | } 92 | 93 | @Override 94 | public String toString(){ 95 | return this.asJson(); 96 | } 97 | 98 | @JSONField(name="url") 99 | public void setUrl(String url){ 100 | this.url = url; 101 | } 102 | 103 | @JSONField(name="http_method") 104 | public void setHttpMethod(String method) 105 | { 106 | this.httpMethod = HttpRequestMethod.valueOf(method); 107 | } 108 | 109 | @JSONField(name="parameters") 110 | public void SetFormParameters(Map params){ 111 | this.parameters = params; 112 | } 113 | 114 | /** 115 | * 从队列里的json字符串来创建一个HttpRequest 116 | * @param jsonString 117 | * @return 118 | */ 119 | public static HttpRequest build(String jsonString) throws JSONException { 120 | if(StringUtils.isNotBlank(jsonString)) { 121 | HttpRequest req = (HttpRequest) JSON.parseObject(jsonString, HttpRequest.class); 122 | return req; 123 | } 124 | 125 | return null; 126 | } 127 | } 128 | -------------------------------------------------------------------------------- /jscrapy-core/src/main/java/org/jscrapy/core/config/ConfigKeys.java: -------------------------------------------------------------------------------- 1 | package org.jscrapy.core.config; 2 | 3 | /** 4 | * _开头:的是一般的常量key, 5 | * RT_VAR开头:运行过程中会变化的量。比如任务的状态从开始(RUN)到结束(FINISH) 6 | * RT_EXT开头:派生出的配置。比如使用mongodb作为缓存的时候,会自动生成一个db,名字以任务的名字一样。 7 | * 8 | * Created by cxu on 2015/6/21. 
9 | */ 10 | public class ConfigKeys { 11 | 12 | ///////////////////////////////////////////////////////////////////////// 13 | public static final String TASK_ID = "task.id"; 14 | public static final String TASK_NAME = "task.name"; 15 | public static final String TASK_VIRTUAL_ID = "task.virtual.id"; 16 | 17 | //保存本地任务的配置,缓存,数据等的总目录地址 18 | public static final String TASK_WORK_DIR = "task.work.dir"; 19 | public static final String TASK_WAIT_URL_TIMEOUT = "task.wait.url.timeout"; 20 | public static final String TASK_THREAD_COUNT = "task.thread.count"; 21 | 22 | ///////////////////////////////////////////////////////////////////////// 23 | public static final String SCHEDULER_BATCH_SIZE = "scheduler.batch.size"; 24 | public static final String SCHEDULER_CLASS_NAME = "scheduler.class.name"; 25 | public static final String SCHEDULER_REDIS_HOST = "scheduler.redis.host"; 26 | public static final String SCHEDULER_MONGO_HOST = "scheduler.mongo.host"; 27 | public static final String SCHEDULER_MONGO_PORT = "scheduler.mongo.port"; 28 | 29 | ///////////////////////////////////////////////////////////////////////// 30 | public static final String DEDUP_CLASS_NAME = "dedup_class_name"; 31 | public static final String DEDUP_REDIS_HOST = "dedup.redis.host"; 32 | public static final String DEDUP_MONGO_HOST = "dedup.mongo.host"; 33 | public static final String DEDUP_MONGO_PORT = "dedup.mongo.port"; 34 | 35 | ///////////////////////////////////////////////////////////////////////// 36 | public static final String DOWNLOADER_CLASS_NAME = "downloader.class.name"; 37 | 38 | ///////////////////////////////////////////////////////////////////////// 39 | public static final String PIPLINE_CLASS_NAME = "pipline.class.name"; 40 | 41 | ///////////////////////////////////////////////////////////////////////// 42 | public static final String PROCESSOR_CLASS_NAME = "processor.class.name"; 43 | 44 | ///////////////////////////////////////////////////////////////////////// 45 | public static final 
String CACHER_CLASS_NAME = "cacher.class.name"; 46 | public static final String CACHER_MONGODB_HOST = "cacher.mongo.host"; 47 | public static final String CACHER_MONGODB_PORT = "cacher.mongo.port"; 48 | 49 | ///////////////////////////////////////////////////////////////////////// 50 | // 发不到网络(nuxus)上的jar包(内含解析,提取链接、业务流程控制等逻辑) 51 | public static final String PLUGIN_URL = "plugin.url"; 52 | 53 | ///////////////////////////////////////////////////////////////////////// 54 | public static final String RT_EXT_DEDUP_MONGODB_DB_NAME = "rt.ext.mongodb.dedup.db.name";//去重 55 | public static final String RT_EXT_CACHER_MONGODB_DB_NAME = "rt.ext.mongodb.cacher.db.name";//缓存 56 | public static final String RT_EXT_TASK_LOGGER = "rt.ext.task.logger"; 57 | public static final String RT_EXT_RT_LOCAL_FILE_PIPLINE_DATA_FILE = "rt.ext.local.file.pipline.data.file"; 58 | public static final String RT_EXT_RT_LOCAL_TASK_WORK_DIR = "rt.ext.local.task.work.dir"; 59 | public static final String RT_EXT_RT_LOCAL_TASK_CACHER_DIR = "rt.ext.local.task.cacher.dir"; 60 | public static final String RT_EXT_RT_LOCAL_QUEUE_DIR = "rt.ext.local.queue.dir"; 61 | public static final String RT_EXT_RT_TASK_LOG_DIR = "rt.ext.task.log.dir";//分任务记录的日志 62 | 63 | ///////////////////////////////////////////////////////////////////////// 64 | /*任务当时应该处于的状态:运行、暂停(保存队列)、结束(需要清空队列)*/ 65 | public static final String RT_VAR_TASK_CTL_CMD = "rt.var.task.control.cmd"; 66 | 67 | ///////////////////////////////////////////////////////////////////////// 68 | public static final String _SCHEDULER_MEM_QUEUE_OBJ = "_scheduler_mem_queue"; 69 | public static final String _SCHEDULER_DISK_QUEUE_OBJ = "_scheduler_disk_queue"; 70 | public static final String _PROCESSOR_OBJ = "_processor_obj"; 71 | public static final String _DEDUP_MEM_SET_OBJ = "_dedup_mem_set_obj"; 72 | public static final String _DEDUP_DISK_SET_OBJ = "_dedup_disk_set_obj"; 73 | public static final String _SCHEDULER_OBJ = "_scheduler_obj"; 74 | public static final 
String _TASK_STATUS_OBJ = "_task_status_obj"; 75 | public static final String _GROOVY_SCRIPT_OBJ = "_groovy_script_obj"; 76 | } 77 | -------------------------------------------------------------------------------- /jscrapy-core/src/main/resources/spring/spring-mybatis.xml: -------------------------------------------------------------------------------- 1 | 2 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | -------------------------------------------------------------------------------- /jscrapy-core/src/main/resources/mapper/h2queue.xml: -------------------------------------------------------------------------------- 1 | 2 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | CREATE TABLE IF NOT EXISTS ${queue_name} 24 | ( 25 | id IDENTITY PRIMARY KEY, -- 唯一键 26 | sched_id VARCHAR(64), -- 调度的唯一性标识,每次都不一样即可,可以取当前任务启动时间戳 27 | url TEXT, -- url 28 | url_status CHAR(16), -- 种子状态[NEW:新进入, OUT:出队列, ERR:出错] 29 | retry_times TINYINT, -- 重试次数 30 | url_type CHAR(16), -- 种子类型[SEED:种子(Seed), LIST:列表(List), DETAIL: 详情页(Detail)] 31 | site_id VARCHAR(64), -- 站点ID 32 | gmt_created TIMESTAMP, -- url插入时间 33 | gmt_access TIMESTAMP, -- 被访问时间 34 | error_code CHAR(16), -- 错误编码 35 | error_msg TEXT -- 错误详细信息 36 | ); 37 | 38 | 39 | 40 | DROP TABLE IF EXISTS ${queue_name} 41 | 42 | 43 | 44 | INSERT INTO ${queue_name} (url, sched_id, url_status, retry_times, url_type, site_id, gmt_created, gmt_access, error_code, error_msg) 45 | VALUES 46 | 47 | ( 48 | #{url.url,jdbcType=VARCHAR}, 49 | #{url.schedId,jdbcType=VARCHAR}, 50 | #{url.urlStatus,jdbcType=VARCHAR}, 51 | #{url.retryTimes,jdbcType=BIGINT}, 52 | 
#{url.urlType,jdbcType=VARCHAR}, 53 | #{url.siteId,jdbcType=VARCHAR}, 54 | CURRENT_TIMESTAMP(), 55 | CURRENT_TIMESTAMP(), 56 | #{url.errorCode,jdbcType=VARCHAR}, 57 | #{url.errorMsg,jdbcType=VARCHAR} 58 | ) 59 | 60 | 61 | 62 | 63 | 66 | 67 | 68 | INSERT INTO ${queue_name} (id, url, url_status, gmt_access) 69 | VALUES 70 | 71 | ( 72 | #{url.id, jdbcType=BIGINT}, 73 | #{url.url, jdbcType=VARCHAR}, 74 | #{url.urlStatus, jdbcType=VARCHAR}, 75 | CURRENT_TIMESTAMP() 76 | ) 77 | 78 | ON DUPLICATE KEY UPDATE 79 | url=VALUES(url), 80 | url_status=VALUES(url_status), 81 | gmt_access=CURRENT_TIMESTAMP(); 82 | 83 | 84 | 85 | INSERT INTO ${queue_name} (id, url_status, gmt_access) 86 | VALUES 87 | 88 | ( 89 | #{url.id, jdbcType=BIGINT}, 90 | #{url.urlStatus, jdbcType=VARCHAR}, 91 | CURRENT_TIMESTAMP() 92 | ) 93 | 94 | ON DUPLICATE KEY UPDATE 95 | url_status= #{url_status, jdbcType=VARCHAR}, 96 | gmt_access=CURRENT_TIMESTAMP(); 97 | 98 | 99 | 100 | DELETE FROM ${queue_name} 101 | WHERE id IN 102 | 103 | #{urlQueueDo.id,jdbcType=BIGINT} 104 | 105 | 106 | 107 | 108 | -------------------------------------------------------------------------------- /jscrapy-ext/src/main/java/org/jscrapy/ext/fetcher/ApacheHttpFetcher.java: -------------------------------------------------------------------------------- 1 | package org.jscrapy.ext.fetcher; 2 | 3 | /** 4 | * Created by cxu on 2015/9/29. 
/**
 * Placeholder for an HTTP fetcher backed by Apache HttpClient.
 *
 * <p>NOTE(review): this class previously contained a fully commented-out
 * HttpClient implementation (request building for GET/POST, header and cookie
 * setup, proxy configuration with credential auth, and a
 * {@code BasicHttpClientConnectionManager}-based client builder). None of it
 * was live code, so it has been removed; the project's active download path is
 * the OkHttp-based downloader ({@code OkHttpDownloaderImpl}). If an Apache
 * HttpClient fetcher is needed again, recover the old draft from version
 * control history rather than resurrecting commented-out code.
 */
public class ApacheHttpFetcher {
}