├── .travis.yml ├── assets ├── logo.jpg ├── image1.pdf ├── logo3.png ├── logo4.png ├── webmagic.psd ├── logo-simple.jpg ├── logo2.graffle │ └── image1.tiff ├── logo3.graffle │ ├── image1.tiff │ ├── image2.tiff │ ├── image4.tiff │ └── image5.tiff ├── webmagic-create-spider.png ├── webmagic-spider-manage.png └── page-extract-rule.bmml ├── lib ├── guava-15.0.jar ├── junit-4.11.jar ├── jedis-2.0.0.jar ├── jsoup-1.7.2.jar ├── log4j-1.2.17.jar ├── xsoup-0.2.4.jar ├── fastjson-1.1.37.jar ├── httpcore-4.3.2.jar ├── json-path-0.8.1.jar ├── slf4j-api-1.7.6.jar ├── commons-codec-1.6.jar ├── commons-io-1.3.2.jar ├── commons-lang-2.6.jar ├── commons-lang3-3.1.jar ├── hamcrest-core-1.3.jar ├── httpclient-4.3.3.jar ├── json-smart-1.1.1.jar ├── assertj-core-1.5.0.jar ├── commons-pool-1.5.5.jar ├── slf4j-log4j12-1.7.6.jar ├── webmagic-core-0.5.2.jar ├── commons-logging-1.1.3.jar ├── commons-collections-3.2.1.jar └── webmagic-extension-0.5.2.jar ├── webmagic-samples ├── README.md ├── src │ ├── main │ │ ├── java │ │ │ └── us │ │ │ │ └── codecraft │ │ │ │ └── webmagic │ │ │ │ ├── samples │ │ │ │ ├── pipeline │ │ │ │ │ └── ReplacePipeline.java │ │ │ │ ├── formatter │ │ │ │ │ └── StringTemplateFormatter.java │ │ │ │ ├── scheduler │ │ │ │ │ └── LevelLimitScheduler.java │ │ │ │ ├── GithubRepo.java │ │ │ │ ├── NjuBBSProcessor.java │ │ │ │ ├── TianyaPageProcesser.java │ │ │ │ ├── HuxiuProcessor.java │ │ │ │ ├── IteyeBlogProcessor.java │ │ │ │ ├── OschinaPageProcesser.java │ │ │ │ ├── F58PageProcesser.java │ │ │ │ ├── KaichibaProcessor.java │ │ │ │ ├── QzoneBlogProcessor.java │ │ │ │ ├── InfoQMiniBookProcessor.java │ │ │ │ └── GithubRepoPageProcessor.java │ │ │ │ └── model │ │ │ │ └── samples │ │ │ │ ├── Blog.java │ │ │ │ ├── QQMeishi.java │ │ │ │ ├── OschinaAnswer.java │ │ │ │ ├── IteyeBlog.java │ │ │ │ ├── DianpingFtlDataScanner.java │ │ │ │ ├── BaiduNews.java │ │ │ │ └── JokejiModel.java │ │ └── resources │ │ │ ├── crawl.js │ │ │ └── log4j.xml │ └── test │ │ └── java │ │ └── us │ │ └── 
codecraft │ │ └── webmagic │ │ ├── samples │ │ └── scheduler │ │ │ └── DelayQueueSchedulerTest.java │ │ └── processor │ │ └── SinablogProcessorTest.java └── pom.xml ├── webmagic-avalon ├── webmagic-worker │ ├── README.md │ └── src │ │ ├── main │ │ ├── resources │ │ │ ├── freemarker.properties │ │ │ └── log │ │ │ │ └── log4j.xml │ │ ├── webapp │ │ │ └── WEB-INF │ │ │ │ └── jsp │ │ │ │ └── 500.jsp │ │ └── java │ │ │ └── us │ │ │ └── codecraft │ │ │ └── webmagic │ │ │ └── worker │ │ │ ├── controller │ │ │ └── SpiderController.java │ │ │ └── Worker.java │ │ └── test │ │ └── java │ │ └── us │ │ └── codecraft │ │ └── webmagic │ │ └── worker │ │ └── WorkerTest.java ├── webmagic-admin │ ├── README.md │ ├── src │ │ └── main │ │ │ ├── webapp │ │ │ ├── static │ │ │ │ ├── favicon.jpg │ │ │ │ ├── js │ │ │ │ │ └── jquery.flot.resize.min.js │ │ │ │ └── css │ │ │ │ │ ├── jquery.cleditor.css │ │ │ │ │ └── fullcalendar.print.css │ │ │ └── WEB-INF │ │ │ │ ├── pages │ │ │ │ └── create_spider.ftl │ │ │ │ └── jsp │ │ │ │ └── 500.jsp │ │ │ ├── resources │ │ │ ├── freemarker.properties │ │ │ └── log │ │ │ │ └── log4j.xml │ │ │ └── java │ │ │ └── us │ │ │ └── codecraft │ │ │ └── webmagic │ │ │ └── avalon │ │ │ └── web │ │ │ ├── DashBoardController.java │ │ │ └── SpiderController.java │ └── pom.xml ├── README.md ├── forger │ ├── src │ │ ├── test │ │ │ ├── java │ │ │ │ └── us │ │ │ │ │ └── codecraft │ │ │ │ │ └── forger │ │ │ │ │ ├── Fooable.java │ │ │ │ │ ├── compiler │ │ │ │ │ └── GroovyForgerCompilerTest.java │ │ │ │ │ ├── Bar.java │ │ │ │ │ └── Foo.java │ │ │ └── resources │ │ │ │ └── log4j.xml │ │ └── main │ │ │ ├── java │ │ │ └── us │ │ │ │ └── codecraft │ │ │ │ └── forger │ │ │ │ ├── compiler │ │ │ │ ├── ForgerCompiler.java │ │ │ │ └── GroovyForgerCompiler.java │ │ │ │ ├── property │ │ │ │ ├── format │ │ │ │ │ ├── ObjectFormatter.java │ │ │ │ │ ├── TypeFormatter.java │ │ │ │ │ ├── DateFormatter.java │ │ │ │ │ ├── ObjectFormatterWithParams.java │ │ │ │ │ └── Formatter.java │ │ │ │ ├── 
PropertyLoader.java │ │ │ │ ├── Inject.java │ │ │ │ ├── PropertyType.java │ │ │ │ ├── SimpleFieldPropertyLoader.java │ │ │ │ ├── AnnotationPropertyLoader.java │ │ │ │ └── Property.java │ │ │ │ ├── ForgerFactory.java │ │ │ │ └── Forger.java │ │ │ └── resources │ │ │ └── log4j.xml │ └── README.md └── webmagic-avalon-common │ └── src │ ├── main │ ├── resources │ │ ├── config │ │ │ ├── freemarker.properties │ │ │ ├── log │ │ │ │ └── log4j.xml │ │ │ ├── spring │ │ │ │ ├── applicationContext-tx.xml │ │ │ │ ├── applicationContext-service.xml │ │ │ │ ├── applicationContext-component.xml │ │ │ │ ├── applicationContext-myBatis.xml │ │ │ │ └── applicationContext-webmvc.xml │ │ │ └── mapper │ │ │ │ └── DynamicClass.xml │ │ └── sql │ │ │ ├── h2 │ │ │ └── schema.sql │ │ │ └── mysql │ │ │ └── schema.sql │ └── java │ │ └── us │ │ └── codecraft │ │ └── webmagic │ │ ├── dao │ │ └── DynamicClassDao.java │ │ ├── service │ │ ├── DynamicClassService.java │ │ └── impl │ │ │ └── DynamicClassServiceImpl.java │ │ ├── exception │ │ └── DynamicClassCompileException.java │ │ └── model │ │ └── DynamicClass.java │ └── test │ └── java │ └── us │ └── codecraft │ └── webmagic │ ├── AbstractTest.java │ ├── dao │ └── DynamicClassDaoTest.java │ └── Foo.java ├── webmagic-extension ├── README.md ├── src │ ├── main │ │ ├── java │ │ │ └── us │ │ │ │ └── codecraft │ │ │ │ └── webmagic │ │ │ │ ├── model │ │ │ │ ├── annotation │ │ │ │ │ ├── package.html │ │ │ │ │ ├── Formatter.java │ │ │ │ │ ├── TargetUrl.java │ │ │ │ │ ├── HelpUrl.java │ │ │ │ │ └── ExtractByUrl.java │ │ │ │ ├── package.html │ │ │ │ ├── formatter │ │ │ │ │ ├── ObjectFormatter.java │ │ │ │ │ ├── DateFormatter.java │ │ │ │ │ └── ObjectFormatters.java │ │ │ │ ├── AfterExtractor.java │ │ │ │ ├── HasKey.java │ │ │ │ ├── ConsolePageModelPipeline.java │ │ │ │ ├── PageMapper.java │ │ │ │ ├── Extractor.java │ │ │ │ ├── FieldExtractor.java │ │ │ │ └── PageModelCollectorPipeline.java │ │ │ │ ├── configurable │ │ │ │ └── ExpressionType.java │ │ │ │ ├── 
pipeline │ │ │ │ ├── PageModelPipeline.java │ │ │ │ ├── CollectorPageModelPipeline.java │ │ │ │ └── JsonFilePipeline.java │ │ │ │ ├── handler │ │ │ │ ├── PatternProcessor.java │ │ │ │ ├── SubPageProcessor.java │ │ │ │ ├── RequestMatcher.java │ │ │ │ ├── SubPipeline.java │ │ │ │ ├── PatternRequestMatcher.java │ │ │ │ └── CompositePipeline.java │ │ │ │ ├── monitor │ │ │ │ └── SpiderStatusMXBean.java │ │ │ │ ├── utils │ │ │ │ ├── ClassUtils.java │ │ │ │ ├── MultiKeyMapBase.java │ │ │ │ └── IPUtils.java │ │ │ │ ├── example │ │ │ │ ├── MonitorExample.java │ │ │ │ ├── GithubRepoPageMapper.java │ │ │ │ └── AppStore.java │ │ │ │ └── MultiPageModel.java │ │ └── resources │ │ │ ├── crawl.js │ │ │ ├── log4j.xml │ │ │ └── spider-config-draft.xml │ └── test │ │ ├── java │ │ └── us │ │ │ └── codecraft │ │ │ └── webmagic │ │ │ ├── monitor │ │ │ ├── CustomSpiderStatusMXBean.java │ │ │ ├── CustomSpiderStatus.java │ │ │ └── SpiderMonitorTest.java │ │ │ ├── MockPipeline.java │ │ │ ├── model │ │ │ ├── BaseRepo.java │ │ │ ├── MockModel.java │ │ │ ├── GithubRepoTest.java │ │ │ └── GithubRepo.java │ │ │ ├── utils │ │ │ └── IPUtilsTest.java │ │ │ ├── MockPageModelPipeline.java │ │ │ ├── downloader │ │ │ └── FileCacheTest.java │ │ │ ├── formatter │ │ │ └── DateFormatterTest.java │ │ │ ├── scheduler │ │ │ └── RedisSchedulerTest.java │ │ │ └── processor │ │ │ └── GithubRepoProcessor.java │ │ └── resources │ │ ├── log4j.xml │ │ └── html │ │ └── mock-webmagic.html ├── pom.xml ├── pom.xml.versionsBackup └── pom.xml.releaseBackup ├── webmagic-core ├── README.md └── src │ ├── main │ ├── java │ │ └── us │ │ │ └── codecraft │ │ │ └── webmagic │ │ │ ├── utils │ │ │ ├── package.html │ │ │ ├── Experimental.java │ │ │ ├── NumberUtils.java │ │ │ ├── HttpConstant.java │ │ │ └── FilePersistentBase.java │ │ │ ├── scheduler │ │ │ ├── component │ │ │ │ ├── package.html │ │ │ │ ├── DuplicateRemover.java │ │ │ │ └── HashSetDuplicateRemover.java │ │ │ ├── package.html │ │ │ ├── MonitorableScheduler.java │ │ │ 
├── Scheduler.java │ │ │ └── QueueScheduler.java │ │ │ ├── processor │ │ │ ├── package.html │ │ │ ├── PageProcessor.java │ │ │ ├── SimplePageProcessor.java │ │ │ └── example │ │ │ │ ├── OschinaBlogPageProcessor.java │ │ │ │ └── GithubRepoPageProcessor.java │ │ │ ├── pipeline │ │ │ ├── package.html │ │ │ ├── CollectorPipeline.java │ │ │ ├── Pipeline.java │ │ │ ├── ResultItemsCollectorPipeline.java │ │ │ └── ConsolePipeline.java │ │ │ ├── package.html │ │ │ ├── downloader │ │ │ ├── package.html │ │ │ └── Downloader.java │ │ │ ├── selector │ │ │ ├── package.html │ │ │ ├── RegexResult.java │ │ │ ├── Selector.java │ │ │ ├── ElementSelector.java │ │ │ ├── ReplaceSelector.java │ │ │ ├── OrSelector.java │ │ │ ├── Selectors.java │ │ │ ├── BaseElementSelector.java │ │ │ └── XpathSelector.java │ │ │ ├── proxy │ │ │ └── ProxyPool.java │ │ │ ├── SpiderListener.java │ │ │ └── Task.java │ └── resources │ │ └── log4j.xml │ └── test │ ├── java │ └── us │ │ └── codecraft │ │ └── webmagic │ │ ├── ResultItemsTest.java │ │ ├── selector │ │ ├── RegexSelectorTest.java │ │ ├── JsonTest.java │ │ ├── SelectorTest.java │ │ └── ExtractorsTest.java │ │ ├── downloader │ │ └── MockGithubDownloader.java │ │ ├── example │ │ └── GithubRepoPageProcessorTest.java │ │ └── pipeline │ │ └── FilePipelineTest.java │ └── resources │ └── log4j.xml ├── webmagic-saxon ├── README.md └── pom.xml ├── webmagic-selenium ├── README.md ├── config.ini ├── src │ └── test │ │ └── java │ │ └── us │ │ └── codecraft │ │ └── webmagic │ │ ├── downloader │ │ ├── selenium │ │ │ └── WebDriverPoolTest.java │ │ └── SeleniumTest.java │ │ └── samples │ │ └── GooglePlayProcessor.java └── pom.xml ├── .gitignore ├── make.sh ├── webmagic-scripts ├── src │ ├── main │ │ ├── resources │ │ │ ├── python │ │ │ │ ├── oschina.py │ │ │ │ └── defines.py │ │ │ ├── ruby │ │ │ │ ├── oschina.rb │ │ │ │ ├── defines.rb │ │ │ │ └── github.rb │ │ │ ├── js │ │ │ │ ├── defines.js │ │ │ │ ├── oschina.js │ │ │ │ └── github.js │ │ │ └── log4j.xml │ │ └── 
java │ │ │ └── us │ │ │ └── codecraft │ │ │ └── webmagic │ │ │ └── scripts │ │ │ ├── Language.java │ │ │ └── ScriptEnginePool.java │ └── test │ │ ├── resouces │ │ └── log4j.xml │ │ └── java │ │ └── us │ │ └── codecraft │ │ └── webmagic │ │ └── scripts │ │ └── ScriptProcessorTest.java └── deploy.sh ├── webmagic-avalon.md └── release.properties /.travis.yml: -------------------------------------------------------------------------------- 1 | language: java 2 | jdk: 3 | - oraclejdk7 4 | -------------------------------------------------------------------------------- /assets/logo.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bcqtt/webmagic/master/assets/logo.jpg -------------------------------------------------------------------------------- /assets/image1.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bcqtt/webmagic/master/assets/image1.pdf -------------------------------------------------------------------------------- /assets/logo3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bcqtt/webmagic/master/assets/logo3.png -------------------------------------------------------------------------------- /assets/logo4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bcqtt/webmagic/master/assets/logo4.png -------------------------------------------------------------------------------- /lib/guava-15.0.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bcqtt/webmagic/master/lib/guava-15.0.jar -------------------------------------------------------------------------------- /lib/junit-4.11.jar: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/bcqtt/webmagic/master/lib/junit-4.11.jar -------------------------------------------------------------------------------- /assets/webmagic.psd: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bcqtt/webmagic/master/assets/webmagic.psd -------------------------------------------------------------------------------- /lib/jedis-2.0.0.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bcqtt/webmagic/master/lib/jedis-2.0.0.jar -------------------------------------------------------------------------------- /lib/jsoup-1.7.2.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bcqtt/webmagic/master/lib/jsoup-1.7.2.jar -------------------------------------------------------------------------------- /lib/log4j-1.2.17.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bcqtt/webmagic/master/lib/log4j-1.2.17.jar -------------------------------------------------------------------------------- /lib/xsoup-0.2.4.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bcqtt/webmagic/master/lib/xsoup-0.2.4.jar -------------------------------------------------------------------------------- /webmagic-samples/README.md: -------------------------------------------------------------------------------- 1 | webmagic-samples 2 | ------- 3 | webmagic的一些示例。包括抓取常见 博客、信息类网站等。 -------------------------------------------------------------------------------- /assets/logo-simple.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bcqtt/webmagic/master/assets/logo-simple.jpg -------------------------------------------------------------------------------- 
/lib/fastjson-1.1.37.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bcqtt/webmagic/master/lib/fastjson-1.1.37.jar -------------------------------------------------------------------------------- /lib/httpcore-4.3.2.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bcqtt/webmagic/master/lib/httpcore-4.3.2.jar -------------------------------------------------------------------------------- /lib/json-path-0.8.1.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bcqtt/webmagic/master/lib/json-path-0.8.1.jar -------------------------------------------------------------------------------- /lib/slf4j-api-1.7.6.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bcqtt/webmagic/master/lib/slf4j-api-1.7.6.jar -------------------------------------------------------------------------------- /lib/commons-codec-1.6.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bcqtt/webmagic/master/lib/commons-codec-1.6.jar -------------------------------------------------------------------------------- /lib/commons-io-1.3.2.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bcqtt/webmagic/master/lib/commons-io-1.3.2.jar -------------------------------------------------------------------------------- /lib/commons-lang-2.6.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bcqtt/webmagic/master/lib/commons-lang-2.6.jar -------------------------------------------------------------------------------- /lib/commons-lang3-3.1.jar: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/bcqtt/webmagic/master/lib/commons-lang3-3.1.jar -------------------------------------------------------------------------------- /lib/hamcrest-core-1.3.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bcqtt/webmagic/master/lib/hamcrest-core-1.3.jar -------------------------------------------------------------------------------- /lib/httpclient-4.3.3.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bcqtt/webmagic/master/lib/httpclient-4.3.3.jar -------------------------------------------------------------------------------- /lib/json-smart-1.1.1.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bcqtt/webmagic/master/lib/json-smart-1.1.1.jar -------------------------------------------------------------------------------- /webmagic-avalon/webmagic-worker/README.md: -------------------------------------------------------------------------------- 1 | WebMagic-Worker 2 | ===== 3 | Worker is the spider container. 
-------------------------------------------------------------------------------- /webmagic-extension/README.md: -------------------------------------------------------------------------------- 1 | webmagic-extension 2 | ------- 3 | webmagic的扩展模块。包括注解格式定义爬虫、JSON、分布式等支持。 -------------------------------------------------------------------------------- /lib/assertj-core-1.5.0.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bcqtt/webmagic/master/lib/assertj-core-1.5.0.jar -------------------------------------------------------------------------------- /lib/commons-pool-1.5.5.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bcqtt/webmagic/master/lib/commons-pool-1.5.5.jar -------------------------------------------------------------------------------- /lib/slf4j-log4j12-1.7.6.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bcqtt/webmagic/master/lib/slf4j-log4j12-1.7.6.jar -------------------------------------------------------------------------------- /lib/webmagic-core-0.5.2.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bcqtt/webmagic/master/lib/webmagic-core-0.5.2.jar -------------------------------------------------------------------------------- /webmagic-avalon/webmagic-admin/README.md: -------------------------------------------------------------------------------- 1 | WebMagic-Admin 2 | ===== 3 | Admin is the control web of workers. 
-------------------------------------------------------------------------------- /lib/commons-logging-1.1.3.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bcqtt/webmagic/master/lib/commons-logging-1.1.3.jar -------------------------------------------------------------------------------- /assets/logo2.graffle/image1.tiff: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bcqtt/webmagic/master/assets/logo2.graffle/image1.tiff -------------------------------------------------------------------------------- /assets/logo3.graffle/image1.tiff: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bcqtt/webmagic/master/assets/logo3.graffle/image1.tiff -------------------------------------------------------------------------------- /assets/logo3.graffle/image2.tiff: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bcqtt/webmagic/master/assets/logo3.graffle/image2.tiff -------------------------------------------------------------------------------- /assets/logo3.graffle/image4.tiff: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bcqtt/webmagic/master/assets/logo3.graffle/image4.tiff -------------------------------------------------------------------------------- /assets/logo3.graffle/image5.tiff: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bcqtt/webmagic/master/assets/logo3.graffle/image5.tiff -------------------------------------------------------------------------------- /assets/webmagic-create-spider.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bcqtt/webmagic/master/assets/webmagic-create-spider.png 
-------------------------------------------------------------------------------- /assets/webmagic-spider-manage.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bcqtt/webmagic/master/assets/webmagic-spider-manage.png -------------------------------------------------------------------------------- /lib/commons-collections-3.2.1.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bcqtt/webmagic/master/lib/commons-collections-3.2.1.jar -------------------------------------------------------------------------------- /lib/webmagic-extension-0.5.2.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bcqtt/webmagic/master/lib/webmagic-extension-0.5.2.jar -------------------------------------------------------------------------------- /webmagic-core/README.md: -------------------------------------------------------------------------------- 1 | webmagic-core 2 | ------- 3 | webmagic核心部分。只包含爬虫基本模块和基本抽取器。webmagic-core的目标是成为网页爬虫的一个教科书般的实现。 -------------------------------------------------------------------------------- /webmagic-saxon/README.md: -------------------------------------------------------------------------------- 1 | webmagic-extension 2 | ------- 3 | webmagic的扩展模块,依赖Saxon进行xpath2.0解析支持。Saxon依赖包太大,不作为默认模块引入。 -------------------------------------------------------------------------------- /webmagic-selenium/README.md: -------------------------------------------------------------------------------- 1 | webmagic-extension 2 | ------- 3 | webmagic与selenium的集成,用于爬取ajax页面。selenium太重,所以单独抽出成一个包了。 -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | target 2 | *.iml 3 | out/ 4 | .idea 5 | .classpath 6 | .project 7 | .settings/ 8 | bin/ 9 | 
.myeclipse 10 | -------------------------------------------------------------------------------- /webmagic-core/src/main/java/us/codecraft/webmagic/utils/package.html: -------------------------------------------------------------------------------- 1 | 2 |
3 | Static utils of webmagic. 4 | 5 | 6 | -------------------------------------------------------------------------------- /webmagic-avalon/README.md: -------------------------------------------------------------------------------- 1 | WebMagic-Avalon 2 | ======== 3 | > Spiders Manage Web 4 | 5 | see [#issue43](https://github.com/code4craft/webmagic/issues/43) -------------------------------------------------------------------------------- /make.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | mvn clean dependency:copy-dependencies -DoutputDirectory=target/lib 3 | rsync -avz --delete ./webmagic-samples/target/lib/ ./lib/ 4 | -------------------------------------------------------------------------------- /webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/component/package.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | Component of scheduler. 4 | 5 | 6 | -------------------------------------------------------------------------------- /webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/package.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | Scheduler is the part of url management. 4 | 5 | 6 | -------------------------------------------------------------------------------- /webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/package.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | Annotations for defining a extractor. 
4 | 5 | 6 | -------------------------------------------------------------------------------- /webmagic-avalon/webmagic-admin/src/main/webapp/static/favicon.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bcqtt/webmagic/master/webmagic-avalon/webmagic-admin/src/main/webapp/static/favicon.jpg -------------------------------------------------------------------------------- /webmagic-core/src/main/java/us/codecraft/webmagic/processor/package.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | PageProcessor custom part of a crawler for specific site. 4 | 5 | 6 | -------------------------------------------------------------------------------- /webmagic-extension/src/main/java/us/codecraft/webmagic/model/package.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | Page model and annotations used to customize a crawler. 4 | 5 | 6 | -------------------------------------------------------------------------------- /webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/package.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | Pipeline is the persistent and offline process part of crawler. 4 | 5 | 6 | -------------------------------------------------------------------------------- /webmagic-core/src/main/java/us/codecraft/webmagic/package.html: -------------------------------------------------------------------------------- 1 | 2 | 3 |
13 | * A PatternHandler is in charge of both page extraction and data processing by implementing
14 | * its two abstract methods.
15 | */
16 | public abstract class PatternRequestMatcher implements RequestMatcher {
17 |
18 | /**
19 | * match pattern. only matched page should be handled.
20 | */
21 | protected String pattern;
22 |
23 | private Pattern patternCompiled;
24 |
25 | /**
26 | * @param pattern url pattern to handle
27 | */
28 | public PatternRequestMatcher(String pattern) {
29 | this.pattern = pattern;
30 | this.patternCompiled = Pattern.compile(pattern);
31 | }
32 |
33 | @Override
34 | public boolean match(Request request) {
35 | return patternCompiled.matcher(request.getUrl()).matches();
36 | }
37 | }
38 |
--------------------------------------------------------------------------------
/webmagic-extension/pom.xml:
--------------------------------------------------------------------------------
1 |
2 |
12 | */
13 | public class HuxiuProcessor implements PageProcessor {
14 | @Override
15 | public void process(Page page) {
16 | List
10 | * @since 0.3.2
11 | */
12 | @TargetUrl("https://github.com/\\w+/\\w+")
13 | @HelpUrl({"https://github.com/\\w+\\?tab=repositories", "https://github.com/\\w+", "https://github.com/explore/*"})
14 | public class GithubRepo extends BaseRepo{
15 |
16 | @ExtractBy("//ul[@class='pagehead-actions']/li[2]//a[@class='social-count']/text()")
17 | private int fork;
18 |
19 | public static void main(String[] args) {
20 | OOSpider.create(Site.me().setSleepTime(100)
21 | , new ConsolePageModelPipeline(), GithubRepo.class)
22 | .addUrl("https://github.com/code4craft").thread(10).run();
23 | }
24 |
25 | public int getStar() {
26 | return star;
27 | }
28 |
29 | public int getFork() {
30 | return fork;
31 | }
32 | }
33 |
--------------------------------------------------------------------------------
/webmagic-samples/pom.xml:
--------------------------------------------------------------------------------
1 |
2 |
12 | */
13 | @TargetUrl("http://www.oschina.net/question/\\d+_\\d+*")
14 | @HelpUrl("http://www.oschina.net/question/*")
15 | @ExtractBy(value = "//ul[@class='list']/li[@class='Answer']", multi = true)
16 | public class OschinaAnswer implements AfterExtractor{
17 |
18 | @ExtractBy("//img/@title")
19 | private String user;
20 |
21 | @ExtractBy("//div[@class='detail']")
22 | private String content;
23 |
24 | public static void main(String[] args) {
25 | OOSpider.create(Site.me().addStartUrl("http://www.oschina.net/question/567527_120597"), OschinaAnswer.class).run();
26 | }
27 |
28 | @Override
29 | public void afterProcess(Page page) {
30 |
31 | }
32 | }
33 |
--------------------------------------------------------------------------------
/webmagic-avalon/webmagic-avalon-common/src/main/java/us/codecraft/webmagic/model/DynamicClass.java:
--------------------------------------------------------------------------------
1 | package us.codecraft.webmagic.model;
2 |
3 | import java.util.Date;
4 |
/**
 * Plain bean holding a class name, its source code, and add/update timestamps.
 * All properties start as {@code null}; accessors store and return references as-is.
 *
 * @author code4crafter@gmail.com
 */
public class DynamicClass {

    /** Name of the class. */
    private String className;

    /** Source text of the class. */
    private String sourceCode;

    /** Time the record was added. */
    private Date addTime;

    /** Time the record was last updated. */
    private Date updateTime;

    // ---- className -------------------------------------------------------

    /** @return the stored class name, or {@code null} if never set */
    public String getClassName() {
        return this.className;
    }

    /** @param className class name to store */
    public void setClassName(String className) {
        this.className = className;
    }

    // ---- sourceCode ------------------------------------------------------

    /** @return the stored source code, or {@code null} if never set */
    public String getSourceCode() {
        return this.sourceCode;
    }

    /** @param sourceCode source text to store */
    public void setSourceCode(String sourceCode) {
        this.sourceCode = sourceCode;
    }

    // ---- addTime ---------------------------------------------------------

    /** @return the same {@link Date} instance that was set (no copy), or {@code null} */
    public Date getAddTime() {
        return this.addTime;
    }

    /** @param addTime creation time to store (kept by reference) */
    public void setAddTime(Date addTime) {
        this.addTime = addTime;
    }

    // ---- updateTime ------------------------------------------------------

    /** @return the same {@link Date} instance that was set (no copy), or {@code null} */
    public Date getUpdateTime() {
        return this.updateTime;
    }

    /** @param updateTime modification time to store (kept by reference) */
    public void setUpdateTime(Date updateTime) {
        this.updateTime = updateTime;
    }
}
50 |
--------------------------------------------------------------------------------
/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/IteyeBlogProcessor.java:
--------------------------------------------------------------------------------
1 | package us.codecraft.webmagic.samples;
2 |
3 | import us.codecraft.webmagic.Page;
4 | import us.codecraft.webmagic.Site;
5 | import us.codecraft.webmagic.Spider;
6 | import us.codecraft.webmagic.processor.PageProcessor;
7 |
8 | /**
9 | * @author code4crafter@gmail.com
10 | */
11 | public class IteyeBlogProcessor implements PageProcessor {
12 |
13 | private Site site;
14 |
15 | @Override
16 | public void process(Page page) {
17 | page.addTargetRequests(page.getHtml().links().regex(".*yanghaoli\\.iteye\\.com/blog/\\d+").all());
18 | page.putField("title",page.getHtml().xpath("//title").toString());
19 | page.putField("content",page.getHtml().smartContent().toString());
20 | }
21 |
22 | @Override
23 | public Site getSite() {
24 | if (site == null) {
25 | site = Site.me().setDomain("yanghaoli.iteye.com").addStartUrl("http://yanghaoli.iteye.com/");
26 | }
27 | return site;
28 | }
29 |
30 | public static void main(String[] args) {
31 | Spider.create(new IteyeBlogProcessor()).thread(5).run();
32 | }
33 | }
34 |
--------------------------------------------------------------------------------
/webmagic-extension/src/main/java/us/codecraft/webmagic/model/Extractor.java:
--------------------------------------------------------------------------------
1 | package us.codecraft.webmagic.model;
2 |
3 | import us.codecraft.webmagic.selector.Selector;
4 |
5 | /**
6 | * The object contains 'ExtractBy' information.
7 | * @author code4crafter@gmail.com
8 | * @since 0.2.0
9 | */
10 | class Extractor {
11 |
12 | protected Selector selector;
13 |
14 | protected final Source source;
15 |
16 | protected final boolean notNull;
17 |
18 | protected final boolean multi;
19 |
20 | static enum Source {Html, Url, RawHtml}
21 |
22 | public Extractor(Selector selector, Source source, boolean notNull, boolean multi) {
23 | this.selector = selector;
24 | this.source = source;
25 | this.notNull = notNull;
26 | this.multi = multi;
27 | }
28 |
29 | Selector getSelector() {
30 | return selector;
31 | }
32 |
33 | Source getSource() {
34 | return source;
35 | }
36 |
37 | boolean isNotNull() {
38 | return notNull;
39 | }
40 |
41 | boolean isMulti() {
42 | return multi;
43 | }
44 |
45 | void setSelector(Selector selector) {
46 | this.selector = selector;
47 | }
48 | }
49 |
--------------------------------------------------------------------------------
/webmagic-core/src/test/java/us/codecraft/webmagic/downloader/MockGithubDownloader.java:
--------------------------------------------------------------------------------
1 | package us.codecraft.webmagic.downloader;
2 |
3 | import org.apache.commons.io.IOUtils;
4 | import us.codecraft.webmagic.Page;
5 | import us.codecraft.webmagic.Request;
6 | import us.codecraft.webmagic.Task;
7 | import us.codecraft.webmagic.selector.PlainText;
8 |
9 | import java.io.IOException;
10 | import java.io.InputStream;
11 |
12 | /**
13 | * @author code4crafter@gmail.com
14 | */
15 | public class MockGithubDownloader implements Downloader {
16 |
17 | @Override
18 | public Page download(Request request, Task task) {
19 | Page page = new Page();
20 | InputStream resourceAsStream = this.getClass().getResourceAsStream("/html/mock-github.html");
21 | try {
22 | page.setRawText(IOUtils.toString(resourceAsStream));
23 | } catch (IOException e) {
24 | e.printStackTrace();
25 | }
26 | page.setRequest(new Request("https://github.com/code4craft/webmagic"));
27 | page.setUrl(new PlainText("https://github.com/code4craft/webmagic"));
28 | return page;
29 | }
30 |
31 | @Override
32 | public void setThread(int threadNum) {
33 | }
34 | }
35 |
--------------------------------------------------------------------------------
/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/OschinaPageProcesser.java:
--------------------------------------------------------------------------------
1 | package us.codecraft.webmagic.samples;
2 |
3 | import us.codecraft.webmagic.Site;
4 | import us.codecraft.webmagic.Page;
5 | import us.codecraft.webmagic.processor.PageProcessor;
6 |
7 | import java.util.List;
8 |
9 | /**
10 | * @author code4crafter@gmail.com
11 | */
12 | public class OschinaPageProcesser implements PageProcessor {
13 |
14 | @Override
15 | public void process(Page page) {
16 | List
17 |
19 |
10 | * Date: 13-8-2
11 | * Time: 7:52 AM
12 | */
13 | @TargetUrl("http://*.iteye.com/blog/*")
14 | public class IteyeBlog implements Blog{
15 |
16 | @ExtractBy("//title")
17 | private String title;
18 |
19 | @ExtractBy(value = "div#blog_content",type = ExtractBy.Type.Css)
20 | private String content;
21 |
22 | @Override
23 | public String toString() {
24 | return "IteyeBlog{" +
25 | "title='" + title + '\'' +
26 | ", content='" + content + '\'' +
27 | '}';
28 | }
29 |
30 | public static void main(String[] args) {
31 | OOSpider.create(Site.me().addStartUrl("http://flashsword20.iteye.com/blog"), IteyeBlog.class).run();
32 | }
33 |
34 | public String getTitle() {
35 | return title;
36 | }
37 |
38 | public String getContent() {
39 | return content;
40 | }
41 | }
42 |
--------------------------------------------------------------------------------
/webmagic-avalon/forger/src/main/java/us/codecraft/forger/property/format/Formatter.java:
--------------------------------------------------------------------------------
1 | package us.codecraft.forger.property.format;
2 |
3 | import java.lang.annotation.ElementType;
4 | import java.lang.annotation.Retention;
5 | import java.lang.annotation.Target;
6 |
7 | /**
8 | * Define how the result string is convert to an object for field.
9 | *
10 | * @author code4crafter@gmail.com
11 | * @since 0.3.2
12 | */
13 | @Retention(java.lang.annotation.RetentionPolicy.RUNTIME)
14 | @Target({ElementType.FIELD})
15 | public @interface Formatter {
16 |
17 | /**
18 | * Set formatter params.
19 | *
20 | * @return formatter params
21 | */
22 | String[] value();
23 |
24 | /**
25 | * Specific the class of field of class of elements in collection for field.
26 | * It is not necessary to be set because we can detect the class by class of field,
27 | * unless you use a collection as a field.
28 | *
29 | * @return the class of field
30 | */
31 | Class subClazz() default String.class;
32 |
33 | /**
34 | * If there are more than one formatter for a class, just specify the implement.
35 | * @return implement
36 | */
37 | Class extends TypeFormatter> formatter() default TypeFormatter.class;
38 |
39 | }
40 |
--------------------------------------------------------------------------------
/webmagic-core/src/test/java/us/codecraft/webmagic/example/GithubRepoPageProcessorTest.java:
--------------------------------------------------------------------------------
1 | package us.codecraft.webmagic.example;
2 |
3 | import org.junit.Test;
4 | import us.codecraft.webmagic.ResultItems;
5 | import us.codecraft.webmagic.Spider;
6 | import us.codecraft.webmagic.Task;
7 | import us.codecraft.webmagic.downloader.MockGithubDownloader;
8 | import us.codecraft.webmagic.pipeline.Pipeline;
9 | import us.codecraft.webmagic.processor.example.GithubRepoPageProcessor;
10 |
11 | import static org.assertj.core.api.Assertions.assertThat;
12 |
13 | /**
14 | * @author code4crafter@gmail.com
15 | * Date: 16/1/19
16 | * Time: 上午7:27
17 | */
18 | public class GithubRepoPageProcessorTest {
19 |
20 | @Test
21 | public void test_github() throws Exception {
22 | Spider.create(new GithubRepoPageProcessor()).addPipeline(new Pipeline() {
23 | @Override
24 | public void process(ResultItems resultItems, Task task) {
25 | assertThat(((String) resultItems.get("name")).trim()).isEqualTo("webmagic");
26 | assertThat(((String) resultItems.get("author")).trim()).isEqualTo("code4craft");
27 | }
28 | }).setDownloader(new MockGithubDownloader()).test("https://github.com/code4craft/webmagic");
29 | }
30 | }
31 |
--------------------------------------------------------------------------------
/webmagic-extension/src/test/java/us/codecraft/webmagic/scheduler/RedisSchedulerTest.java:
--------------------------------------------------------------------------------
1 | package us.codecraft.webmagic.scheduler;
2 |
3 | import org.junit.Before;
4 | import org.junit.Ignore;
5 | import org.junit.Test;
6 | import us.codecraft.webmagic.Request;
7 | import us.codecraft.webmagic.Site;
8 | import us.codecraft.webmagic.Task;
9 |
10 | /**
11 | * @author code4crafter@gmail.com
12 | */
13 | public class RedisSchedulerTest {
14 |
15 | private RedisScheduler redisScheduler;
16 |
17 | @Before
18 | public void setUp() {
19 | redisScheduler = new RedisScheduler("localhost");
20 | }
21 |
22 | @Ignore("environment depended")
23 | @Test
24 | public void test() {
25 | Task task = new Task() {
26 | @Override
27 | public String getUUID() {
28 | return "1";
29 | }
30 |
31 | @Override
32 | public Site getSite() {
33 | return null;
34 | }
35 | };
36 | Request request = new Request("http://www.ibm.com/developerworks/cn/java/j-javadev2-22/");
37 | request.putExtra("1","2");
38 | redisScheduler.push(request, task);
39 | Request poll = redisScheduler.poll(task);
40 | System.out.println(poll);
41 |
42 | }
43 | }
44 |
--------------------------------------------------------------------------------
/webmagic-extension/src/main/java/us/codecraft/webmagic/MultiPageModel.java:
--------------------------------------------------------------------------------
1 | package us.codecraft.webmagic;
2 |
3 | import us.codecraft.webmagic.utils.Experimental;
4 |
5 | import java.util.Collection;
6 |
7 | /**
8 | * Extract an object that spans more than one page, such as news and articles.
9 | *
10 | * @author code4crafter@gmail.com
11 | * @since 0.2.0
12 | */
13 | @Experimental
14 | public interface MultiPageModel {
15 |
16 | /**
17 | * Page key is the identifier for the object.
18 | *
19 | * @return page key
20 | */
21 | public String getPageKey();
22 |
23 | /**
24 | * page is the identifier of a page in pages for one object.
25 | *
26 | * @return page
27 | */
28 | public String getPage();
29 |
30 | /**
31 | * other pages to be extracted.
32 | * It is used to judge whether an object contains more than one page, and whether the pages of the object are all extracted.
33 | *
34 | * @return other pages
35 | */
36 | public Collection
14 | * Date: 13-8-13
15 | * Time: 10:13 AM
16 | */
17 | @TargetUrl("http://*.alpha.dp/*")
18 | public class DianpingFtlDataScanner implements AfterExtractor {
19 |
20 | @ExtractBy(value = "(DP\\.data\\(\\{.*\\}\\));", type = ExtractBy.Type.Regex, notNull = true, multi = true)
21 | private List
13 | * Store urls to fetch in LinkedBlockingQueue and remove duplicate urls by HashMap.
14 | *
15 | * @author code4crafter@gmail.com
16 | * @since 0.1.0
17 | */
18 | @ThreadSafe
19 | public class QueueScheduler extends DuplicateRemovedScheduler implements MonitorableScheduler {
20 |
21 | private BlockingQueue
13 | * Date: 13-4-21
14 | * Time: 1:48 PM
15 | */
16 | public class F58PageProcesser implements PageProcessor {
17 |
18 | @Override
19 | public void process(Page page) {
20 | List
13 | * @since 0.3.2
14 | */
15 | @Retention(java.lang.annotation.RetentionPolicy.RUNTIME)
16 | @Target({ElementType.FIELD})
17 | public @interface Formatter {
18 |
19 | /**
20 | * Set formatter params.
21 | *
22 | * @return formatter params
23 | */
24 | String[] value() default "";
25 |
26 | /**
27 | * Specific the class of field of class of elements in collection for field.
28 | * It is not necessary to be set because we can detect the class by class of field,
29 | * unless you use a collection as a field.
30 | *
31 | * @return the class of field
32 | */
33 | Class subClazz() default Void.class;
34 |
35 | /**
36 | * If there are more than one formatter for a class, just specify the implement.
37 | * @return implement
38 | */
39 | Class extends ObjectFormatter> formatter() default ObjectFormatter.class;
40 |
41 | }
42 |
--------------------------------------------------------------------------------
/webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/TargetUrl.java:
--------------------------------------------------------------------------------
1 | package us.codecraft.webmagic.model.annotation;
2 |
3 | import java.lang.annotation.ElementType;
4 | import java.lang.annotation.Retention;
5 | import java.lang.annotation.Target;
6 |
/**
 * Declares the url patterns of a class.
 * Every url matching a pattern is crawled and extracted into new objects.
 *
 * @author code4crafter@gmail.com
 * @since 0.2.0
 */
@Target(ElementType.TYPE)
@Retention(java.lang.annotation.RetentionPolicy.RUNTIME)
public @interface TargetUrl {

    /**
     * The url patterns of the class, written as regular expressions with two changes:
     * "." stands for the literal character "." instead of "any character";
     * "*" stands for any run (0-n) of characters legal in a url ([^"'#]*) instead of "any length".
     *
     * @return the url patterns for class
     */
    String[] value();

    /**
     * The region urls are extracted from; only XPath is supported.
     * When set, urls are taken from that region only instead of the entire content.
     *
     * @return the region for url extracting
     */
    String sourceRegion() default "";

}
38 |
--------------------------------------------------------------------------------
/webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/BaiduNews.java:
--------------------------------------------------------------------------------
1 | package us.codecraft.webmagic.model.samples;
2 |
3 | import us.codecraft.webmagic.Site;
4 | import us.codecraft.webmagic.model.OOSpider;
5 | import us.codecraft.webmagic.model.annotation.ExtractBy;
6 |
7 | /**
8 | * @author code4crafter@gmail.com
9 | * @date 14-4-9
10 | */
11 | public class BaiduNews {
12 |
13 | @ExtractBy("//h3[@class='c-title']/a/text()")
14 | private String name;
15 |
16 | @ExtractBy("//div[@class='c-summary']/text()")
17 | private String description;
18 |
19 | @Override
20 | public String toString() {
21 | return "BaiduNews{" +
22 | "name='" + name + '\'' +
23 | ", description='" + description + '\'' +
24 | '}';
25 | }
26 |
27 | public static void main(String[] args) {
28 | OOSpider ooSpider = OOSpider.create(Site.me().setSleepTime(0), BaiduNews.class);
29 | //single download
30 | BaiduNews baike = ooSpider.
9 | * All urls matching the pattern will be crawled but not extracted for new objects.
10 | *
11 | * @author code4crafter@gmail.com
12 | * @since 0.2.0
13 | */
@Target(ElementType.TYPE)
@Retention(java.lang.annotation.RetentionPolicy.RUNTIME)
public @interface HelpUrl {

    /**
     * The url patterns to crawl, written as regular expressions with two changes:
     * "." stands for the literal character "." instead of "any character";
     * "*" stands for any run (0-n) of characters legal in a url ([^"'#]*) instead of "any length".
     *
     * @return the url patterns for class
     */
    String[] value();

    /**
     * The region urls are extracted from; only XPath is supported.
     * When set, urls are taken from that region only instead of the entire content.
     *
     * @return the region for url extracting
     */
    String sourceRegion() default "";
}
37 |
--------------------------------------------------------------------------------
/webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/ScriptEnginePool.java:
--------------------------------------------------------------------------------
1 | package us.codecraft.webmagic.scripts;
2 |
3 | import javax.script.ScriptEngine;
4 | import javax.script.ScriptEngineManager;
5 | import java.util.concurrent.LinkedBlockingQueue;
6 | import java.util.concurrent.atomic.AtomicInteger;
7 |
8 | /**
9 | * @author code4crafter@gmail.com
10 | * @since 0.4.1
11 | */
12 | public class ScriptEnginePool {
13 |
14 | private final int size;
15 |
16 | private final AtomicInteger availableCount;
17 |
18 | private final LinkedBlockingQueue
10 | *
11 | * @author code4crafter@gmail.com
12 | * @since 0.1.0
13 | */
14 | public class ReplaceSelector implements Selector {
15 |
16 | private String regexStr;
17 |
18 | private String replacement;
19 |
20 | private Pattern regex;
21 |
22 | public ReplaceSelector(String regexStr, String replacement) {
23 | this.regexStr = regexStr;
24 | this.replacement = replacement;
25 | try {
26 | regex = Pattern.compile(regexStr);
27 | } catch (PatternSyntaxException e) {
28 | throw new IllegalArgumentException("invalid regex", e);
29 | }
30 | }
31 |
32 | @Override
33 | public String select(String text) {
34 | Matcher matcher = regex.matcher(text);
35 | return matcher.replaceAll(replacement);
36 | }
37 |
38 | @Override
39 | public List
9 | * @since 0.2.0
10 | */
public class FilePersistentBase {

    /** Base directory, stored with a trailing separator once set through {@link #setPath(String)}. */
    protected String path;

    /** Separator appended by {@link #setPath(String)}; overwritten below with the platform value. */
    public static String PATH_SEPERATOR = "/";

    static {
        String property = System.getProperties().getProperty("file.separator");
        if (property != null) {
            PATH_SEPERATOR = property;
        }
    }

    /**
     * Stores the base directory, appending {@link #PATH_SEPERATOR} unless the value already
     * ends with it or with a forward slash.
     *
     * @param path base directory, must not be null
     */
    public void setPath(String path) {
        // "/" is accepted everywhere java.io.File is used, so "dir/" needs no extra separator
        // even when the platform separator is "\".
        if (!path.endsWith(PATH_SEPERATOR) && !path.endsWith("/")) {
            path += PATH_SEPERATOR;
        }
        this.path = path;
    }

    /**
     * Returns a {@link File} for the given name after making sure its parent directory exists.
     *
     * @param fullName full file name
     * @return file handle for {@code fullName}; the file itself is not created
     */
    public File getFile(String fullName) {
        checkAndMakeParentDirecotry(fullName);
        return new File(fullName);
    }

    /**
     * Creates the parent directory of {@code fullName} (including missing ancestors) when the
     * name contains a directory part. Both {@link #PATH_SEPERATOR} and "/" are recognised as
     * separators, so forward-slash paths also work when the platform separator differs.
     *
     * @param fullName full file name
     */
    public void checkAndMakeParentDirecotry(String fullName) {
        int index = Math.max(fullName.lastIndexOf(PATH_SEPERATOR), fullName.lastIndexOf('/'));
        // index == 0 means the parent is the root, which needs no creation.
        if (index > 0) {
            File parent = new File(fullName.substring(0, index));
            if (!parent.exists()) {
                parent.mkdirs();
            }
        }
    }

    /**
     * @return the base directory as stored by {@link #setPath(String)}, or null if never set
     */
    public String getPath() {
        return path;
    }
}
51 |
--------------------------------------------------------------------------------
/webmagic-core/src/test/java/us/codecraft/webmagic/selector/ExtractorsTest.java:
--------------------------------------------------------------------------------
1 | package us.codecraft.webmagic.selector;
2 |
3 | import org.junit.Test;
4 |
5 | import static org.assertj.core.api.Assertions.assertThat;
6 | import static us.codecraft.webmagic.selector.Selectors.*;
7 |
8 | /**
9 | * @author code4crafter@gmail.com
10 | */
11 | public class ExtractorsTest {
12 |
13 | String html = "testaabbcc
15 | * Date: 13-6-9
16 | * Time: 8:02 AM
17 | */
18 | public class SinablogProcessorTest {
19 |
20 | @Ignore
21 | @Test
22 | public void test() throws IOException {
23 | SinaBlogProcessor sinaBlogProcessor = new SinaBlogProcessor();
24 | //the pipeline handles the results once a page has been crawled
25 | //by default they are stored under /data/webmagic/ftl/[domain]
26 | JsonFilePipeline pipeline = new JsonFilePipeline("/data/webmagic/");
27 | //Spider.me() is a shorthand; it simply creates a new instance
28 | //Spider.pipeline() registers a pipeline and supports call chaining
29 | //ConsolePipeline prints the results to the console
30 | //FileCacheQueueSchedular persists urls so an interrupted crawl can resume; temp files go to /data/temp/webmagic/cache
31 | //Spider.run() starts the crawl
32 | Spider.create(sinaBlogProcessor).pipeline(new FilePipeline()).pipeline(pipeline).scheduler(new FileCacheQueueScheduler("/data/temp/webmagic/cache/")).
33 | run();
34 | }
35 | }
36 |
--------------------------------------------------------------------------------
/webmagic-selenium/pom.xml:
--------------------------------------------------------------------------------
1 |
2 |
13 | *
14 | * @author bob.li.0718@gmail.com
15 | * Date: 15-7-11
16 | */
17 | public class GooglePlayProcessor implements PageProcessor {
18 |
19 | private Site site;
20 |
21 | @Override
22 | public void process(Page page) {
23 |
24 | page.putField("whole-html", page.getHtml().toString());
25 |
26 | }
27 |
28 | @Override
29 | public Site getSite() {
30 | if (null == site) {
31 | site = Site.me().setDomain("play.google.com").setSleepTime(300);
32 | }
33 | return site;
34 | }
35 |
36 | public static void main(String[] args) {
37 | Spider.create(new GooglePlayProcessor())
38 | .thread(5)
39 | .addPipeline(
40 | new FilePipeline(
41 | "/Users/Bingo/Documents/workspace/webmagic/webmagic-selenium/data/"))
42 | .setDownloader(new SeleniumDownloader())
43 | .addUrl("https://play.google.com/store/apps/details?id=com.tencent.mm")
44 | .runAsync();
45 | }
46 | }
47 |
--------------------------------------------------------------------------------
/webmagic-core/src/main/java/us/codecraft/webmagic/selector/OrSelector.java:
--------------------------------------------------------------------------------
1 | package us.codecraft.webmagic.selector;
2 |
3 | import java.util.ArrayList;
4 | import java.util.List;
5 |
6 | /**
7 | * All extractors will do extracting separately,
8 | * and the results of extractors will be combined as the final result.
9 | * @author code4crafter@gmail.com
10 | * @since 0.2.0
11 | */
12 | public class OrSelector implements Selector {
13 |
14 | private List
10 | * Date: 13-5-20
11 | * Time: 5:31 PM
12 | */
13 | public class KaichibaProcessor implements PageProcessor {
14 | @Override
15 | public void process(Page page) {
16 | //http://progressdaily.diandian.com/post/2013-01-24/40046867275
17 | int i = Integer.valueOf(page.getUrl().regex("shop/(\\d+)").toString()) + 1;
18 | page.addTargetRequest("http://kaichiba.com/shop/" + i);
19 | page.putField("title",page.getHtml().xpath("//Title"));
20 | page.putField("items", page.getHtml().xpath("//li[@class=\"foodTitle\"]").replace("^\\s+", "").replace("\\s+$", "").replace(".*?", ""));
21 | }
22 |
23 | @Override
24 | public Site getSite() {
25 | return Site.me().setDomain("kaichiba.com").addStartUrl("http://kaichiba.com/shop/41725781").setCharset("utf-8").
26 | setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31");
27 | }
28 |
29 | public static void main(String[] args) {
30 | Spider.create(new KaichibaProcessor()).run();
31 | }
32 | }
33 |
--------------------------------------------------------------------------------
/webmagic-extension/src/main/java/us/codecraft/webmagic/example/GithubRepoPageMapper.java:
--------------------------------------------------------------------------------
1 | package us.codecraft.webmagic.example;
2 |
3 | import us.codecraft.webmagic.Page;
4 | import us.codecraft.webmagic.Site;
5 | import us.codecraft.webmagic.Spider;
6 | import us.codecraft.webmagic.model.PageMapper;
7 | import us.codecraft.webmagic.processor.PageProcessor;
8 |
9 | /**
10 | * @author code4crafter@gmail.com
11 | * @since 0.3.2
12 | */
13 | public class GithubRepoPageMapper implements PageProcessor {
14 |
15 | private Site site = Site.me().setRetryTimes(3).setSleepTime(0);
16 |
17 | private PageMapper
9 | *
10 | * @author code4crafter@gmail.com
11 | * @since 0.2.0
12 | */
@Retention(java.lang.annotation.RetentionPolicy.RUNTIME)
@Target({ElementType.FIELD})
public @interface ExtractByUrl {

    /**
     * Extractor expression, only regex can be used.
     *
     * @return extractor expression
     */
    String value() default "";

    /**
     * Define whether the field can be null.
     * If set to 'true' and the extractor gets no result, the entire class will be discarded.
     *
     * @return whether the field can be null
     */
    boolean notNull() default false;

    /**
     * Define whether the extractor returns more than one result.
     * When set to 'true', the extractor returns a list of strings (so you should define the field as List).
     *
     * @deprecated since 0.4.2. This option is determined automatically by the class of field.
     * @return whether the extractor return more than one result
     */
    @Deprecated
    boolean multi() default false;

}
43 |
--------------------------------------------------------------------------------
/webmagic-core/src/main/java/us/codecraft/webmagic/processor/SimplePageProcessor.java:
--------------------------------------------------------------------------------
1 | package us.codecraft.webmagic.processor;
2 |
3 | import us.codecraft.webmagic.Page;
4 | import us.codecraft.webmagic.Site;
5 | import us.codecraft.webmagic.utils.UrlUtils;
6 |
7 | import java.util.List;
8 |
9 | /**
10 | * A simple PageProcessor.
11 | *
12 | * @author code4crafter@gmail.com
13 | * @since 0.1.0
14 | */
15 | public class SimplePageProcessor implements PageProcessor {
16 |
17 | private String urlPattern;
18 |
19 | private Site site;
20 |
21 | public SimplePageProcessor(String startUrl, String urlPattern) {
22 | this.site = Site.me().addStartUrl(startUrl).
23 | setDomain(UrlUtils.getDomain(startUrl));
24 | //compile "*" expression to regex
25 | this.urlPattern = "(" + urlPattern.replace(".", "\\.").replace("*", "[^\"'#]*") + ")";
26 |
27 | }
28 |
29 | @Override
30 | public void process(Page page) {
31 | List
12 | */
13 | public class OschinaBlogPageProcessor implements PageProcessor {
14 |
15 | private Site site = Site.me().setDomain("my.oschina.net");
16 |
17 | @Override
18 | public void process(Page page) {
19 | List
5 | *
6 | * @author code4crafter@gmail.com
7 | * @since 0.2.1
8 | */
9 | public abstract class Selectors {
10 |
11 | public static RegexSelector regex(String expr) {
12 | return new RegexSelector(expr);
13 | }
14 |
15 | public static RegexSelector regex(String expr, int group) {
16 | return new RegexSelector(expr,group);
17 | }
18 |
19 | public static SmartContentSelector smartContent() {
20 | return new SmartContentSelector();
21 | }
22 |
23 | public static CssSelector $(String expr) {
24 | return new CssSelector(expr);
25 | }
26 |
27 | public static CssSelector $(String expr, String attrName) {
28 | return new CssSelector(expr, attrName);
29 | }
30 |
31 | public static XpathSelector xpath(String expr) {
32 | return new XpathSelector(expr);
33 | }
34 |
35 | /**
36 | * @Deprecated
37 | * @see #xpath(String)
38 | * @param expr expr
39 | * @return new selector
40 | */
41 | public static XpathSelector xsoup(String expr) {
42 | return new XpathSelector(expr);
43 | }
44 |
45 | public static AndSelector and(Selector... selectors) {
46 | return new AndSelector(selectors);
47 | }
48 |
49 | public static OrSelector or(Selector... selectors) {
50 | return new OrSelector(selectors);
51 | }
52 |
53 | }
--------------------------------------------------------------------------------
/webmagic-avalon/forger/src/test/java/us/codecraft/forger/Foo.java:
--------------------------------------------------------------------------------
1 | package us.codecraft.forger;
2 |
3 | import us.codecraft.forger.property.Inject;
4 | import us.codecraft.forger.property.format.Formatter;
5 |
6 | /**
7 | * @author code4crafter@gmail.com
8 | */
9 | public class Foo implements Fooable{
10 |
11 | @Formatter("")
12 | @Inject("fooa")
13 | private String foo;
14 |
15 | public static final String SOURCE_CODE="import us.codecraft.forger.*;\n" +
16 | "import us.codecraft.forger.property.Inject;\n" +
17 | "import us.codecraft.forger.property.Inject;\n" +
18 | "import us.codecraft.forger.property.format.Formatter;\n" +
19 | "\n" +
20 | "/**\n" +
21 | " * @author code4crafter@gmail.com\n" +
22 | " */\n" +
23 | "public class Foo implements Fooable{\n" +
24 | "\n" +
25 | " @Formatter(\"\")\n" +
26 | " @Inject(\"fooa\")\n" +
27 | " private String foo;\n" +
28 | "\n" +
29 | " public String getFoo() {\n" +
30 | " return foo;\n" +
31 | " }\n" +
32 | "\n" +
33 | " @Override\n" +
34 | " public String foo() {\n" +
35 | " return foo;\n" +
36 | " }\n" +
37 | "}";
38 |
39 | public String getFoo() {
40 | return foo;
41 | }
42 |
43 | @Override
44 | public String foo() {
45 | return foo;
46 | }
47 | }
48 |
--------------------------------------------------------------------------------
/webmagic-core/src/main/java/us/codecraft/webmagic/selector/BaseElementSelector.java:
--------------------------------------------------------------------------------
1 | package us.codecraft.webmagic.selector;
2 |
3 | import org.jsoup.Jsoup;
4 | import org.jsoup.nodes.Element;
5 |
6 | import java.util.ArrayList;
7 | import java.util.List;
8 |
9 | /**
10 | * @author code4crafter@gmail.com
11 | * @since 0.3.0
12 | */
13 | public abstract class BaseElementSelector implements Selector, ElementSelector {
14 |
15 | @Override
16 | public String select(String text) {
17 | if (text != null) {
18 | return select(Jsoup.parse(text));
19 | }
20 | return null;
21 | }
22 |
23 | @Override
24 | public List
12 | * @since 0.2.0
13 | */
14 | class FieldExtractor extends Extractor {
15 |
16 | private final Field field;
17 |
18 | private Method setterMethod;
19 |
20 | private ObjectFormatter objectFormatter;
21 |
22 | public FieldExtractor(Field field, Selector selector, Source source, boolean notNull, boolean multi) {
23 | super(selector, source, notNull, multi);
24 | this.field = field;
25 | }
26 |
27 | Field getField() {
28 | return field;
29 | }
30 |
31 | Selector getSelector() {
32 | return selector;
33 | }
34 |
35 | Source getSource() {
36 | return source;
37 | }
38 |
39 | void setSetterMethod(Method setterMethod) {
40 | this.setterMethod = setterMethod;
41 | }
42 |
43 | Method getSetterMethod() {
44 | return setterMethod;
45 | }
46 |
47 | boolean isNotNull() {
48 | return notNull;
49 | }
50 |
51 | ObjectFormatter getObjectFormatter() {
52 | return objectFormatter;
53 | }
54 |
55 | void setObjectFormatter(ObjectFormatter objectFormatter) {
56 | this.objectFormatter = objectFormatter;
57 | }
58 | }
59 |
--------------------------------------------------------------------------------
/webmagic-core/src/main/java/us/codecraft/webmagic/processor/example/GithubRepoPageProcessor.java:
--------------------------------------------------------------------------------
1 | package us.codecraft.webmagic.processor.example;
2 |
3 | import us.codecraft.webmagic.Page;
4 | import us.codecraft.webmagic.Site;
5 | import us.codecraft.webmagic.Spider;
6 | import us.codecraft.webmagic.processor.PageProcessor;
7 |
8 | /**
9 | * @author code4crafter@gmail.com
10 | * @since 0.3.2
11 | */
12 | public class GithubRepoPageProcessor implements PageProcessor {
13 |
14 | private Site site = Site.me().setRetryTimes(3).setSleepTime(0);
15 |
16 | @Override
17 | public void process(Page page) {
18 | page.addTargetRequests(page.getHtml().links().regex("(https://github\\.com/[\\w\\-]+/[\\w\\-])").all());
19 | page.addTargetRequests(page.getHtml().links().regex("(https://github\\.com/[\\w\\-])").all());
20 | page.putField("author", page.getUrl().regex("https://github\\.com/(\\w+)/.*").toString());
21 | page.putField("name", page.getHtml().xpath("//h1[@class='entry-title public']/strong/a/text()").toString());
22 | if (page.getResultItems().get("name")==null){
23 | //skip this page
24 | page.setSkip(true);
25 | }
26 | page.putField("readme", page.getHtml().xpath("//div[@id='readme']/tidyText()"));
27 | }
28 |
29 | @Override
30 | public Site getSite() {
31 | return site;
32 | }
33 |
34 | public static void main(String[] args) {
35 | Spider.create(new GithubRepoPageProcessor()).addUrl("https://github.com/code4craft").thread(5).run();
36 | }
37 | }
38 |
--------------------------------------------------------------------------------
/webmagic-core/src/main/java/us/codecraft/webmagic/selector/XpathSelector.java:
--------------------------------------------------------------------------------
1 | package us.codecraft.webmagic.selector;
2 |
3 | import org.apache.commons.collections.CollectionUtils;
4 | import org.jsoup.nodes.Element;
5 | import us.codecraft.xsoup.XPathEvaluator;
6 | import us.codecraft.xsoup.Xsoup;
7 |
8 | import java.util.List;
9 |
10 | /**
11 | * XPath selector based on Xsoup.
12 | *
13 | * @author code4crafter@gmail.com
14 | * @since 0.3.0
15 | */
16 | public class XpathSelector extends BaseElementSelector {
17 |
18 | private XPathEvaluator xPathEvaluator;
19 |
20 | public XpathSelector(String xpathStr) {
21 | this.xPathEvaluator = Xsoup.compile(xpathStr);
22 | }
23 |
24 | @Override
25 | public String select(Element element) {
26 | return xPathEvaluator.evaluate(element).get();
27 | }
28 |
29 | @Override
30 | public List
11 | */
12 | public class QzoneBlogProcessor implements PageProcessor {
13 | @Override
14 | public void process(Page page) {
15 | //http://progressdaily.diandian.com/post/2013-01-24/40046867275
16 |
17 | //http://b1.cnc.qzone.qq.com/cgi-bin/blognew/get_abs?hostUin=233017404&uin=233017404&blogType=0&statYear=2013&source=0&statYear=2013&g_tk=291639571&g_tk=291639571&reqInfo=7&pos=0&num=15&source=0&rand=0.46480297949165106
18 | // &cateName=&cateHex=&statYear=2013&reqInfo=7&pos=0&num=15&sortType=0&source=0&rand=0.46480297949165106&g_tk=291639571&verbose=1&ref=qzone
19 | List
16 |
26 |
20 |
22 |