├── .travis.yml ├── assets ├── logo.jpg ├── image1.pdf ├── logo3.png ├── logo4.png ├── webmagic.psd ├── logo-simple.jpg ├── logo2.graffle │ └── image1.tiff ├── logo3.graffle │ ├── image1.tiff │ ├── image2.tiff │ ├── image4.tiff │ └── image5.tiff ├── webmagic-create-spider.png ├── webmagic-spider-manage.png └── page-extract-rule.bmml ├── lib ├── guava-15.0.jar ├── junit-4.11.jar ├── jedis-2.0.0.jar ├── jsoup-1.7.2.jar ├── log4j-1.2.17.jar ├── xsoup-0.2.4.jar ├── fastjson-1.1.37.jar ├── httpcore-4.3.2.jar ├── json-path-0.8.1.jar ├── slf4j-api-1.7.6.jar ├── commons-codec-1.6.jar ├── commons-io-1.3.2.jar ├── commons-lang-2.6.jar ├── commons-lang3-3.1.jar ├── hamcrest-core-1.3.jar ├── httpclient-4.3.3.jar ├── json-smart-1.1.1.jar ├── assertj-core-1.5.0.jar ├── commons-pool-1.5.5.jar ├── slf4j-log4j12-1.7.6.jar ├── webmagic-core-0.5.2.jar ├── commons-logging-1.1.3.jar ├── commons-collections-3.2.1.jar └── webmagic-extension-0.5.2.jar ├── webmagic-samples ├── README.md ├── src │ ├── main │ │ ├── java │ │ │ └── us │ │ │ │ └── codecraft │ │ │ │ └── webmagic │ │ │ │ ├── samples │ │ │ │ ├── pipeline │ │ │ │ │ └── ReplacePipeline.java │ │ │ │ ├── formatter │ │ │ │ │ └── StringTemplateFormatter.java │ │ │ │ ├── scheduler │ │ │ │ │ └── LevelLimitScheduler.java │ │ │ │ ├── GithubRepo.java │ │ │ │ ├── NjuBBSProcessor.java │ │ │ │ ├── TianyaPageProcesser.java │ │ │ │ ├── HuxiuProcessor.java │ │ │ │ ├── IteyeBlogProcessor.java │ │ │ │ ├── OschinaPageProcesser.java │ │ │ │ ├── F58PageProcesser.java │ │ │ │ ├── KaichibaProcessor.java │ │ │ │ ├── QzoneBlogProcessor.java │ │ │ │ ├── InfoQMiniBookProcessor.java │ │ │ │ └── GithubRepoPageProcessor.java │ │ │ │ └── model │ │ │ │ └── samples │ │ │ │ ├── Blog.java │ │ │ │ ├── QQMeishi.java │ │ │ │ ├── OschinaAnswer.java │ │ │ │ ├── IteyeBlog.java │ │ │ │ ├── DianpingFtlDataScanner.java │ │ │ │ ├── BaiduNews.java │ │ │ │ └── JokejiModel.java │ │ └── resources │ │ │ ├── crawl.js │ │ │ └── log4j.xml │ └── test │ │ └── java │ │ └── us │ │ └── 
codecraft │ │ └── webmagic │ │ ├── samples │ │ └── scheduler │ │ │ └── DelayQueueSchedulerTest.java │ │ └── processor │ │ └── SinablogProcessorTest.java └── pom.xml ├── webmagic-avalon ├── webmagic-worker │ ├── README.md │ └── src │ │ ├── main │ │ ├── resources │ │ │ ├── freemarker.properties │ │ │ └── log │ │ │ │ └── log4j.xml │ │ ├── webapp │ │ │ └── WEB-INF │ │ │ │ └── jsp │ │ │ │ └── 500.jsp │ │ └── java │ │ │ └── us │ │ │ └── codecraft │ │ │ └── webmagic │ │ │ └── worker │ │ │ ├── controller │ │ │ └── SpiderController.java │ │ │ └── Worker.java │ │ └── test │ │ └── java │ │ └── us │ │ └── codecraft │ │ └── webmagic │ │ └── worker │ │ └── WorkerTest.java ├── webmagic-admin │ ├── README.md │ ├── src │ │ └── main │ │ │ ├── webapp │ │ │ ├── static │ │ │ │ ├── favicon.jpg │ │ │ │ ├── js │ │ │ │ │ └── jquery.flot.resize.min.js │ │ │ │ └── css │ │ │ │ │ ├── jquery.cleditor.css │ │ │ │ │ └── fullcalendar.print.css │ │ │ └── WEB-INF │ │ │ │ ├── pages │ │ │ │ └── create_spider.ftl │ │ │ │ └── jsp │ │ │ │ └── 500.jsp │ │ │ ├── resources │ │ │ ├── freemarker.properties │ │ │ └── log │ │ │ │ └── log4j.xml │ │ │ └── java │ │ │ └── us │ │ │ └── codecraft │ │ │ └── webmagic │ │ │ └── avalon │ │ │ └── web │ │ │ ├── DashBoardController.java │ │ │ └── SpiderController.java │ └── pom.xml ├── README.md ├── forger │ ├── src │ │ ├── test │ │ │ ├── java │ │ │ │ └── us │ │ │ │ │ └── codecraft │ │ │ │ │ └── forger │ │ │ │ │ ├── Fooable.java │ │ │ │ │ ├── compiler │ │ │ │ │ └── GroovyForgerCompilerTest.java │ │ │ │ │ ├── Bar.java │ │ │ │ │ └── Foo.java │ │ │ └── resources │ │ │ │ └── log4j.xml │ │ └── main │ │ │ ├── java │ │ │ └── us │ │ │ │ └── codecraft │ │ │ │ └── forger │ │ │ │ ├── compiler │ │ │ │ ├── ForgerCompiler.java │ │ │ │ └── GroovyForgerCompiler.java │ │ │ │ ├── property │ │ │ │ ├── format │ │ │ │ │ ├── ObjectFormatter.java │ │ │ │ │ ├── TypeFormatter.java │ │ │ │ │ ├── DateFormatter.java │ │ │ │ │ ├── ObjectFormatterWithParams.java │ │ │ │ │ └── Formatter.java │ │ │ │ ├── 
PropertyLoader.java │ │ │ │ ├── Inject.java │ │ │ │ ├── PropertyType.java │ │ │ │ ├── SimpleFieldPropertyLoader.java │ │ │ │ ├── AnnotationPropertyLoader.java │ │ │ │ └── Property.java │ │ │ │ ├── ForgerFactory.java │ │ │ │ └── Forger.java │ │ │ └── resources │ │ │ └── log4j.xml │ └── README.md └── webmagic-avalon-common │ └── src │ ├── main │ ├── resources │ │ ├── config │ │ │ ├── freemarker.properties │ │ │ ├── log │ │ │ │ └── log4j.xml │ │ │ ├── spring │ │ │ │ ├── applicationContext-tx.xml │ │ │ │ ├── applicationContext-service.xml │ │ │ │ ├── applicationContext-component.xml │ │ │ │ ├── applicationContext-myBatis.xml │ │ │ │ └── applicationContext-webmvc.xml │ │ │ └── mapper │ │ │ │ └── DynamicClass.xml │ │ └── sql │ │ │ ├── h2 │ │ │ └── schema.sql │ │ │ └── mysql │ │ │ └── schema.sql │ └── java │ │ └── us │ │ └── codecraft │ │ └── webmagic │ │ ├── dao │ │ └── DynamicClassDao.java │ │ ├── service │ │ ├── DynamicClassService.java │ │ └── impl │ │ │ └── DynamicClassServiceImpl.java │ │ ├── exception │ │ └── DynamicClassCompileException.java │ │ └── model │ │ └── DynamicClass.java │ └── test │ └── java │ └── us │ └── codecraft │ └── webmagic │ ├── AbstractTest.java │ ├── dao │ └── DynamicClassDaoTest.java │ └── Foo.java ├── webmagic-extension ├── README.md ├── src │ ├── main │ │ ├── java │ │ │ └── us │ │ │ │ └── codecraft │ │ │ │ └── webmagic │ │ │ │ ├── model │ │ │ │ ├── annotation │ │ │ │ │ ├── package.html │ │ │ │ │ ├── Formatter.java │ │ │ │ │ ├── TargetUrl.java │ │ │ │ │ ├── HelpUrl.java │ │ │ │ │ └── ExtractByUrl.java │ │ │ │ ├── package.html │ │ │ │ ├── formatter │ │ │ │ │ ├── ObjectFormatter.java │ │ │ │ │ ├── DateFormatter.java │ │ │ │ │ └── ObjectFormatters.java │ │ │ │ ├── AfterExtractor.java │ │ │ │ ├── HasKey.java │ │ │ │ ├── ConsolePageModelPipeline.java │ │ │ │ ├── PageMapper.java │ │ │ │ ├── Extractor.java │ │ │ │ ├── FieldExtractor.java │ │ │ │ └── PageModelCollectorPipeline.java │ │ │ │ ├── configurable │ │ │ │ └── ExpressionType.java │ │ │ │ ├── 
pipeline │ │ │ │ ├── PageModelPipeline.java │ │ │ │ ├── CollectorPageModelPipeline.java │ │ │ │ └── JsonFilePipeline.java │ │ │ │ ├── handler │ │ │ │ ├── PatternProcessor.java │ │ │ │ ├── SubPageProcessor.java │ │ │ │ ├── RequestMatcher.java │ │ │ │ ├── SubPipeline.java │ │ │ │ ├── PatternRequestMatcher.java │ │ │ │ └── CompositePipeline.java │ │ │ │ ├── monitor │ │ │ │ └── SpiderStatusMXBean.java │ │ │ │ ├── utils │ │ │ │ ├── ClassUtils.java │ │ │ │ ├── MultiKeyMapBase.java │ │ │ │ └── IPUtils.java │ │ │ │ ├── example │ │ │ │ ├── MonitorExample.java │ │ │ │ ├── GithubRepoPageMapper.java │ │ │ │ └── AppStore.java │ │ │ │ └── MultiPageModel.java │ │ └── resources │ │ │ ├── crawl.js │ │ │ ├── log4j.xml │ │ │ └── spider-config-draft.xml │ └── test │ │ ├── java │ │ └── us │ │ │ └── codecraft │ │ │ └── webmagic │ │ │ ├── monitor │ │ │ ├── CustomSpiderStatusMXBean.java │ │ │ ├── CustomSpiderStatus.java │ │ │ └── SpiderMonitorTest.java │ │ │ ├── MockPipeline.java │ │ │ ├── model │ │ │ ├── BaseRepo.java │ │ │ ├── MockModel.java │ │ │ ├── GithubRepoTest.java │ │ │ └── GithubRepo.java │ │ │ ├── utils │ │ │ └── IPUtilsTest.java │ │ │ ├── MockPageModelPipeline.java │ │ │ ├── downloader │ │ │ └── FileCacheTest.java │ │ │ ├── formatter │ │ │ └── DateFormatterTest.java │ │ │ ├── scheduler │ │ │ └── RedisSchedulerTest.java │ │ │ └── processor │ │ │ └── GithubRepoProcessor.java │ │ └── resources │ │ ├── log4j.xml │ │ └── html │ │ └── mock-webmagic.html ├── pom.xml ├── pom.xml.versionsBackup └── pom.xml.releaseBackup ├── webmagic-core ├── README.md └── src │ ├── main │ ├── java │ │ └── us │ │ │ └── codecraft │ │ │ └── webmagic │ │ │ ├── utils │ │ │ ├── package.html │ │ │ ├── Experimental.java │ │ │ ├── NumberUtils.java │ │ │ ├── HttpConstant.java │ │ │ └── FilePersistentBase.java │ │ │ ├── scheduler │ │ │ ├── component │ │ │ │ ├── package.html │ │ │ │ ├── DuplicateRemover.java │ │ │ │ └── HashSetDuplicateRemover.java │ │ │ ├── package.html │ │ │ ├── MonitorableScheduler.java │ │ │ 
├── Scheduler.java │ │ │ └── QueueScheduler.java │ │ │ ├── processor │ │ │ ├── package.html │ │ │ ├── PageProcessor.java │ │ │ ├── SimplePageProcessor.java │ │ │ └── example │ │ │ │ ├── OschinaBlogPageProcessor.java │ │ │ │ └── GithubRepoPageProcessor.java │ │ │ ├── pipeline │ │ │ ├── package.html │ │ │ ├── CollectorPipeline.java │ │ │ ├── Pipeline.java │ │ │ ├── ResultItemsCollectorPipeline.java │ │ │ └── ConsolePipeline.java │ │ │ ├── package.html │ │ │ ├── downloader │ │ │ ├── package.html │ │ │ └── Downloader.java │ │ │ ├── selector │ │ │ ├── package.html │ │ │ ├── RegexResult.java │ │ │ ├── Selector.java │ │ │ ├── ElementSelector.java │ │ │ ├── ReplaceSelector.java │ │ │ ├── OrSelector.java │ │ │ ├── Selectors.java │ │ │ ├── BaseElementSelector.java │ │ │ └── XpathSelector.java │ │ │ ├── proxy │ │ │ └── ProxyPool.java │ │ │ ├── SpiderListener.java │ │ │ └── Task.java │ └── resources │ │ └── log4j.xml │ └── test │ ├── java │ └── us │ │ └── codecraft │ │ └── webmagic │ │ ├── ResultItemsTest.java │ │ ├── selector │ │ ├── RegexSelectorTest.java │ │ ├── JsonTest.java │ │ ├── SelectorTest.java │ │ └── ExtractorsTest.java │ │ ├── downloader │ │ └── MockGithubDownloader.java │ │ ├── example │ │ └── GithubRepoPageProcessorTest.java │ │ └── pipeline │ │ └── FilePipelineTest.java │ └── resources │ └── log4j.xml ├── webmagic-saxon ├── README.md └── pom.xml ├── webmagic-selenium ├── README.md ├── config.ini ├── src │ └── test │ │ └── java │ │ └── us │ │ └── codecraft │ │ └── webmagic │ │ ├── downloader │ │ ├── selenium │ │ │ └── WebDriverPoolTest.java │ │ └── SeleniumTest.java │ │ └── samples │ │ └── GooglePlayProcessor.java └── pom.xml ├── .gitignore ├── make.sh ├── webmagic-scripts ├── src │ ├── main │ │ ├── resources │ │ │ ├── python │ │ │ │ ├── oschina.py │ │ │ │ └── defines.py │ │ │ ├── ruby │ │ │ │ ├── oschina.rb │ │ │ │ ├── defines.rb │ │ │ │ └── github.rb │ │ │ ├── js │ │ │ │ ├── defines.js │ │ │ │ ├── oschina.js │ │ │ │ └── github.js │ │ │ └── log4j.xml │ │ └── 
java │ │ │ └── us │ │ │ └── codecraft │ │ │ └── webmagic │ │ │ └── scripts │ │ │ ├── Language.java │ │ │ └── ScriptEnginePool.java │ └── test │ │ ├── resouces │ │ └── log4j.xml │ │ └── java │ │ └── us │ │ └── codecraft │ │ └── webmagic │ │ └── scripts │ │ └── ScriptProcessorTest.java └── deploy.sh ├── webmagic-avalon.md └── release.properties /.travis.yml: -------------------------------------------------------------------------------- 1 | language: java 2 | jdk: 3 | - oraclejdk7 4 | -------------------------------------------------------------------------------- /assets/logo.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bcqtt/webmagic/master/assets/logo.jpg -------------------------------------------------------------------------------- /assets/image1.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bcqtt/webmagic/master/assets/image1.pdf -------------------------------------------------------------------------------- /assets/logo3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bcqtt/webmagic/master/assets/logo3.png -------------------------------------------------------------------------------- /assets/logo4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bcqtt/webmagic/master/assets/logo4.png -------------------------------------------------------------------------------- /lib/guava-15.0.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bcqtt/webmagic/master/lib/guava-15.0.jar -------------------------------------------------------------------------------- /lib/junit-4.11.jar: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/bcqtt/webmagic/master/lib/junit-4.11.jar -------------------------------------------------------------------------------- /assets/webmagic.psd: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bcqtt/webmagic/master/assets/webmagic.psd -------------------------------------------------------------------------------- /lib/jedis-2.0.0.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bcqtt/webmagic/master/lib/jedis-2.0.0.jar -------------------------------------------------------------------------------- /lib/jsoup-1.7.2.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bcqtt/webmagic/master/lib/jsoup-1.7.2.jar -------------------------------------------------------------------------------- /lib/log4j-1.2.17.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bcqtt/webmagic/master/lib/log4j-1.2.17.jar -------------------------------------------------------------------------------- /lib/xsoup-0.2.4.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bcqtt/webmagic/master/lib/xsoup-0.2.4.jar -------------------------------------------------------------------------------- /webmagic-samples/README.md: -------------------------------------------------------------------------------- 1 | webmagic-samples 2 | ------- 3 | webmagic的一些示例。包括抓取常见 博客、信息类网站等。 -------------------------------------------------------------------------------- /assets/logo-simple.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bcqtt/webmagic/master/assets/logo-simple.jpg -------------------------------------------------------------------------------- 
/lib/fastjson-1.1.37.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bcqtt/webmagic/master/lib/fastjson-1.1.37.jar -------------------------------------------------------------------------------- /lib/httpcore-4.3.2.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bcqtt/webmagic/master/lib/httpcore-4.3.2.jar -------------------------------------------------------------------------------- /lib/json-path-0.8.1.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bcqtt/webmagic/master/lib/json-path-0.8.1.jar -------------------------------------------------------------------------------- /lib/slf4j-api-1.7.6.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bcqtt/webmagic/master/lib/slf4j-api-1.7.6.jar -------------------------------------------------------------------------------- /lib/commons-codec-1.6.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bcqtt/webmagic/master/lib/commons-codec-1.6.jar -------------------------------------------------------------------------------- /lib/commons-io-1.3.2.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bcqtt/webmagic/master/lib/commons-io-1.3.2.jar -------------------------------------------------------------------------------- /lib/commons-lang-2.6.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bcqtt/webmagic/master/lib/commons-lang-2.6.jar -------------------------------------------------------------------------------- /lib/commons-lang3-3.1.jar: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/bcqtt/webmagic/master/lib/commons-lang3-3.1.jar -------------------------------------------------------------------------------- /lib/hamcrest-core-1.3.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bcqtt/webmagic/master/lib/hamcrest-core-1.3.jar -------------------------------------------------------------------------------- /lib/httpclient-4.3.3.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bcqtt/webmagic/master/lib/httpclient-4.3.3.jar -------------------------------------------------------------------------------- /lib/json-smart-1.1.1.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bcqtt/webmagic/master/lib/json-smart-1.1.1.jar -------------------------------------------------------------------------------- /webmagic-avalon/webmagic-worker/README.md: -------------------------------------------------------------------------------- 1 | WebMagic-Worker 2 | ===== 3 | Worker is the spider container. 
-------------------------------------------------------------------------------- /webmagic-extension/README.md: -------------------------------------------------------------------------------- 1 | webmagic-extension 2 | ------- 3 | webmagic的扩展模块。包括注解格式定义爬虫、JSON、分布式等支持。 -------------------------------------------------------------------------------- /lib/assertj-core-1.5.0.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bcqtt/webmagic/master/lib/assertj-core-1.5.0.jar -------------------------------------------------------------------------------- /lib/commons-pool-1.5.5.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bcqtt/webmagic/master/lib/commons-pool-1.5.5.jar -------------------------------------------------------------------------------- /lib/slf4j-log4j12-1.7.6.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bcqtt/webmagic/master/lib/slf4j-log4j12-1.7.6.jar -------------------------------------------------------------------------------- /lib/webmagic-core-0.5.2.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bcqtt/webmagic/master/lib/webmagic-core-0.5.2.jar -------------------------------------------------------------------------------- /webmagic-avalon/webmagic-admin/README.md: -------------------------------------------------------------------------------- 1 | WebMagic-Admin 2 | ===== 3 | Admin is the control web of workers. 
-------------------------------------------------------------------------------- /lib/commons-logging-1.1.3.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bcqtt/webmagic/master/lib/commons-logging-1.1.3.jar -------------------------------------------------------------------------------- /assets/logo2.graffle/image1.tiff: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bcqtt/webmagic/master/assets/logo2.graffle/image1.tiff -------------------------------------------------------------------------------- /assets/logo3.graffle/image1.tiff: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bcqtt/webmagic/master/assets/logo3.graffle/image1.tiff -------------------------------------------------------------------------------- /assets/logo3.graffle/image2.tiff: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bcqtt/webmagic/master/assets/logo3.graffle/image2.tiff -------------------------------------------------------------------------------- /assets/logo3.graffle/image4.tiff: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bcqtt/webmagic/master/assets/logo3.graffle/image4.tiff -------------------------------------------------------------------------------- /assets/logo3.graffle/image5.tiff: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bcqtt/webmagic/master/assets/logo3.graffle/image5.tiff -------------------------------------------------------------------------------- /assets/webmagic-create-spider.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bcqtt/webmagic/master/assets/webmagic-create-spider.png 
-------------------------------------------------------------------------------- /assets/webmagic-spider-manage.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bcqtt/webmagic/master/assets/webmagic-spider-manage.png -------------------------------------------------------------------------------- /lib/commons-collections-3.2.1.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bcqtt/webmagic/master/lib/commons-collections-3.2.1.jar -------------------------------------------------------------------------------- /lib/webmagic-extension-0.5.2.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bcqtt/webmagic/master/lib/webmagic-extension-0.5.2.jar -------------------------------------------------------------------------------- /webmagic-core/README.md: -------------------------------------------------------------------------------- 1 | webmagic-core 2 | ------- 3 | webmagic核心部分。只包含爬虫基本模块和基本抽取器。webmagic-core的目标是成为网页爬虫的一个教科书般的实现。 -------------------------------------------------------------------------------- /webmagic-saxon/README.md: -------------------------------------------------------------------------------- 1 | webmagic-saxon 2 | ------- 3 | webmagic的扩展模块,依赖Saxon进行xpath2.0解析支持。Saxon依赖包太大,不作为默认模块引入。 -------------------------------------------------------------------------------- /webmagic-selenium/README.md: -------------------------------------------------------------------------------- 1 | webmagic-selenium 2 | ------- 3 | webmagic与selenium的集成,用于爬取ajax页面。selenium太重,所以单独抽出成一个包了。 -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | target 2 | *.iml 3 | out/ 4 | .idea 5 | .classpath 6 | .project 7 | .settings/ 8 | bin/ 9 | 
.myeclipse 10 | -------------------------------------------------------------------------------- /webmagic-core/src/main/java/us/codecraft/webmagic/utils/package.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | Static utils of webmagic. 4 | 5 | 6 | -------------------------------------------------------------------------------- /webmagic-avalon/README.md: -------------------------------------------------------------------------------- 1 | WebMagic-Avalon 2 | ======== 3 | > Spiders Manage Web 4 | 5 | see [#issue43](https://github.com/code4craft/webmagic/issues/43) -------------------------------------------------------------------------------- /make.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | mvn clean dependency:copy-dependencies -DoutputDirectory=target/lib 3 | rsync -avz --delete ./webmagic-samples/target/lib/ ./lib/ 4 | -------------------------------------------------------------------------------- /webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/component/package.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | Component of scheduler. 4 | 5 | 6 | -------------------------------------------------------------------------------- /webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/package.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | Scheduler is the part of url management. 4 | 5 | 6 | -------------------------------------------------------------------------------- /webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/package.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | Annotations for defining a extractor. 
4 | 5 | 6 | -------------------------------------------------------------------------------- /webmagic-avalon/webmagic-admin/src/main/webapp/static/favicon.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bcqtt/webmagic/master/webmagic-avalon/webmagic-admin/src/main/webapp/static/favicon.jpg -------------------------------------------------------------------------------- /webmagic-core/src/main/java/us/codecraft/webmagic/processor/package.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | PageProcessor is the customizable part of a crawler for a specific site. 4 | 5 | 6 | -------------------------------------------------------------------------------- /webmagic-extension/src/main/java/us/codecraft/webmagic/model/package.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | Page model and annotations used to customize a crawler. 4 | 5 | 6 | -------------------------------------------------------------------------------- /webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/package.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | Pipeline is the persistence and offline-processing part of a crawler. 4 | 5 | 6 | -------------------------------------------------------------------------------- /webmagic-core/src/main/java/us/codecraft/webmagic/package.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 
4 | Main class "Spider" and models. 5 |
6 | 7 | 8 | -------------------------------------------------------------------------------- /webmagic-scripts/src/main/resources/python/oschina.py: -------------------------------------------------------------------------------- 1 | title=xpath("div[@class=BlogTitle]") 2 | urls="http://my\\.oschina\\.net/flashsword/blog/\\d+" 3 | 4 | result={"title":title,"urls":urls} 5 | -------------------------------------------------------------------------------- /webmagic-core/src/main/java/us/codecraft/webmagic/downloader/package.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | Downloader is the part that downloads web pages and stores them in a Page object. 4 | 5 | 6 | -------------------------------------------------------------------------------- /webmagic-core/src/main/java/us/codecraft/webmagic/selector/package.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | Selectors for page extraction. The core API is the interface Selectable, and the internal core is the interface Selector. 4 | 5 | 6 | -------------------------------------------------------------------------------- /webmagic-scripts/src/main/resources/ruby/oschina.rb: -------------------------------------------------------------------------------- 1 | urls "http://my\\.oschina\\.net/flashsword/blog/\\d+" 2 | title = css "div.BlogTitle h1" 3 | content = css "div.BlogContent" 4 | 5 | return {"title"=>title,"content"=>content} 6 | 7 | -------------------------------------------------------------------------------- /webmagic-avalon/forger/src/test/java/us/codecraft/forger/Fooable.java: -------------------------------------------------------------------------------- 1 | package us.codecraft.forger; 2 | 3 | /** 4 | * @author code4crafter@gmail.com 5 | */ 6 | public interface Fooable { 7 | 8 | public String foo(); 9 | } 10 | --------------------------------------------------------------------------------
/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/pipeline/ReplacePipeline.java: -------------------------------------------------------------------------------- 1 | package us.codecraft.webmagic.samples.pipeline; 2 | 3 | /** 4 | * @author code4crafer@gmail.com 5 | */ 6 | public class ReplacePipeline { 7 | } 8 | -------------------------------------------------------------------------------- /webmagic-scripts/deploy.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | VERSION="0.4.1-SNAPSHOT" 3 | mvn clean package 4 | cp target/webmagic-scripts-${VERSION}.jar /usr/local/webmagic/webmagic-console.jar 5 | rsync -avz --delete target/lib/ /usr/local/webmagic/lib/ 6 | -------------------------------------------------------------------------------- /webmagic-avalon/webmagic-admin/src/main/resources/freemarker.properties: -------------------------------------------------------------------------------- 1 | number_format=# 2 | classic_compatible=true 3 | 4 | default_encoding=UTF-8 5 | template_update_delay=0 6 | ######################### 7 | template_exception_handler=rethrow 8 | -------------------------------------------------------------------------------- /webmagic-avalon/webmagic-worker/src/main/resources/freemarker.properties: -------------------------------------------------------------------------------- 1 | number_format=# 2 | classic_compatible=true 3 | 4 | default_encoding=UTF-8 5 | template_update_delay=0 6 | ######################### 7 | template_exception_handler=rethrow 8 | -------------------------------------------------------------------------------- /webmagic-core/src/main/java/us/codecraft/webmagic/utils/Experimental.java: -------------------------------------------------------------------------------- 1 | package us.codecraft.webmagic.utils; 2 | 3 | /** 4 | * Stands for features unstable. 5 | * @author code4crafter@gmail.com
6 | */ 7 | public @interface Experimental { 8 | } 9 | -------------------------------------------------------------------------------- /webmagic-avalon/webmagic-avalon-common/src/main/resources/config/freemarker.properties: -------------------------------------------------------------------------------- 1 | number_format=# 2 | classic_compatible=true 3 | 4 | default_encoding=UTF-8 5 | template_update_delay=0 6 | ######################### 7 | template_exception_handler=rethrow 8 | -------------------------------------------------------------------------------- /webmagic-avalon/forger/src/main/java/us/codecraft/forger/compiler/ForgerCompiler.java: -------------------------------------------------------------------------------- 1 | package us.codecraft.forger.compiler; 2 | 3 | /** 4 | * @author code4crafter@gmail.com 5 | */ 6 | public interface ForgerCompiler { 7 | 8 | public Class compile(String sourceCode); 9 | } 10 | -------------------------------------------------------------------------------- /webmagic-avalon/forger/src/main/java/us/codecraft/forger/property/format/ObjectFormatter.java: -------------------------------------------------------------------------------- 1 | package us.codecraft.forger.property.format; 2 | 3 | /** 4 | * @author code4crafter@gmail.com 5 | */ 6 | public interface ObjectFormatter { 7 | 8 | T format(String text); 9 | } 10 | -------------------------------------------------------------------------------- /webmagic-avalon/webmagic-admin/src/main/webapp/WEB-INF/pages/create_spider.ftl: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 |
7 | 8 |
9 | 10 |
11 | 12 |
13 | 14 | -------------------------------------------------------------------------------- /webmagic-extension/src/main/java/us/codecraft/webmagic/configurable/ExpressionType.java: -------------------------------------------------------------------------------- 1 | package us.codecraft.webmagic.configurable; 2 | 3 | /** 4 | * @author code4crafter@gmail.com 5 | * @date 14-4-5 6 | */ 7 | public enum ExpressionType { 8 | 9 | XPath, Regex, Css, JsonPath; 10 | 11 | } 12 | -------------------------------------------------------------------------------- /webmagic-scripts/src/main/resources/ruby/defines.rb: -------------------------------------------------------------------------------- 1 | def xpath str 2 | $page.getHtml().xpath(str).toString() 3 | end 4 | def css str 5 | $page.getHtml().css(str).toString() 6 | end 7 | def urls str 8 | links = $page.getHtml().links().regex(str).all(); 9 | $page.addTargetRequests(links); 10 | end 11 | 12 | -------------------------------------------------------------------------------- /webmagic-extension/src/test/java/us/codecraft/webmagic/monitor/CustomSpiderStatusMXBean.java: -------------------------------------------------------------------------------- 1 | package us.codecraft.webmagic.monitor; 2 | 3 | /** 4 | * @author code4crafer@gmail.com 5 | */ 6 | public interface CustomSpiderStatusMXBean extends SpiderStatusMXBean { 7 | 8 | public String getSchedulerName(); 9 | 10 | } 11 | -------------------------------------------------------------------------------- /webmagic-scripts/src/main/resources/js/defines.js: -------------------------------------------------------------------------------- 1 | function $(str){ 2 | return page.getHtml().$(str).toString(); 3 | } 4 | function xpath(str){ 5 | return page.getHtml().xpath(str).toString(); 6 | } 7 | function urls(str){ 8 | links = page.getHtml().links().regex(str).all(); 9 | page.addTargetRequests(links); 10 | } 11 | 
-------------------------------------------------------------------------------- /webmagic-avalon/webmagic-avalon-common/src/main/resources/sql/h2/schema.sql: -------------------------------------------------------------------------------- 1 | CREATE TABLE DynamicClass( 2 | Id int(11) NOT NULL AUTO_INCREMENT PRIMARY KEY, 3 | `ClassName` varchar(200) NOT NULL, 4 | `SourceCode` text NOT NULL, 5 | `AddTime` datetime NOT NULL, 6 | `UpdateTime` datetime NOT NULL, 7 | UNIQUE INDEX `un_class_name` (`ClassName`) 8 | ); -------------------------------------------------------------------------------- /webmagic-avalon/webmagic-avalon-common/src/main/java/us/codecraft/webmagic/dao/DynamicClassDao.java: -------------------------------------------------------------------------------- 1 | package us.codecraft.webmagic.dao; 2 | 3 | import us.codecraft.webmagic.model.DynamicClass; 4 | 5 | /** 6 | * @author code4crafter@gmail.com 7 | */ 8 | public interface DynamicClassDao { 9 | 10 | public int add(DynamicClass dynamicClass); 11 | } 12 | -------------------------------------------------------------------------------- /webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/Blog.java: -------------------------------------------------------------------------------- 1 | package us.codecraft.webmagic.model.samples; 2 | 3 | /** 4 | * @author code4crafter@gmail.com
5 | * Date: 13-8-2
6 | * Time: 8:10 AM
7 | */ 8 | public interface Blog { 9 | 10 | public String getTitle(); 11 | 12 | public String getContent(); 13 | } 14 | -------------------------------------------------------------------------------- /webmagic-scripts/src/main/resources/js/oschina.js: -------------------------------------------------------------------------------- 1 | var result = { 2 | title: $("div.BlogTitle h1"), 3 | content: $("div.BlogContent") 4 | } 5 | var config = { 6 | ua: '', 7 | sleepTime : 20 8 | } 9 | title = $("div.BlogTitle h1"), 10 | content = $("div.BlogContent") 11 | urls("http://my\\.oschina\\.net/flashsword/blog/\\d+") 12 | config; 13 | -------------------------------------------------------------------------------- /webmagic-avalon/forger/src/main/java/us/codecraft/forger/property/format/TypeFormatter.java: -------------------------------------------------------------------------------- 1 | package us.codecraft.forger.property.format; 2 | 3 | /** 4 | * @author code4crafter@gmail.com 5 | */ 6 | public interface TypeFormatter extends ObjectFormatter { 7 | 8 | T format(String text, String[] params); 9 | 10 | Class clazz(); 11 | 12 | } 13 | -------------------------------------------------------------------------------- /webmagic-scripts/src/main/resources/python/defines.py: -------------------------------------------------------------------------------- 1 | def xpath(str): 2 | return page.getHtml().xpath(str).toString() 3 | 4 | def css(str): 5 | return page.getHtml().css(str).toString() 6 | 7 | def urls(str): 8 | links=page.getHtml().links().regex(str).all() 9 | page.addTargetRequests(links); 10 | 11 | def tomap(key,value): 12 | return "hello world" 13 | 14 | -------------------------------------------------------------------------------- /webmagic-extension/src/test/java/us/codecraft/webmagic/MockPipeline.java: -------------------------------------------------------------------------------- 1 | package us.codecraft.webmagic; 2 | 3 | import 
us.codecraft.webmagic.pipeline.Pipeline; 4 | 5 | /** 6 | * @author code4crafter@gmail.com 7 | */ 8 | public class MockPipeline implements Pipeline{ 9 | @Override 10 | public void process(ResultItems resultItems, Task task) { 11 | 12 | } 13 | } 14 | -------------------------------------------------------------------------------- /webmagic-extension/src/main/java/us/codecraft/webmagic/model/formatter/ObjectFormatter.java: -------------------------------------------------------------------------------- 1 | package us.codecraft.webmagic.model.formatter; 2 | 3 | /** 4 | * @author code4crafter@gmail.com 5 | */ 6 | public interface ObjectFormatter { 7 | 8 | T format(String raw) throws Exception; 9 | 10 | Class clazz(); 11 | 12 | void initParam(String[] extra); 13 | 14 | } 15 | -------------------------------------------------------------------------------- /webmagic-core/src/main/java/us/codecraft/webmagic/proxy/ProxyPool.java: -------------------------------------------------------------------------------- 1 | package us.codecraft.webmagic.proxy; 2 | 3 | import org.apache.http.HttpHost; 4 | 5 | /** 6 | * Created by edwardsbean on 15-2-28. 7 | */ 8 | public interface ProxyPool { 9 | public void returnProxy(HttpHost host, int statusCode); 10 | public Proxy getProxy(); 11 | public boolean isEnable(); 12 | } 13 | -------------------------------------------------------------------------------- /webmagic-core/src/main/java/us/codecraft/webmagic/SpiderListener.java: -------------------------------------------------------------------------------- 1 | package us.codecraft.webmagic; 2 | 3 | /** 4 | * Listener of Spider on page processing. Used for monitor and such on. 
5 | * 6 | * @author code4crafer@gmail.com 7 | * @since 0.5.0 8 | */ 9 | public interface SpiderListener { 10 | 11 | public void onSuccess(Request request); 12 | 13 | public void onError(Request request); 14 | } 15 | -------------------------------------------------------------------------------- /webmagic-extension/src/test/java/us/codecraft/webmagic/model/BaseRepo.java: -------------------------------------------------------------------------------- 1 | package us.codecraft.webmagic.model; 2 | 3 | import us.codecraft.webmagic.model.annotation.ExtractBy; 4 | 5 | /** 6 | * @author code4crafter@gmail.com 7 | */ 8 | public class BaseRepo { 9 | 10 | @ExtractBy("//ul[@class='pagehead-actions']/li[1]//a[@class='social-count js-social-count']/text()") 11 | protected int star; 12 | } 13 | -------------------------------------------------------------------------------- /webmagic-extension/src/test/java/us/codecraft/webmagic/utils/IPUtilsTest.java: -------------------------------------------------------------------------------- 1 | package us.codecraft.webmagic.utils; 2 | 3 | import org.junit.Test; 4 | 5 | /** 6 | * @author code4crafer@gmail.com 7 | */ 8 | public class IPUtilsTest { 9 | 10 | @Test 11 | public void testGetFirstNoLoopbackIPAddresses() throws Exception { 12 | System.out.println(IPUtils.getFirstNoLoopbackIPAddresses()); 13 | } 14 | } 15 | -------------------------------------------------------------------------------- /webmagic-avalon/forger/src/main/java/us/codecraft/forger/property/PropertyLoader.java: -------------------------------------------------------------------------------- 1 | package us.codecraft.forger.property; 2 | 3 | import java.util.List; 4 | import java.util.Map; 5 | 6 | /** 7 | * @author code4crafter@gmail.com 8 | */ 9 | public interface PropertyLoader { 10 | 11 | public T load(T object, Map propertyConfigs); 12 | 13 | public List getProperties(Class clazz); 14 | 15 | } 16 | 
-------------------------------------------------------------------------------- /webmagic-avalon/webmagic-avalon-common/src/main/java/us/codecraft/webmagic/service/DynamicClassService.java: -------------------------------------------------------------------------------- 1 | package us.codecraft.webmagic.service; 2 | 3 | import us.codecraft.webmagic.exception.DynamicClassCompileException; 4 | 5 | /** 6 | * @author code4crafter@gmail.com 7 | */ 8 | public interface DynamicClassService { 9 | 10 | public Class compileAndSave(String sourceCode) throws DynamicClassCompileException; 11 | 12 | } 13 | -------------------------------------------------------------------------------- /webmagic-extension/src/main/java/us/codecraft/webmagic/pipeline/PageModelPipeline.java: -------------------------------------------------------------------------------- 1 | package us.codecraft.webmagic.pipeline; 2 | 3 | import us.codecraft.webmagic.Task; 4 | 5 | /** 6 | * Implements PageModelPipeline to persistent your page model. 7 | * 8 | * @author code4crafter@gmail.com
9 | * @since 0.2.0 10 | */ 11 | public interface PageModelPipeline { 12 | 13 | public void process(T t, Task task); 14 | 15 | } 16 | -------------------------------------------------------------------------------- /webmagic-extension/src/main/java/us/codecraft/webmagic/model/AfterExtractor.java: -------------------------------------------------------------------------------- 1 | package us.codecraft.webmagic.model; 2 | 3 | import us.codecraft.webmagic.Page; 4 | 5 | /** 6 | * Interface to be implemented by page models that need to do something after fields are extracted.
7 | * 8 | * @author code4crafter@gmail.com
9 | * @since 0.2.0 10 | */ 11 | public interface AfterExtractor { 12 | 13 | public void afterProcess(Page page); 14 | } 15 | -------------------------------------------------------------------------------- /webmagic-extension/src/main/java/us/codecraft/webmagic/handler/PatternProcessor.java: -------------------------------------------------------------------------------- 1 | package us.codecraft.webmagic.handler; 2 | 3 | /** 4 | * @author code4crafer@gmail.com 5 | */ 6 | public abstract class PatternProcessor extends PatternRequestMatcher implements SubPipeline, SubPageProcessor { 7 | /** 8 | * @param pattern url pattern to handle 9 | */ 10 | public PatternProcessor(String pattern) { 11 | super(pattern); 12 | } 13 | } 14 | -------------------------------------------------------------------------------- /webmagic-extension/src/test/java/us/codecraft/webmagic/MockPageModelPipeline.java: -------------------------------------------------------------------------------- 1 | package us.codecraft.webmagic; 2 | 3 | import junit.framework.Assert; 4 | import us.codecraft.webmagic.pipeline.PageModelPipeline; 5 | 6 | /** 7 | * @author code4crafter@gmail.com 8 | */ 9 | public class MockPageModelPipeline implements PageModelPipeline{ 10 | @Override 11 | public void process(Object o, Task task) { 12 | Assert.assertNotNull(o); 13 | } 14 | } 15 | -------------------------------------------------------------------------------- /webmagic-core/src/main/java/us/codecraft/webmagic/utils/NumberUtils.java: -------------------------------------------------------------------------------- 1 | package us.codecraft.webmagic.utils; 2 | 3 | /** 4 | * @author yihua.huang@dianping.com 5 | */ 6 | public abstract class NumberUtils { 7 | 8 | public static int compareLong(long o1, long o2) { 9 | if (o1 < o2) { 10 | return -1; 11 | } else if (o1 == o2) { 12 | return 0; 13 | } else { 14 | return 1; 15 | } 16 | } 17 | } 18 | -------------------------------------------------------------------------------- 
/webmagic-avalon/forger/src/main/java/us/codecraft/forger/property/Inject.java: -------------------------------------------------------------------------------- 1 | package us.codecraft.forger.property; 2 | 3 | import java.lang.annotation.ElementType; 4 | import java.lang.annotation.Retention; 5 | import java.lang.annotation.Target; 6 | 7 | /** 8 | * @author code4crafter@gmail.com 9 | */ 10 | @Retention(java.lang.annotation.RetentionPolicy.RUNTIME) 11 | @Target({ElementType.FIELD}) 12 | public @interface Inject { 13 | 14 | String value() default ""; 15 | 16 | } 17 | -------------------------------------------------------------------------------- /webmagic-extension/src/main/resources/crawl.js: -------------------------------------------------------------------------------- 1 | var system = require('system'); 2 | var url = system.args[1]; 3 | 4 | var page = require('webpage').create(); 5 | page.settings.loadImages = false; 6 | page.settings.resourceTimeout = 5000; 7 | 8 | page.open(url, function (status) { 9 | if (status != 'success') { 10 | console.log("HTTP request failed!"); 11 | } else { 12 | console.log(page.content); 13 | } 14 | 15 | page.close(); 16 | phantom.exit(); 17 | }); -------------------------------------------------------------------------------- /webmagic-samples/src/main/resources/crawl.js: -------------------------------------------------------------------------------- 1 | var system = require('system'); 2 | var url = system.args[1]; 3 | 4 | var page = require('webpage').create(); 5 | page.settings.loadImages = false; 6 | page.settings.resourceTimeout = 5000; 7 | 8 | page.open(url, function (status) { 9 | if (status != 'success') { 10 | console.log("HTTP request failed!"); 11 | } else { 12 | console.log(page.content); 13 | } 14 | 15 | page.close(); 16 | phantom.exit(); 17 | }); -------------------------------------------------------------------------------- /assets/page-extract-rule.bmml: 
-------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | A%20Web%20Page%0Ahttp%3A// 6 | 7 | 8 | 9 | -------------------------------------------------------------------------------- /webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/MonitorableScheduler.java: -------------------------------------------------------------------------------- 1 | package us.codecraft.webmagic.scheduler; 2 | 3 | import us.codecraft.webmagic.Task; 4 | 5 | /** 6 | * The scheduler whose requests can be counted for monitor. 7 | * 8 | * @author code4crafter@gmail.com 9 | * @since 0.5.0 10 | */ 11 | public interface MonitorableScheduler extends Scheduler { 12 | 13 | public int getLeftRequestsCount(Task task); 14 | 15 | public int getTotalRequestsCount(Task task); 16 | 17 | } -------------------------------------------------------------------------------- /webmagic-extension/src/test/java/us/codecraft/webmagic/model/MockModel.java: -------------------------------------------------------------------------------- 1 | package us.codecraft.webmagic.model; 2 | 3 | import us.codecraft.webmagic.model.annotation.HelpUrl; 4 | import us.codecraft.webmagic.model.annotation.TargetUrl; 5 | 6 | /** 7 | * @author code4crafer@gmail.com 8 | */ 9 | @TargetUrl(value = "http://webmagic.io/post/\\d+",sourceRegion = "//li[@class='post']") 10 | @HelpUrl(value = "http://webmagic.io/list/\\d+",sourceRegion = "//li[@class='list']") 11 | public class MockModel { 12 | 13 | } 14 | -------------------------------------------------------------------------------- /webmagic-avalon/webmagic-avalon-common/src/main/java/us/codecraft/webmagic/exception/DynamicClassCompileException.java: -------------------------------------------------------------------------------- 1 | package us.codecraft.webmagic.exception; 2 | 3 | /** 4 | * @author code4crafter@gmail.com 5 | */ 6 | public class DynamicClassCompileException extends Exception{ 7 | 8 | public 
DynamicClassCompileException(String message) { 9 | super(message); 10 | } 11 | 12 | public DynamicClassCompileException(String message, Throwable cause) { 13 | super(message, cause); 14 | } 15 | } 16 | -------------------------------------------------------------------------------- /webmagic-scripts/src/main/resources/ruby/github.rb: -------------------------------------------------------------------------------- 1 | name= xpath "//h1[@class='entry-title public']/strong/a/text()" 2 | readme = xpath "//div[@id='readme']/tidyText()" 3 | star = xpath "//ul[@class='pagehead-actions']/li[1]//a[@class='social-count js-social-count']/text()" 4 | fork = xpath "//ul[@class='pagehead-actions']/li[2]//a[@class='social-count']/text()" 5 | url=$page.getUrl().toString() 6 | 7 | puts name,readme,star,fork,url unless name==nil 8 | 9 | urls "(https://github\\.com/\\w+/\\w+)" 10 | urls "(https://github\\.com/\\w+)" -------------------------------------------------------------------------------- /webmagic-avalon/forger/src/main/java/us/codecraft/forger/compiler/GroovyForgerCompiler.java: -------------------------------------------------------------------------------- 1 | package us.codecraft.forger.compiler; 2 | 3 | import groovy.lang.GroovyClassLoader; 4 | 5 | /** 6 | * @author code4crafter@gmail.com 7 | */ 8 | public class GroovyForgerCompiler implements ForgerCompiler{ 9 | 10 | private GroovyClassLoader groovyClassLoader = new GroovyClassLoader(); 11 | 12 | @Override 13 | public Class compile(String sourceCode) { 14 | return groovyClassLoader.parseClass(sourceCode); 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /webmagic-selenium/config.ini: -------------------------------------------------------------------------------- 1 | # What WebDriver to use for the tests 2 | driver=phantomjs 3 | #driver=firefox 4 | #driver=chrome 5 | #driver=http://localhost:8910 6 | #driver=http://localhost:4444/wd/hub 7 | 8 | # PhantomJS specific 
config (change according to your installation) 9 | #phantomjs_exec_path=/Users/Bingo/bin/phantomjs-qt5 10 | phantomjs_exec_path=/Users/Bingo/Downloads/phantomjs-1.9.8-macosx/bin/phantomjs 11 | #phantomjs_driver_path=/Users/Bingo/Documents/workspace/webmagic/webmagic-selenium/src/main.js 12 | phantomjs_driver_loglevel=DEBUG -------------------------------------------------------------------------------- /webmagic-extension/src/main/java/us/codecraft/webmagic/handler/SubPageProcessor.java: -------------------------------------------------------------------------------- 1 | package us.codecraft.webmagic.handler; 2 | 3 | import us.codecraft.webmagic.Page; 4 | 5 | /** 6 | * @author code4crafter@gmail.com 7 | * @date 14-4-5 8 | */ 9 | public interface SubPageProcessor extends RequestMatcher { 10 | 11 | /** 12 | * process the page, extract urls to fetch, extract the data and store 13 | * 14 | * @param page page 15 | * 16 | * @return whether continue to match 17 | */ 18 | public MatchOther processPage(Page page); 19 | 20 | } 21 | -------------------------------------------------------------------------------- /webmagic-extension/src/main/java/us/codecraft/webmagic/model/HasKey.java: -------------------------------------------------------------------------------- 1 | package us.codecraft.webmagic.model; 2 | 3 | import us.codecraft.webmagic.utils.Experimental; 4 | 5 | /** 6 | * Interface to be implemented by page mode.
7 | * Can be used to identify a page model, or as the name of the file that stores the object.
8 | * @author code4crafter@gmail.com
9 | * @since 0.2.0 10 | */ 11 | @Experimental 12 | public interface HasKey { 13 | 14 | /** 15 | * 16 | * 17 | * @return key 18 | */ 19 | public String key(); 20 | } 21 | -------------------------------------------------------------------------------- /webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/CollectorPipeline.java: -------------------------------------------------------------------------------- 1 | package us.codecraft.webmagic.pipeline; 2 | 3 | import java.util.List; 4 | 5 | /** 6 | * Pipeline that can collect and store results.
7 | * Used for {@link us.codecraft.webmagic.Spider#getAll(java.util.Collection)} 8 | * 9 | * @author code4crafter@gmail.com 10 | * @since 0.4.0 11 | */ 12 | public interface CollectorPipeline extends Pipeline { 13 | 14 | /** 15 | * Get all results collected. 16 | * 17 | * @return collected results 18 | */ 19 | public List getCollected(); 20 | } 21 | -------------------------------------------------------------------------------- /webmagic-scripts/src/main/resources/js/github.js: -------------------------------------------------------------------------------- 1 | var name=xpath("//h1[@class='entry-title public']/strong/a/text()") 2 | var readme=xpath("//div[@id='readme']/tidyText()") 3 | var star=xpath("//ul[@class='pagehead-actions']/li[1]//a[@class='social-count js-social-count']/text()") 4 | var fork=xpath("//ul[@class='pagehead-actions']/li[2]//a[@class='social-count']/text()") 5 | var url=page.getUrl().toString() 6 | if (name!=null){ 7 | println(name) 8 | println(readme) 9 | println(star) 10 | println(url) 11 | } 12 | 13 | urls("(https://github\\.com/\\w+/\\w+)") 14 | urls("(https://github\\.com/\\w+)") -------------------------------------------------------------------------------- /webmagic-core/src/main/java/us/codecraft/webmagic/Task.java: -------------------------------------------------------------------------------- 1 | package us.codecraft.webmagic; 2 | 3 | /** 4 | * Interface for identifying different tasks.
5 | * 6 | * @author code4crafter@gmail.com
7 | * @since 0.1.0 8 | * @see us.codecraft.webmagic.scheduler.Scheduler 9 | * @see us.codecraft.webmagic.pipeline.Pipeline 10 | */ 11 | public interface Task { 12 | 13 | /** 14 | * unique id for a task. 15 | * 16 | * @return uuid 17 | */ 18 | public String getUUID(); 19 | 20 | /** 21 | * site of a task 22 | * 23 | * @return site 24 | */ 25 | public Site getSite(); 26 | 27 | } 28 | -------------------------------------------------------------------------------- /webmagic-extension/src/main/java/us/codecraft/webmagic/handler/RequestMatcher.java: -------------------------------------------------------------------------------- 1 | package us.codecraft.webmagic.handler; 2 | 3 | import us.codecraft.webmagic.Request; 4 | 5 | /** 6 | * @author code4crafer@gmail.com 7 | * @since 0.5.0 8 | */ 9 | public interface RequestMatcher { 10 | 11 | /** 12 | * Check whether to process the page.

13 | * Please DO NOT change page status in this method. 14 | * 15 | * @param page page 16 | * 17 | * @return whether matches 18 | */ 19 | public boolean match(Request page); 20 | 21 | public enum MatchOther { 22 | YES, NO 23 | } 24 | } 25 | -------------------------------------------------------------------------------- /webmagic-extension/src/test/java/us/codecraft/webmagic/downloader/FileCacheTest.java: -------------------------------------------------------------------------------- 1 | package us.codecraft.webmagic.downloader; 2 | 3 | import org.junit.Ignore; 4 | import org.junit.Test; 5 | import us.codecraft.webmagic.Spider; 6 | 7 | /** 8 | * @author code4crafter@gmail.com
9 | */ 10 | public class FileCacheTest { 11 | 12 | @Ignore("takes long") 13 | @Test 14 | public void test() { 15 | FileCache fileCache = new FileCache("http://my.oschina.net/flashsword/blog", "http://my.oschina.net/flashsword/blog/*"); 16 | Spider.create(fileCache).downloader(fileCache).pipeline(fileCache).run(); 17 | } 18 | } 19 | -------------------------------------------------------------------------------- /webmagic-avalon/forger/src/main/java/us/codecraft/forger/property/PropertyType.java: -------------------------------------------------------------------------------- 1 | package us.codecraft.forger.property; 2 | 3 | import java.util.List; 4 | import java.util.Map; 5 | 6 | /** 7 | * @author code4crafter@gmail.com 8 | */ 9 | public enum PropertyType { 10 | 11 | PropertyString,PropertyMap,PropertyList; 12 | 13 | public static PropertyType from(Class clazz){ 14 | if (Map.class.isAssignableFrom(clazz)){ 15 | return PropertyMap; 16 | } 17 | if (List.class.isAssignableFrom(clazz)){ 18 | return PropertyList; 19 | } 20 | return PropertyString; 21 | } 22 | 23 | } 24 | -------------------------------------------------------------------------------- /webmagic-extension/src/test/java/us/codecraft/webmagic/monitor/CustomSpiderStatus.java: -------------------------------------------------------------------------------- 1 | package us.codecraft.webmagic.monitor; 2 | 3 | import us.codecraft.webmagic.Spider; 4 | 5 | /** 6 | * @author code4crafer@gmail.com 7 | */ 8 | public class CustomSpiderStatus extends SpiderStatus implements CustomSpiderStatusMXBean { 9 | 10 | public CustomSpiderStatus(Spider spider, SpiderMonitor.MonitorSpiderListener monitorSpiderListener) { 11 | super(spider, monitorSpiderListener); 12 | } 13 | 14 | 15 | @Override 16 | public String getSchedulerName() { 17 | return spider.getScheduler().getClass().getName(); 18 | } 19 | } 20 | -------------------------------------------------------------------------------- 
/webmagic-avalon/webmagic-admin/src/main/java/us/codecraft/webmagic/avalon/web/DashBoardController.java: -------------------------------------------------------------------------------- 1 | package us.codecraft.webmagic.avalon.web; 2 | 3 | import org.springframework.stereotype.Controller; 4 | import org.springframework.web.bind.annotation.RequestMapping; 5 | import org.springframework.web.servlet.ModelAndView; 6 | 7 | /** 8 | * @author code4crafter@gmail.com 9 | */ 10 | @Controller("dashboard") 11 | @RequestMapping("/") 12 | public class DashBoardController { 13 | 14 | @RequestMapping 15 | public ModelAndView index() { 16 | ModelAndView map = new ModelAndView("dashboard"); 17 | return map; 18 | } 19 | 20 | } 21 | -------------------------------------------------------------------------------- /webmagic-core/src/test/java/us/codecraft/webmagic/ResultItemsTest.java: -------------------------------------------------------------------------------- 1 | package us.codecraft.webmagic; 2 | 3 | import org.junit.Test; 4 | 5 | 6 | import static org.assertj.core.api.Assertions.assertThat; 7 | 8 | /** 9 | * @author code4crafter@gmail.com 10 | */ 11 | public class ResultItemsTest { 12 | 13 | @Test 14 | public void testOrderOfEntries() throws Exception { 15 | ResultItems resultItems = new ResultItems(); 16 | resultItems.put("a", "a"); 17 | resultItems.put("b", "b"); 18 | resultItems.put("c", "c"); 19 | assertThat(resultItems.getAll().keySet()).containsExactly("a","b","c"); 20 | 21 | } 22 | } 23 | -------------------------------------------------------------------------------- /webmagic-extension/src/main/java/us/codecraft/webmagic/pipeline/CollectorPageModelPipeline.java: -------------------------------------------------------------------------------- 1 | package us.codecraft.webmagic.pipeline; 2 | 3 | import us.codecraft.webmagic.Task; 4 | 5 | import java.util.ArrayList; 6 | import java.util.List; 7 | 8 | /** 9 | * @author code4crafter@gmail.com 10 | */ 11 | public class 
CollectorPageModelPipeline implements PageModelPipeline { 12 | 13 | private List collected = new ArrayList(); 14 | 15 | @Override 16 | public synchronized void process(T t, Task task) { 17 | collected.add(t); 18 | } 19 | 20 | public List getCollected() { 21 | return collected; 22 | } 23 | } 24 | -------------------------------------------------------------------------------- /webmagic-extension/src/main/java/us/codecraft/webmagic/handler/SubPipeline.java: -------------------------------------------------------------------------------- 1 | package us.codecraft.webmagic.handler; 2 | 3 | import us.codecraft.webmagic.ResultItems; 4 | import us.codecraft.webmagic.Task; 5 | 6 | /** 7 | * @author code4crafer@gmail.com 8 | * @since 0.5.0 9 | */ 10 | public interface SubPipeline extends RequestMatcher { 11 | 12 | /** 13 | * process the page, extract urls to fetch, extract the data and store 14 | * 15 | * @param resultItems resultItems 16 | * @param task task 17 | * @return whether continue to match 18 | */ 19 | public MatchOther processResult(ResultItems resultItems, Task task); 20 | 21 | } 22 | -------------------------------------------------------------------------------- /webmagic-avalon/webmagic-admin/src/main/webapp/WEB-INF/jsp/500.jsp: -------------------------------------------------------------------------------- 1 | <%@ page language="java" contentType="text/html; charset=utf8" 2 | pageEncoding="utf8" isErrorPage="true" import="java.io.*"%> 3 | 4 | 5 | 6 | 7 | 500 8 | 9 | 10 | 页面出错啦! 
11 | <% 12 | 13 | StringWriter stringWriter = new StringWriter(); 14 | exception.printStackTrace(new PrintWriter(stringWriter)); 15 | out.println(stringWriter.toString()); 16 | %> 17 | 18 | -------------------------------------------------------------------------------- /webmagic-avalon/webmagic-worker/src/main/webapp/WEB-INF/jsp/500.jsp: -------------------------------------------------------------------------------- 1 | <%@ page language="java" contentType="text/html; charset=utf8" 2 | pageEncoding="utf8" isErrorPage="true" import="java.io.*"%> 3 | 4 | 5 | 6 | 7 | 500 8 | 9 | 10 | 页面出错啦! 11 | <% 12 | 13 | StringWriter stringWriter = new StringWriter(); 14 | exception.printStackTrace(new PrintWriter(stringWriter)); 15 | out.println(stringWriter.toString()); 16 | %> 17 | 18 | -------------------------------------------------------------------------------- /webmagic-extension/src/main/java/us/codecraft/webmagic/model/ConsolePageModelPipeline.java: -------------------------------------------------------------------------------- 1 | package us.codecraft.webmagic.model; 2 | 3 | import org.apache.commons.lang3.builder.ToStringBuilder; 4 | import us.codecraft.webmagic.Task; 5 | import us.codecraft.webmagic.pipeline.PageModelPipeline; 6 | 7 | /** 8 | * Print page model in console.
9 | * Usually used in tests.
10 | * @author code4crafter@gmail.com
11 | * @since 0.2.0 12 | */ 13 | public class ConsolePageModelPipeline implements PageModelPipeline { 14 | @Override 15 | public void process(Object o, Task task) { 16 | System.out.println(ToStringBuilder.reflectionToString(o)); 17 | } 18 | } 19 | -------------------------------------------------------------------------------- /webmagic-extension/src/test/java/us/codecraft/webmagic/formatter/DateFormatterTest.java: -------------------------------------------------------------------------------- 1 | package us.codecraft.webmagic.formatter; 2 | 3 | import org.junit.Test; 4 | import us.codecraft.webmagic.model.formatter.DateFormatter; 5 | 6 | import java.util.Date; 7 | 8 | /** 9 | * @author code4crafter@gmail.com 10 | */ 11 | public class DateFormatterTest { 12 | 13 | @Test 14 | public void testDateFormatter() throws Exception { 15 | DateFormatter dateFormatter = new DateFormatter(); 16 | dateFormatter.initParam(new String[]{"yyyy-MM-dd HH:mm"}); 17 | Date format = dateFormatter.format("2013-09-10 22:11"); 18 | System.out.println(format); 19 | } 20 | } 21 | -------------------------------------------------------------------------------- /webmagic-avalon/forger/src/test/java/us/codecraft/forger/compiler/GroovyForgerCompilerTest.java: -------------------------------------------------------------------------------- 1 | package us.codecraft.forger.compiler; 2 | 3 | import org.junit.Test; 4 | import us.codecraft.forger.Foo; 5 | 6 | import static org.assertj.core.api.Assertions.assertThat; 7 | 8 | /** 9 | * @author code4crafter@gmail.com 10 | */ 11 | public class GroovyForgerCompilerTest { 12 | 13 | @Test 14 | public void testGroovyClassLoader() throws Exception { 15 | GroovyForgerCompiler groovyForgerCompiler = new GroovyForgerCompiler(); 16 | Class compiledClass = groovyForgerCompiler.compile(Foo.SOURCE_CODE); 17 | assertThat(compiledClass.getName()).isEqualTo("Foo"); 18 | } 19 | } 20 | -------------------------------------------------------------------------------- 
/webmagic-avalon/webmagic-avalon-common/src/test/java/us/codecraft/webmagic/AbstractTest.java: -------------------------------------------------------------------------------- 1 | package us.codecraft.webmagic; 2 | 3 | import org.junit.runner.RunWith; 4 | import org.springframework.test.context.ActiveProfiles; 5 | import org.springframework.test.context.ContextConfiguration; 6 | import org.springframework.test.context.junit4.SpringJUnit4ClassRunner; 7 | import org.springframework.transaction.annotation.Transactional; 8 | 9 | /** 10 | * @author code4crafter@gmail.com 11 | */ 12 | @RunWith(SpringJUnit4ClassRunner.class) 13 | @ContextConfiguration(locations = {"classpath*:/config/spring/applicationContext*.xml"}) 14 | @ActiveProfiles("test") 15 | @Transactional 16 | public abstract class AbstractTest { 17 | } 18 | -------------------------------------------------------------------------------- /webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/Pipeline.java: -------------------------------------------------------------------------------- 1 | package us.codecraft.webmagic.pipeline; 2 | 3 | import us.codecraft.webmagic.ResultItems; 4 | import us.codecraft.webmagic.Task; 5 | 6 | /** 7 | * Pipeline is the persistent and offline process part of crawler.
8 | * The interface Pipeline can be implemented to customize how results are persisted. 9 | * 10 | * @author code4crafter@gmail.com
11 | * @since 0.1.0 12 | * @see ConsolePipeline 13 | * @see FilePipeline 14 | */ 15 | public interface Pipeline { 16 | 17 | /** 18 | * Process extracted results. 19 | * 20 | * @param resultItems resultItems 21 | * @param task task 22 | */ 23 | public void process(ResultItems resultItems, Task task); 24 | } 25 | -------------------------------------------------------------------------------- /webmagic-core/src/main/java/us/codecraft/webmagic/selector/RegexResult.java: -------------------------------------------------------------------------------- 1 | package us.codecraft.webmagic.selector; 2 | 3 | /** 4 | * Object contains regex results.
5 | * Keeps all captured groups so that multi-group results can be supported.
6 | * 7 | * @author code4crafter@gmail.com
8 | * @since 0.1.0 9 | */ 10 | class RegexResult { 11 | 12 | private String[] groups; 13 | 14 | public static final RegexResult EMPTY_RESULT = new RegexResult(); 15 | 16 | public RegexResult() { 17 | 18 | } 19 | 20 | public RegexResult(String[] groups) { 21 | this.groups = groups; 22 | } 23 | 24 | public String get(int groupId) { 25 | if (groups == null) { 26 | return null; 27 | } 28 | return groups[groupId]; 29 | } 30 | 31 | } 32 | -------------------------------------------------------------------------------- /webmagic-samples/src/main/java/us/codecraft/webmagic/samples/formatter/StringTemplateFormatter.java: -------------------------------------------------------------------------------- 1 | package us.codecraft.webmagic.samples.formatter; 2 | 3 | import us.codecraft.webmagic.model.formatter.ObjectFormatter; 4 | 5 | /** 6 | * @author yihua.huang@dianping.com 7 | */ 8 | public class StringTemplateFormatter implements ObjectFormatter { 9 | 10 | private String template; 11 | 12 | @Override 13 | public String format(String raw) throws Exception { 14 | return String.format(template, raw); 15 | } 16 | 17 | @Override 18 | public Class clazz() { 19 | return String.class; 20 | } 21 | 22 | @Override 23 | public void initParam(String[] extra) { 24 | template = extra[0]; 25 | } 26 | } 27 | -------------------------------------------------------------------------------- /webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selector.java: -------------------------------------------------------------------------------- 1 | package us.codecraft.webmagic.selector; 2 | 3 | import java.util.List; 4 | 5 | /** 6 | * Selector(extractor) for text.
7 | * 8 | * @author code4crafter@gmail.com
9 | * @since 0.1.0 10 | */ 11 | public interface Selector { 12 | 13 | /** 14 | * Extract single result in text.
15 | * If there is more than one result, only the first will be chosen. 16 | * 17 | * @param text text 18 | * @return result 19 | */ 20 | public String select(String text); 21 | 22 | /** 23 | * Extract all results in text.
24 | * 25 | * @param text text 26 | * @return results 27 | */ 28 | public List selectList(String text); 29 | 30 | } 31 | -------------------------------------------------------------------------------- /webmagic-core/src/main/resources/log4j.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | -------------------------------------------------------------------------------- /webmagic-core/src/test/resources/log4j.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | -------------------------------------------------------------------------------- /webmagic-avalon/webmagic-admin/src/main/java/us/codecraft/webmagic/avalon/web/SpiderController.java: -------------------------------------------------------------------------------- 1 | package us.codecraft.webmagic.avalon.web; 2 | 3 | import org.springframework.stereotype.Controller; 4 | import org.springframework.web.bind.annotation.RequestMapping; 5 | import org.springframework.web.bind.annotation.ResponseBody; 6 | 7 | import java.util.HashMap; 8 | import java.util.Map; 9 | 10 | /** 11 | * @author code4crafter@gmail.com 12 | */ 13 | @Controller("spider") 14 | @RequestMapping("spider") 15 | public class SpiderController { 16 | 17 | @RequestMapping("create") 18 | @ResponseBody 19 | public Map create() { 20 | HashMap map = new HashMap(); 21 | map.put("code", 200); 22 | return map; 23 | } 24 | } 25 | -------------------------------------------------------------------------------- /webmagic-extension/src/main/java/us/codecraft/webmagic/model/PageMapper.java: -------------------------------------------------------------------------------- 1 | package us.codecraft.webmagic.model; 2 | 3 | import us.codecraft.webmagic.Page; 4 | 5 | import java.util.List; 6 | 7 | /** 8 
| * @author code4crafer@gmail.com 9 | * @since 0.5.2 10 | */ 11 | public class PageMapper { 12 | 13 | private Class clazz; 14 | 15 | private PageModelExtractor pageModelExtractor; 16 | 17 | public PageMapper(Class clazz) { 18 | this.clazz = clazz; 19 | this.pageModelExtractor = PageModelExtractor.create(clazz); 20 | } 21 | 22 | public T get(Page page) { 23 | return (T) pageModelExtractor.process(page); 24 | } 25 | 26 | public List getAll(Page page) { 27 | return (List) pageModelExtractor.process(page); 28 | } 29 | } 30 | -------------------------------------------------------------------------------- /webmagic-extension/src/main/resources/log4j.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | -------------------------------------------------------------------------------- /webmagic-extension/src/test/resources/log4j.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | -------------------------------------------------------------------------------- /webmagic-scripts/src/main/resources/log4j.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | -------------------------------------------------------------------------------- /webmagic-scripts/src/test/resouces/log4j.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | -------------------------------------------------------------------------------- /webmagic-avalon/forger/src/main/resources/log4j.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 
| 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | -------------------------------------------------------------------------------- /webmagic-samples/src/main/java/us/codecraft/webmagic/samples/scheduler/LevelLimitScheduler.java: -------------------------------------------------------------------------------- 1 | package us.codecraft.webmagic.samples.scheduler; 2 | 3 | import us.codecraft.webmagic.Request; 4 | import us.codecraft.webmagic.Task; 5 | import us.codecraft.webmagic.scheduler.PriorityScheduler; 6 | 7 | /** 8 | * @author code4crafter@gmail.com 9 | */ 10 | public class LevelLimitScheduler extends PriorityScheduler { 11 | 12 | private int levelLimit = 3; 13 | 14 | public LevelLimitScheduler(int levelLimit) { 15 | this.levelLimit = levelLimit; 16 | } 17 | 18 | @Override 19 | public synchronized void push(Request request, Task task) { 20 | if (((Integer) request.getExtra("_level")) <= levelLimit) { 21 | super.push(request, task); 22 | } 23 | } 24 | } 25 | -------------------------------------------------------------------------------- /webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/ResultItemsCollectorPipeline.java: -------------------------------------------------------------------------------- 1 | package us.codecraft.webmagic.pipeline; 2 | 3 | import us.codecraft.webmagic.ResultItems; 4 | import us.codecraft.webmagic.Task; 5 | 6 | import java.util.ArrayList; 7 | import java.util.List; 8 | 9 | /** 10 | * @author code4crafter@gmail.com 11 | * @since 0.4.0 12 | */ 13 | public class ResultItemsCollectorPipeline implements CollectorPipeline { 14 | 15 | private List collector = new ArrayList(); 16 | 17 | @Override 18 | public synchronized void process(ResultItems resultItems, Task task) { 19 | collector.add(resultItems); 20 | } 21 | 22 | @Override 23 | public List getCollected() { 24 | return collector; 25 | } 26 | } 27 | -------------------------------------------------------------------------------- 
/webmagic-avalon/webmagic-admin/src/main/resources/log/log4j.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | -------------------------------------------------------------------------------- /webmagic-extension/src/test/resources/html/mock-webmagic.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 |
    9 |
  • 10 |
  • 11 |
  • 12 |
  • 13 |
14 |
    15 |
  • 16 |
  • 17 |
  • 18 |
  • 19 |
20 | 21 | 22 | -------------------------------------------------------------------------------- /webmagic-avalon/webmagic-worker/src/main/resources/log/log4j.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | -------------------------------------------------------------------------------- /webmagic-core/src/main/java/us/codecraft/webmagic/pipeline/ConsolePipeline.java: -------------------------------------------------------------------------------- 1 | package us.codecraft.webmagic.pipeline; 2 | 3 | import us.codecraft.webmagic.ResultItems; 4 | import us.codecraft.webmagic.Task; 5 | 6 | import java.util.Map; 7 | 8 | /** 9 | * Write results in console.
10 | * Usually used for testing. 11 | * 12 | * @author code4crafter@gmail.com
13 | * @since 0.1.0 14 | */ 15 | public class ConsolePipeline implements Pipeline { 16 | 17 | @Override 18 | public void process(ResultItems resultItems, Task task) { 19 | System.out.println("get page: " + resultItems.getRequest().getUrl()); 20 | for (Map.Entry entry : resultItems.getAll().entrySet()) { 21 | System.out.println(entry.getKey() + ":\t" + entry.getValue()); 22 | } 23 | } 24 | } 25 | -------------------------------------------------------------------------------- /webmagic-extension/src/main/java/us/codecraft/webmagic/monitor/SpiderStatusMXBean.java: -------------------------------------------------------------------------------- 1 | package us.codecraft.webmagic.monitor; 2 | 3 | import java.util.Date; 4 | import java.util.List; 5 | 6 | /** 7 | * @author code4crafer@gmail.com 8 | * @since 0.5.0 9 | */ 10 | public interface SpiderStatusMXBean { 11 | 12 | public String getName(); 13 | 14 | public String getStatus(); 15 | 16 | public int getThread(); 17 | 18 | public int getTotalPageCount(); 19 | 20 | public int getLeftPageCount(); 21 | 22 | public int getSuccessPageCount(); 23 | 24 | public int getErrorPageCount(); 25 | 26 | public List getErrorPages(); 27 | 28 | public void start(); 29 | 30 | public void stop(); 31 | 32 | public Date getStartTime(); 33 | 34 | public int getPagePerSecond(); 35 | } 36 | -------------------------------------------------------------------------------- /webmagic-avalon/webmagic-avalon-common/src/main/resources/config/log/log4j.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | -------------------------------------------------------------------------------- /webmagic-samples/src/main/java/us/codecraft/webmagic/samples/GithubRepo.java: -------------------------------------------------------------------------------- 1 | package us.codecraft.webmagic.samples; 2 | 3 | /** 4 | * @author 
code4crafer@gmail.com 5 | */ 6 | public class GithubRepo { 7 | 8 | private String name; 9 | 10 | private String author; 11 | 12 | private String readme; 13 | 14 | public String getName() { 15 | return name; 16 | } 17 | 18 | public void setName(String name) { 19 | this.name = name; 20 | } 21 | 22 | public String getAuthor() { 23 | return author; 24 | } 25 | 26 | public void setAuthor(String author) { 27 | this.author = author; 28 | } 29 | 30 | public String getReadme() { 31 | return readme; 32 | } 33 | 34 | public void setReadme(String readme) { 35 | this.readme = readme; 36 | } 37 | } -------------------------------------------------------------------------------- /webmagic-samples/src/test/java/us/codecraft/webmagic/samples/scheduler/DelayQueueSchedulerTest.java: -------------------------------------------------------------------------------- 1 | package us.codecraft.webmagic.samples.scheduler; 2 | 3 | import org.junit.Ignore; 4 | import org.junit.Test; 5 | import us.codecraft.webmagic.Request; 6 | 7 | import java.util.concurrent.TimeUnit; 8 | 9 | /** 10 | * @author code4crafter@gmail.com 11 | */ 12 | public class DelayQueueSchedulerTest { 13 | 14 | @Ignore("infinite") 15 | @Test 16 | public void test() { 17 | DelayQueueScheduler delayQueueScheduler = new DelayQueueScheduler(1, TimeUnit.SECONDS); 18 | delayQueueScheduler.push(new Request("1"), null); 19 | while (true){ 20 | Request poll = delayQueueScheduler.poll(null); 21 | System.out.println(System.currentTimeMillis()+"\t"+poll); 22 | } 23 | } 24 | } 25 | -------------------------------------------------------------------------------- /webmagic-extension/src/main/java/us/codecraft/webmagic/utils/ClassUtils.java: -------------------------------------------------------------------------------- 1 | package us.codecraft.webmagic.utils; 2 | 3 | import java.lang.reflect.Field; 4 | import java.util.LinkedHashSet; 5 | import java.util.Set; 6 | 7 | /** 8 | * @author code4crafter@gmail.com 9 | * @since 0.5.0 10 | */ 11 | 
public abstract class ClassUtils { 12 | 13 | public static Set getFieldsIncludeSuperClass(Class clazz) { 14 | Set fields = new LinkedHashSet(); 15 | Class current = clazz; 16 | while (current != null) { 17 | Field[] currentFields = current.getDeclaredFields(); 18 | for (Field currentField : currentFields) { 19 | fields.add(currentField); 20 | } 21 | current = current.getSuperclass(); 22 | } 23 | return fields; 24 | } 25 | 26 | } 27 | -------------------------------------------------------------------------------- /webmagic-core/src/test/java/us/codecraft/webmagic/selector/RegexSelectorTest.java: -------------------------------------------------------------------------------- 1 | package us.codecraft.webmagic.selector; 2 | 3 | import org.assertj.core.api.Assertions; 4 | import org.junit.Test; 5 | 6 | /** 7 | * @author code4crafter@gmail.com
8 | */ 9 | public class RegexSelectorTest { 10 | 11 | @Test(expected = IllegalArgumentException.class) 12 | public void testRegexWithSingleLeftBracket() { 13 | String regex = "\\d+("; 14 | new RegexSelector(regex); 15 | } 16 | 17 | @Test 18 | public void testRegexWithLeftBracketQuoted() { 19 | String regex = "\\(.+"; 20 | String source = "(hello world"; 21 | RegexSelector regexSelector = new RegexSelector(regex); 22 | String select = regexSelector.select(source); 23 | Assertions.assertThat(select).isEqualTo(source); 24 | } 25 | } 26 | -------------------------------------------------------------------------------- /webmagic-core/src/main/java/us/codecraft/webmagic/selector/ElementSelector.java: -------------------------------------------------------------------------------- 1 | package us.codecraft.webmagic.selector; 2 | 3 | import org.jsoup.nodes.Element; 4 | 5 | import java.util.List; 6 | 7 | /** 8 | * Selector(extractor) for html elements.
9 | * 10 | * @author code4crafter@gmail.com
11 | * @since 0.3.0 12 | */ 13 | public interface ElementSelector { 14 | 15 | /** 16 | * Extract a single result from the element.
17 | * If there is more than one result, only the first will be chosen. 18 | * 19 | * @param element element 20 | * @return result 21 | */ 22 | public String select(Element element); 23 | 24 | /** 25 | * Extract all results from the element.
26 | * 27 | * @param element element 28 | * @return results 29 | */ 30 | public List selectList(Element element); 31 | 32 | } 33 | -------------------------------------------------------------------------------- /webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/Scheduler.java: -------------------------------------------------------------------------------- 1 | package us.codecraft.webmagic.scheduler; 2 | 3 | import us.codecraft.webmagic.Request; 4 | import us.codecraft.webmagic.Task; 5 | 6 | /** 7 | * Scheduler is the part of url management.
8 | * You can implement the Scheduler interface to: 9 | * manage urls to fetch 10 | * remove duplicate urls 11 | * 12 | * @author code4crafter@gmail.com
13 | * @since 0.1.0 14 | */ 15 | public interface Scheduler { 16 | 17 | /** 18 | * add a url to fetch 19 | * 20 | * @param request request 21 | * @param task task 22 | */ 23 | public void push(Request request, Task task); 24 | 25 | /** 26 | * get an url to crawl 27 | * 28 | * @param task the task of spider 29 | * @return the url to crawl 30 | */ 31 | public Request poll(Task task); 32 | 33 | } 34 | -------------------------------------------------------------------------------- /webmagic-extension/src/main/resources/spider-config-draft.xml: -------------------------------------------------------------------------------- 1 | 3 | 4 | 5 | utf-8 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | -------------------------------------------------------------------------------- /webmagic-avalon.md: -------------------------------------------------------------------------------- 1 | WebMagic-Avalon项目计划 2 | ======= 3 | WebMagic-Avalon项目的目标是打造一个可配置、可管理的爬虫,以及一个可分享配置/脚本的平台,从而减少熟悉的开发者的开发量,并且让**不熟悉Java技术的人**也能简单的使用一个爬虫。 4 | 5 | ## Part1:webmagic-scripts 6 | 7 | 目标:使得可以用简单脚本的方式编写爬虫,从而为一些常用场景提供可流通的脚本。 8 | 例如:我需要抓github的仓库数据,可以这样写一个脚本(javascript): 9 | 10 | [https://github.com/code4craft/webmagic/tree/master/webmagic-scripts](https://github.com/code4craft/webmagic/tree/master/webmagic-scripts) 11 | 12 | 这个功能目前实现了一部分,但最终结果仍在实验阶段。欢迎大家积极参与并提出意见。 13 | 14 | ## Part2:webmagic-pannel 15 | 16 | 一个集成了加载脚本、管理爬虫的后台。计划中。 17 | 18 | ## Part3:webmagic-market 19 | 20 | 一个可以分享、搜索和下载脚本的站点。计划中。 21 | 22 | ## 如何参与 23 | 24 | webmagic目前由作者业余维护,仅仅为了分享和个人提高,没有任何盈利,也没有商业化打算。 25 | 26 | 欢迎以下几种形式的贡献: 27 | 28 | 1. 为webmagic项目本身提出改进意见,可以通过邮件组、qq、oschina或者在github提交issue(推荐)的方式。 29 | 2. 参与WebMagic-Avalon计划的建设讨论,包括产品设计、技术选型等,可以直接回复这个issue。 30 | 3. 
参与webmagic代码开发,请fork一份代码,修改后提交pull request给我。请使用尽量新的版本,并说明修改内容。pull request接受后,我会将你加为committer,共同参与开发。 -------------------------------------------------------------------------------- /webmagic-avalon/forger/README.md: -------------------------------------------------------------------------------- 1 | forger 2 | ====== 3 | 4 | Dynamic Java object generator with template class and configuration. 5 | 6 | ## Compiler 7 | 8 | Use groovy compiler. Compile source code to Java class. 9 | 10 | ## PropertyLoader 11 | 12 | Load properties of object from user input. 13 | 14 | ## API 15 | 16 | ```java 17 | @Test 18 | public void testForgerCreateByClassAnnotationCompile() throws Exception { 19 | ForgerFactory forgerFactory = new ForgerFactory(new AnnotationPropertyLoader(), new GroovyForgerCompiler()); 20 | Forger forger = forgerFactory.compile(Foo.SOURCE_CODE); 21 | Fooable foo = forger.forge(ImmutableMap.of("fooa", "test")); 22 | Field field = forger.getClazz().getDeclaredField("foo"); 23 | field.setAccessible(true); 24 | assertThat(field.get(foo)).isEqualTo("test"); 25 | assertThat(foo.foo()).isEqualTo("test"); 26 | } 27 | ``` -------------------------------------------------------------------------------- /webmagic-avalon/forger/src/main/java/us/codecraft/forger/ForgerFactory.java: -------------------------------------------------------------------------------- 1 | package us.codecraft.forger; 2 | 3 | import us.codecraft.forger.compiler.ForgerCompiler; 4 | import us.codecraft.forger.property.PropertyLoader; 5 | 6 | /** 7 | * @author code4crafter@gmail.com 8 | */ 9 | public class ForgerFactory { 10 | 11 | private final PropertyLoader propertyLoader; 12 | 13 | private final ForgerCompiler forgerCompiler; 14 | 15 | public ForgerFactory(PropertyLoader propertyLoader, ForgerCompiler forgerCompiler) { 16 | this.propertyLoader = propertyLoader; 17 | this.forgerCompiler = forgerCompiler; 18 | } 19 | 20 | public Forger compile(String sourceCode) { 21 | Class clazz = 
forgerCompiler.compile(sourceCode); 22 | return new Forger(clazz, propertyLoader); 23 | } 24 | 25 | public Forger create(Class clazz) { 26 | return new Forger(clazz, propertyLoader); 27 | } 28 | } 29 | -------------------------------------------------------------------------------- /webmagic-samples/src/main/resources/log4j.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | -------------------------------------------------------------------------------- /webmagic-core/src/test/java/us/codecraft/webmagic/selector/JsonTest.java: -------------------------------------------------------------------------------- 1 | package us.codecraft.webmagic.selector; 2 | 3 | import org.junit.Test; 4 | 5 | import static org.assertj.core.api.Assertions.assertThat; 6 | 7 | /** 8 | * @author code4crafter@gmai.com 9 | * @since 0.5.0 10 | */ 11 | public class JsonTest { 12 | 13 | private String text = "callback({\"name\":\"json\"})"; 14 | 15 | private String textWithBrackerInContent = "callback({\"name\":\"json)\"})"; 16 | 17 | @Test 18 | public void testRemovePadding() throws Exception { 19 | String name = new Json(text).removePadding("callback").jsonPath("$.name").get(); 20 | assertThat(name).isEqualTo("json"); 21 | } 22 | 23 | @Test 24 | public void testRemovePaddingForQuotes() throws Exception { 25 | String name = new Json(textWithBrackerInContent).removePadding("callback").jsonPath("$.name").get(); 26 | assertThat(name).isEqualTo("json)"); 27 | } 28 | } 29 | -------------------------------------------------------------------------------- /webmagic-core/src/main/java/us/codecraft/webmagic/utils/HttpConstant.java: -------------------------------------------------------------------------------- 1 | package us.codecraft.webmagic.utils; 2 | 3 | /** 4 | * Some constants of Http protocal. 
5 | * @author code4crafer@gmail.com 6 | * @since 0.5.0 7 | */ 8 | public abstract class HttpConstant { 9 | 10 | public static abstract class Method { 11 | 12 | public static final String GET = "GET"; 13 | 14 | public static final String HEAD = "HEAD"; 15 | 16 | public static final String POST = "POST"; 17 | 18 | public static final String PUT = "PUT"; 19 | 20 | public static final String DELETE = "DELETE"; 21 | 22 | public static final String TRACE = "TRACE"; 23 | 24 | public static final String CONNECT = "CONNECT"; 25 | 26 | } 27 | 28 | public static abstract class Header { 29 | 30 | public static final String REFERER = "Referer"; 31 | 32 | public static final String USER_AGENT = "User-Agent"; 33 | } 34 | 35 | } 36 | -------------------------------------------------------------------------------- /webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/Language.java: -------------------------------------------------------------------------------- 1 | package us.codecraft.webmagic.scripts; 2 | 3 | /** 4 | * @author code4crafter@gmail.com 5 | */ 6 | public enum Language { 7 | 8 | JavaScript("javascript","js/defines.js",""), 9 | 10 | JRuby("jruby","ruby/defines.rb",""), 11 | 12 | Jython("jython","python/defines.py",""); 13 | 14 | private String engineName; 15 | 16 | private String defineFile; 17 | 18 | private String gatherFile; 19 | 20 | Language(String engineName, String defineFile, String gatherFile) { 21 | this.engineName = engineName; 22 | this.defineFile = defineFile; 23 | this.gatherFile = gatherFile; 24 | } 25 | 26 | public String getEngineName() { 27 | return engineName; 28 | } 29 | 30 | public String getDefineFile() { 31 | return defineFile; 32 | } 33 | 34 | public String getGatherFile() { 35 | return gatherFile; 36 | } 37 | } 38 | -------------------------------------------------------------------------------- /webmagic-avalon/webmagic-avalon-common/src/test/java/us/codecraft/webmagic/dao/DynamicClassDaoTest.java: 
-------------------------------------------------------------------------------- 1 | package us.codecraft.webmagic.dao; 2 | 3 | import org.junit.Test; 4 | import org.springframework.beans.factory.annotation.Autowired; 5 | import org.springframework.test.annotation.Rollback; 6 | import org.springframework.transaction.annotation.Transactional; 7 | import us.codecraft.webmagic.AbstractTest; 8 | import us.codecraft.webmagic.model.DynamicClass; 9 | 10 | /** 11 | * @author code4crafter@gmail.com 12 | */ 13 | public class DynamicClassDaoTest extends AbstractTest { 14 | 15 | @Autowired 16 | private DynamicClassDao dynamicClassDao; 17 | 18 | @Test 19 | @Transactional 20 | @Rollback(true) 21 | public void testAdd() throws Exception { 22 | DynamicClass dynamicClass = new DynamicClass(); 23 | dynamicClass.setClassName("test"); 24 | dynamicClass.setSourceCode("testSource"); 25 | dynamicClassDao.add(dynamicClass); 26 | } 27 | } 28 | -------------------------------------------------------------------------------- /webmagic-extension/src/main/java/us/codecraft/webmagic/model/formatter/DateFormatter.java: -------------------------------------------------------------------------------- 1 | package us.codecraft.webmagic.model.formatter; 2 | 3 | import org.apache.commons.lang3.time.DateUtils; 4 | 5 | import java.util.Date; 6 | 7 | /** 8 | * @author code4crafter@gmail.com 9 | * @since 0.3.2 10 | */ 11 | public class DateFormatter implements ObjectFormatter { 12 | 13 | public static final String[] DEFAULT_PATTERN = new String[]{"yyyy-MM-dd HH:mm"}; 14 | private String[] datePatterns = DEFAULT_PATTERN; 15 | 16 | @Override 17 | public Date format(String raw) throws Exception { 18 | return DateUtils.parseDate(raw, datePatterns); 19 | } 20 | 21 | @Override 22 | public Class clazz() { 23 | return Date.class; 24 | } 25 | 26 | @Override 27 | public void initParam(String[] extra) { 28 | if (extra != null && !(extra.length == 1 && extra[0].length() == 0)) { 29 | datePatterns = extra; 30 | } 31 | 
} 32 | } 33 | -------------------------------------------------------------------------------- /webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/component/DuplicateRemover.java: -------------------------------------------------------------------------------- 1 | package us.codecraft.webmagic.scheduler.component; 2 | 3 | import us.codecraft.webmagic.Request; 4 | import us.codecraft.webmagic.Task; 5 | 6 | /** 7 | * Remove duplicate requests. 8 | * @author code4crafer@gmail.com 9 | * @since 0.5.1 10 | */ 11 | public interface DuplicateRemover { 12 | /** 13 | * 14 | * Check whether the request is duplicate. 15 | * 16 | * @param request request 17 | * @param task task 18 | * @return true if is duplicate 19 | */ 20 | public boolean isDuplicate(Request request, Task task); 21 | 22 | /** 23 | * Reset duplicate check. 24 | * @param task task 25 | */ 26 | public void resetDuplicateCheck(Task task); 27 | 28 | /** 29 | * Get TotalRequestsCount for monitor. 30 | * @param task task 31 | * @return number of total request 32 | */ 33 | public int getTotalRequestsCount(Task task); 34 | 35 | } 36 | -------------------------------------------------------------------------------- /webmagic-avalon/forger/src/main/java/us/codecraft/forger/property/format/DateFormatter.java: -------------------------------------------------------------------------------- 1 | package us.codecraft.forger.property.format; 2 | 3 | import org.apache.commons.lang3.time.DateUtils; 4 | 5 | import java.text.ParseException; 6 | import java.util.Date; 7 | 8 | /** 9 | * @author code4crafter@gmail.com 10 | * @since 0.3.2 11 | */ 12 | public class DateFormatter implements TypeFormatter { 13 | 14 | public static final String[] DEFAULT_PATTERN = new String[]{"yyyy-MM-dd HH:mm"}; 15 | 16 | @Override 17 | public Date format(String text) { 18 | return format(text,DEFAULT_PATTERN); 19 | } 20 | 21 | @Override 22 | public Date format(String text, String[] params) { 23 | try { 24 | return DateUtils.parseDate(text, 
params); 25 | } catch (ParseException e) { 26 | throw new IllegalArgumentException(e); 27 | } 28 | } 29 | 30 | @Override 31 | public Class clazz() { 32 | return Date.class; 33 | } 34 | 35 | } 36 | -------------------------------------------------------------------------------- /webmagic-avalon/forger/src/main/java/us/codecraft/forger/property/format/ObjectFormatterWithParams.java: -------------------------------------------------------------------------------- 1 | package us.codecraft.forger.property.format; 2 | 3 | /** 4 | * @author code4crafter@gmail.com 5 | */ 6 | public class ObjectFormatterWithParams implements ObjectFormatter { 7 | 8 | private TypeFormatter typeFormatter; 9 | 10 | private String[] params; 11 | 12 | public TypeFormatter getTypeFormatter() { 13 | return typeFormatter; 14 | } 15 | 16 | public ObjectFormatterWithParams setTypeFormatter(TypeFormatter typeFormatter) { 17 | this.typeFormatter = typeFormatter; 18 | return this; 19 | } 20 | 21 | public String[] getParams() { 22 | return params; 23 | } 24 | 25 | public ObjectFormatterWithParams setParams(String[] params) { 26 | this.params = params; 27 | return this; 28 | } 29 | 30 | @Override 31 | public T format(String text) { 32 | return typeFormatter.format(text, params); 33 | } 34 | } 35 | -------------------------------------------------------------------------------- /webmagic-core/src/main/java/us/codecraft/webmagic/processor/PageProcessor.java: -------------------------------------------------------------------------------- 1 | package us.codecraft.webmagic.processor; 2 | 3 | import us.codecraft.webmagic.Page; 4 | import us.codecraft.webmagic.Site; 5 | 6 | /** 7 | * Interface to be implemented to customize a crawler.
8 | *
9 | * In PageProcessor, you can customize: 10 | *
11 | * start urls and other settings in {@link Site}
12 | * how the urls to fetch are detected
13 | * how the data are extracted and stored
14 | * 15 | * @author code4crafter@gmail.com
16 | * @see Site 17 | * @see Page 18 | * @since 0.1.0 19 | */ 20 | public interface PageProcessor { 21 | 22 | /** 23 | * process the page, extract urls to fetch, extract the data and store 24 | * 25 | * @param page page 26 | */ 27 | public void process(Page page); 28 | 29 | /** 30 | * get the site settings 31 | * 32 | * @return site 33 | * @see Site 34 | */ 35 | public Site getSite(); 36 | } 37 | -------------------------------------------------------------------------------- /webmagic-samples/src/main/java/us/codecraft/webmagic/samples/NjuBBSProcessor.java: -------------------------------------------------------------------------------- 1 | package us.codecraft.webmagic.samples; 2 | 3 | import us.codecraft.webmagic.Site; 4 | import us.codecraft.webmagic.Page; 5 | import us.codecraft.webmagic.processor.PageProcessor; 6 | 7 | import java.util.List; 8 | 9 | /** 10 | * @author code4crafter@gmail.com
11 | * Date: 13-4-21 12 | * Time: 下午8:08 13 | */ 14 | public class NjuBBSProcessor implements PageProcessor { 15 | @Override 16 | public void process(Page page) { 17 | List requests = page.getHtml().regex("]*href=(bbstcon\\?board=Pictures&file=[^>]*)").all(); 18 | page.addTargetRequests(requests); 19 | page.putField("title",page.getHtml().xpath("//div[@id='content']//h2/a")); 20 | page.putField("content",page.getHtml().smartContent()); 21 | } 22 | 23 | @Override 24 | public Site getSite() { 25 | return Site.me().setDomain("bbs.nju.edu.cn").addStartUrl("http://bbs.nju.edu.cn/board?board=Pictures"); 26 | } 27 | } 28 | -------------------------------------------------------------------------------- /webmagic-avalon/webmagic-worker/src/test/java/us/codecraft/webmagic/worker/WorkerTest.java: -------------------------------------------------------------------------------- 1 | package us.codecraft.webmagic.worker; 2 | 3 | import org.junit.Test; 4 | import us.codecraft.webmagic.Site; 5 | import us.codecraft.webmagic.Spider; 6 | import us.codecraft.webmagic.processor.PageProcessor; 7 | 8 | import static org.assertj.core.api.Assertions.assertThat; 9 | import static org.mockito.Mockito.*; 10 | 11 | /** 12 | * @author code4crafter@gmail.com 13 | */ 14 | public class WorkerTest { 15 | 16 | @Test 17 | public void testWorkerAsSpiderContains() throws Exception { 18 | PageProcessor pageProcessor = mock(PageProcessor.class); 19 | Site site = mock(Site.class); 20 | when(pageProcessor.getSite()).thenReturn(site); 21 | when(site.getDomain()).thenReturn("codecraft.us"); 22 | Worker worker = new Worker(); 23 | Spider spider = Spider.create(pageProcessor); 24 | worker.addSpider(spider); 25 | assertThat(worker.getSpider("codecraft.us")).isEqualTo(spider); 26 | } 27 | } 28 | -------------------------------------------------------------------------------- /webmagic-avalon/forger/src/main/java/us/codecraft/forger/property/SimpleFieldPropertyLoader.java: 
-------------------------------------------------------------------------------- 1 | package us.codecraft.forger.property; 2 | 3 | import java.lang.reflect.Field; 4 | import java.lang.reflect.Modifier; 5 | import java.util.ArrayList; 6 | import java.util.List; 7 | 8 | /** 9 | * @author code4crafter@gmail.com 10 | */ 11 | public class SimpleFieldPropertyLoader extends AbstractPropertyLoader { 12 | 13 | @Override 14 | public List getProperties(Class clazz) { 15 | Field[] fields = clazz.getDeclaredFields(); 16 | List properties = new ArrayList(fields.length); 17 | for (Field field : fields) { 18 | if (Modifier.isStatic(field.getModifiers())){ 19 | continue; 20 | } 21 | if (!field.isAccessible()){ 22 | field.setAccessible(true); 23 | } 24 | properties.add(Property.fromField(field).setObjectFormatter(getObjectFormatter(field))); 25 | } 26 | return properties; 27 | } 28 | } 29 | -------------------------------------------------------------------------------- /webmagic-extension/src/main/java/us/codecraft/webmagic/example/MonitorExample.java: -------------------------------------------------------------------------------- 1 | package us.codecraft.webmagic.example; 2 | 3 | import us.codecraft.webmagic.Spider; 4 | import us.codecraft.webmagic.monitor.SpiderMonitor; 5 | import us.codecraft.webmagic.processor.example.GithubRepoPageProcessor; 6 | import us.codecraft.webmagic.processor.example.OschinaBlogPageProcessor; 7 | 8 | /** 9 | * @author code4crafer@gmail.com 10 | * @since 0.5.0 11 | */ 12 | public class MonitorExample { 13 | 14 | public static void main(String[] args) throws Exception { 15 | 16 | Spider oschinaSpider = Spider.create(new OschinaBlogPageProcessor()) 17 | .addUrl("http://my.oschina.net/flashsword/blog"); 18 | Spider githubSpider = Spider.create(new GithubRepoPageProcessor()) 19 | .addUrl("https://github.com/code4craft"); 20 | 21 | SpiderMonitor.instance().register(oschinaSpider); 22 | SpiderMonitor.instance().register(githubSpider); 23 | 
oschinaSpider.start(); 24 | githubSpider.start(); 25 | } 26 | } 27 | -------------------------------------------------------------------------------- /webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/QQMeishi.java: -------------------------------------------------------------------------------- 1 | package us.codecraft.webmagic.model.samples; 2 | 3 | import us.codecraft.webmagic.Site; 4 | import us.codecraft.webmagic.model.ConsolePageModelPipeline; 5 | import us.codecraft.webmagic.model.OOSpider; 6 | import us.codecraft.webmagic.model.annotation.ExtractBy; 7 | import us.codecraft.webmagic.model.annotation.TargetUrl; 8 | 9 | /** 10 | * @author code4crafter@gmail.com 11 | * @date 14-4-11 12 | */ 13 | @TargetUrl("http://meishi.qq.com/beijing/c/all[\\-p2]*") 14 | @ExtractBy(value = "//ul[@id=\"promos_list2\"]/li",multi = true) 15 | public class QQMeishi { 16 | 17 | @ExtractBy("//div[@class=info]/a[@class=title]/h4/text()") 18 | private String shopName; 19 | 20 | @ExtractBy("//div[@class=info]/a[@class=title]/text()") 21 | private String promo; 22 | 23 | public static void main(String[] args) { 24 | OOSpider.create(Site.me(), new ConsolePageModelPipeline(), QQMeishi.class).addUrl("http://meishi.qq.com/beijing/c/all").thread(4).run(); 25 | } 26 | 27 | } 28 | -------------------------------------------------------------------------------- /webmagic-avalon/forger/src/main/java/us/codecraft/forger/Forger.java: -------------------------------------------------------------------------------- 1 | package us.codecraft.forger; 2 | 3 | import us.codecraft.forger.property.Property; 4 | import us.codecraft.forger.property.PropertyLoader; 5 | 6 | import java.util.List; 7 | import java.util.Map; 8 | 9 | /** 10 | * @author code4crafter@gmail.com 11 | */ 12 | public class Forger { 13 | 14 | private final Class clazz; 15 | 16 | private final PropertyLoader propertyLoader; 17 | 18 | public Forger(Class clazz,PropertyLoader propertyLoader) { 19 | this.clazz = clazz; 20 | 
this.propertyLoader = propertyLoader; 21 | } 22 | 23 | public T forge(Map properties) throws IllegalAccessException, InstantiationException { 24 | T t = clazz.newInstance(); 25 | propertyLoader.load(t, properties); 26 | return t; 27 | } 28 | 29 | public List getPropertyNames() { 30 | return propertyLoader.getProperties(clazz); 31 | } 32 | 33 | public Class getClazz() { 34 | return clazz; 35 | } 36 | } 37 | -------------------------------------------------------------------------------- /webmagic-avalon/webmagic-avalon-common/src/main/resources/config/spring/applicationContext-tx.xml: -------------------------------------------------------------------------------- 1 | 2 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | -------------------------------------------------------------------------------- /webmagic-avalon/webmagic-worker/src/main/java/us/codecraft/webmagic/worker/controller/SpiderController.java: -------------------------------------------------------------------------------- 1 | package us.codecraft.webmagic.worker.controller; 2 | 3 | import org.springframework.beans.factory.annotation.Autowired; 4 | import org.springframework.stereotype.Controller; 5 | import org.springframework.web.bind.annotation.RequestMapping; 6 | import org.springframework.web.bind.annotation.RequestParam; 7 | import org.springframework.web.bind.annotation.ResponseBody; 8 | import us.codecraft.webmagic.worker.Worker; 9 | 10 | import java.util.HashMap; 11 | import java.util.Map; 12 | 13 | /** 14 | * @author code4crafter@gmail.com 15 | */ 16 | @Controller 17 | @RequestMapping("spider") 18 | public class SpiderController { 19 | 20 | @Autowired 21 | private Worker worker; 22 | 23 | @RequestMapping("create") 24 | @ResponseBody 25 | public Map create(@RequestParam("id") String id) { 26 | HashMap map = new HashMap(); 27 | map.put("code", 200); 28 | return map; 29 | } 30 | 31 | } 32 | -------------------------------------------------------------------------------- 
/webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/component/HashSetDuplicateRemover.java: -------------------------------------------------------------------------------- 1 | package us.codecraft.webmagic.scheduler.component; 2 | 3 | import com.google.common.collect.Sets; 4 | import us.codecraft.webmagic.Request; 5 | import us.codecraft.webmagic.Task; 6 | 7 | import java.util.Set; 8 | import java.util.concurrent.ConcurrentHashMap; 9 | 10 | /** 11 | * @author code4crafer@gmail.com 12 | */ 13 | public class HashSetDuplicateRemover implements DuplicateRemover { 14 | 15 | private Set urls = Sets.newSetFromMap(new ConcurrentHashMap()); 16 | 17 | @Override 18 | public boolean isDuplicate(Request request, Task task) { 19 | return !urls.add(getUrl(request)); 20 | } 21 | 22 | protected String getUrl(Request request) { 23 | return request.getUrl(); 24 | } 25 | 26 | @Override 27 | public void resetDuplicateCheck(Task task) { 28 | urls.clear(); 29 | } 30 | 31 | @Override 32 | public int getTotalRequestsCount(Task task) { 33 | return urls.size(); 34 | } 35 | } 36 | -------------------------------------------------------------------------------- /webmagic-selenium/src/test/java/us/codecraft/webmagic/downloader/selenium/WebDriverPoolTest.java: -------------------------------------------------------------------------------- 1 | package us.codecraft.webmagic.downloader.selenium; 2 | 3 | import org.junit.Ignore; 4 | import org.junit.Test; 5 | import org.openqa.selenium.WebDriver; 6 | 7 | /** 8 | * @author code4crafter@gmail.com
9 | * Date: 13-7-26
10 | * Time: 下午2:12
11 | */ 12 | public class WebDriverPoolTest { 13 | 14 | private String chromeDriverPath = "/Users/yihua/Downloads/chromedriver"; 15 | 16 | @Ignore("need chrome driver") 17 | @Test 18 | public void test() { 19 | System.getProperties().setProperty("webdriver.chrome.driver", chromeDriverPath); 20 | WebDriverPool webDriverPool = new WebDriverPool(5); 21 | for (int i = 0; i < 5; i++) { 22 | try { 23 | WebDriver webDriver = webDriverPool.get(); 24 | System.out.println(i); 25 | } catch (InterruptedException e) { 26 | e.printStackTrace(); 27 | } 28 | } 29 | webDriverPool.closeAll(); 30 | } 31 | } 32 | -------------------------------------------------------------------------------- /webmagic-samples/src/main/java/us/codecraft/webmagic/samples/TianyaPageProcesser.java: -------------------------------------------------------------------------------- 1 | package us.codecraft.webmagic.samples; 2 | 3 | import us.codecraft.webmagic.Site; 4 | import us.codecraft.webmagic.Page; 5 | import us.codecraft.webmagic.processor.PageProcessor; 6 | 7 | import java.util.List; 8 | 9 | /** 10 | * @author code4crafter@gmail.com
11 | */ 12 | public class TianyaPageProcesser implements PageProcessor { 13 | 14 | @Override 15 | public void process(Page page) { 16 | List strings = page.getHtml().regex("]*href=[\"']{1}(/post-free.*?\\.shtml)[\"']{1}").all(); 17 | page.addTargetRequests(strings); 18 | page.putField("title", page.getHtml().xpath("//div[@id='post_head']//span[@class='s_title']//b")); 19 | page.putField("body",page.getHtml().smartContent()); 20 | } 21 | 22 | @Override 23 | public Site getSite() { 24 | return Site.me().setDomain("http://bbs.tianya.cn/").addStartUrl("http://bbs.tianya.cn/"); //To change body of implemented methods use File | Settings | File Templates. 25 | } 26 | } 27 | -------------------------------------------------------------------------------- /webmagic-extension/src/test/java/us/codecraft/webmagic/model/GithubRepoTest.java: -------------------------------------------------------------------------------- 1 | package us.codecraft.webmagic.model; 2 | 3 | import org.junit.Test; 4 | import us.codecraft.webmagic.Site; 5 | import us.codecraft.webmagic.Task; 6 | import us.codecraft.webmagic.downloader.MockGithubDownloader; 7 | import us.codecraft.webmagic.pipeline.PageModelPipeline; 8 | 9 | import static org.assertj.core.api.Assertions.assertThat; 10 | 11 | /** 12 | * @author code4crafter@gmail.com
13 | */ 14 | public class GithubRepoTest { 15 | 16 | @Test 17 | public void test() { 18 | OOSpider.create(Site.me().setSleepTime(0) 19 | , new PageModelPipeline() { 20 | @Override 21 | public void process(GithubRepo o, Task task) { 22 | assertThat(o.getStar()).isEqualTo(86); 23 | assertThat(o.getFork()).isEqualTo(70); 24 | } 25 | }, GithubRepo.class).addUrl("https://github.com/code4craft/webmagic").setDownloader(new MockGithubDownloader()).test("https://github.com/code4craft/webmagic"); 26 | } 27 | } 28 | -------------------------------------------------------------------------------- /webmagic-core/src/main/java/us/codecraft/webmagic/downloader/Downloader.java: -------------------------------------------------------------------------------- 1 | package us.codecraft.webmagic.downloader; 2 | 3 | import us.codecraft.webmagic.Page; 4 | import us.codecraft.webmagic.Request; 5 | import us.codecraft.webmagic.Task; 6 | 7 | /** 8 | * Downloader is the part that downloads web pages and store in Page object.
9 | * Downloader has {@link #setThread(int)} method because downloader is always the bottleneck of a crawler, 10 | * there are always some mechanisms such as pooling in downloader, and pool size is related to thread numbers. 11 | * 12 | * @author code4crafter@gmail.com
13 | * @since 0.1.0 14 | */ 15 | public interface Downloader { 16 | 17 | /** 18 | * Downloads web pages and store in Page object. 19 | * 20 | * @param request request 21 | * @param task task 22 | * @return page 23 | */ 24 | public Page download(Request request, Task task); 25 | 26 | /** 27 | * Tell the downloader how many threads the spider used. 28 | * @param threadNum number of threads 29 | */ 30 | public void setThread(int threadNum); 31 | } 32 | -------------------------------------------------------------------------------- /webmagic-avalon/forger/src/test/resources/log4j.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | -------------------------------------------------------------------------------- /webmagic-avalon/forger/src/test/java/us/codecraft/forger/Bar.java: -------------------------------------------------------------------------------- 1 | package us.codecraft.forger; 2 | 3 | import us.codecraft.forger.property.Inject; 4 | import us.codecraft.forger.property.format.Formatter; 5 | 6 | import java.util.List; 7 | import java.util.Map; 8 | 9 | /** 10 | * @author code4crafter@gmail.com 11 | */ 12 | public class Bar { 13 | 14 | @Inject("bar") 15 | private String bar; 16 | 17 | @Inject 18 | private List values; 19 | 20 | @Formatter(value = "", subClazz = Integer.class) 21 | @Inject 22 | private Map idMap; 23 | 24 | public String getBar() { 25 | return bar; 26 | } 27 | 28 | public void setBar(String bar) { 29 | this.bar = bar; 30 | } 31 | 32 | public List getValues() { 33 | return values; 34 | } 35 | 36 | public void setValues(List values) { 37 | this.values = values; 38 | } 39 | 40 | public Map getIdMap() { 41 | return idMap; 42 | } 43 | 44 | public void setIdMap(Map idMap) { 45 | this.idMap = idMap; 46 | } 47 | } 48 | 
-------------------------------------------------------------------------------- /webmagic-extension/src/main/java/us/codecraft/webmagic/handler/PatternRequestMatcher.java: -------------------------------------------------------------------------------- 1 | package us.codecraft.webmagic.handler; 2 | 3 | import us.codecraft.webmagic.Request; 4 | 5 | import java.util.regex.Pattern; 6 | 7 | /** 8 | * Created with IntelliJ IDEA. 9 | * User: Sebastian MA 10 | * Date: April 03, 2014 11 | * Time: 10:00 12 | *

13 | * A PatternHandler is in charge of both page extraction and data processing by implementing 14 | * its two abstract methods. 15 | */ 16 | public abstract class PatternRequestMatcher implements RequestMatcher { 17 | 18 | /** 19 | * match pattern. only matched page should be handled. 20 | */ 21 | protected String pattern; 22 | 23 | private Pattern patternCompiled; 24 | 25 | /** 26 | * @param pattern url pattern to handle 27 | */ 28 | public PatternRequestMatcher(String pattern) { 29 | this.pattern = pattern; 30 | this.patternCompiled = Pattern.compile(pattern); 31 | } 32 | 33 | @Override 34 | public boolean match(Request request) { 35 | return patternCompiled.matcher(request.getUrl()).matches(); 36 | } 37 | } 38 | -------------------------------------------------------------------------------- /webmagic-extension/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | us.codecraft 5 | webmagic-parent 6 | 0.5.4-SNAPSHOT 7 | 8 | 4.0.0 9 | 10 | webmagic-extension 11 | 12 | 13 | 14 | redis.clients 15 | jedis 16 | 2.0.0 17 | 18 | 19 | us.codecraft 20 | webmagic-core 21 | ${project.version} 22 | 23 | 24 | junit 25 | junit 26 | 27 | 28 | 29 | -------------------------------------------------------------------------------- /webmagic-samples/src/main/java/us/codecraft/webmagic/samples/HuxiuProcessor.java: -------------------------------------------------------------------------------- 1 | package us.codecraft.webmagic.samples; 2 | 3 | import us.codecraft.webmagic.Site; 4 | import us.codecraft.webmagic.Page; 5 | import us.codecraft.webmagic.Spider; 6 | import us.codecraft.webmagic.processor.PageProcessor; 7 | 8 | import java.util.List; 9 | 10 | /** 11 | * @author code4crafter@gmail.com
12 | */ 13 | public class HuxiuProcessor implements PageProcessor { 14 | @Override 15 | public void process(Page page) { 16 | List requests = page.getHtml().links().regex(".*article.*").all(); 17 | page.addTargetRequests(requests); 18 | page.putField("title",page.getHtml().xpath("//div[@class='clearfix neirong']//h1/text()")); 19 | page.putField("content",page.getHtml().xpath("//div[@id='neirong_box']/tidyText()")); 20 | } 21 | 22 | @Override 23 | public Site getSite() { 24 | return Site.me().setDomain("www.huxiu.com").addStartUrl("http://www.huxiu.com/"); 25 | } 26 | 27 | public static void main(String[] args) { 28 | Spider.create(new HuxiuProcessor()).run(); 29 | } 30 | 31 | } 32 | -------------------------------------------------------------------------------- /webmagic-extension/pom.xml.versionsBackup: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | us.codecraft 5 | webmagic-parent 6 | 0.5.2 7 | 8 | 4.0.0 9 | 10 | webmagic-extension 11 | 12 | 13 | 14 | redis.clients 15 | jedis 16 | 2.0.0 17 | 18 | 19 | us.codecraft 20 | webmagic-core 21 | ${project.version} 22 | 23 | 24 | junit 25 | junit 26 | 27 | 28 | 29 | -------------------------------------------------------------------------------- /webmagic-extension/pom.xml.releaseBackup: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | us.codecraft 5 | webmagic-parent 6 | 0.5.3-SNAPSHOT 7 | 8 | 4.0.0 9 | 10 | webmagic-extension 11 | 12 | 13 | 14 | redis.clients 15 | jedis 16 | 2.0.0 17 | 18 | 19 | us.codecraft 20 | webmagic-core 21 | ${project.version} 22 | 23 | 24 | junit 25 | junit 26 | 27 | 28 | 29 | -------------------------------------------------------------------------------- /webmagic-extension/src/test/java/us/codecraft/webmagic/model/GithubRepo.java: -------------------------------------------------------------------------------- 1 | package us.codecraft.webmagic.model; 2 | 3 | import us.codecraft.webmagic.Site; 4 
| import us.codecraft.webmagic.model.annotation.ExtractBy; 5 | import us.codecraft.webmagic.model.annotation.HelpUrl; 6 | import us.codecraft.webmagic.model.annotation.TargetUrl; 7 | 8 | /** 9 | * @author code4crafter@gmail.com
10 | * @since 0.3.2 11 | */ 12 | @TargetUrl("https://github.com/\\w+/\\w+") 13 | @HelpUrl({"https://github.com/\\w+\\?tab=repositories", "https://github.com/\\w+", "https://github.com/explore/*"}) 14 | public class GithubRepo extends BaseRepo{ 15 | 16 | @ExtractBy("//ul[@class='pagehead-actions']/li[2]//a[@class='social-count']/text()") 17 | private int fork; 18 | 19 | public static void main(String[] args) { 20 | OOSpider.create(Site.me().setSleepTime(100) 21 | , new ConsolePageModelPipeline(), GithubRepo.class) 22 | .addUrl("https://github.com/code4craft").thread(10).run(); 23 | } 24 | 25 | public int getStar() { 26 | return star; 27 | } 28 | 29 | public int getFork() { 30 | return fork; 31 | } 32 | } 33 | -------------------------------------------------------------------------------- /webmagic-samples/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | webmagic-parent 5 | us.codecraft 6 | 0.5.4-SNAPSHOT 7 | 8 | 4.0.0 9 | 10 | webmagic-samples 11 | 12 | 13 | 14 | us.codecraft 15 | webmagic-core 16 | ${project.version} 17 | 18 | 19 | us.codecraft 20 | webmagic-extension 21 | ${project.version} 22 | 23 | 24 | junit 25 | junit 26 | 27 | 28 | 29 | 30 | -------------------------------------------------------------------------------- /webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/OschinaAnswer.java: -------------------------------------------------------------------------------- 1 | package us.codecraft.webmagic.model.samples; 2 | 3 | import us.codecraft.webmagic.Page; 4 | import us.codecraft.webmagic.Site; 5 | import us.codecraft.webmagic.model.*; 6 | import us.codecraft.webmagic.model.annotation.ExtractBy; 7 | import us.codecraft.webmagic.model.annotation.HelpUrl; 8 | import us.codecraft.webmagic.model.annotation.TargetUrl; 9 | 10 | /** 11 | * @author code4crafter@gmail.com
12 | */ 13 | @TargetUrl("http://www.oschina.net/question/\\d+_\\d+*") 14 | @HelpUrl("http://www.oschina.net/question/*") 15 | @ExtractBy(value = "//ul[@class='list']/li[@class='Answer']", multi = true) 16 | public class OschinaAnswer implements AfterExtractor{ 17 | 18 | @ExtractBy("//img/@title") 19 | private String user; 20 | 21 | @ExtractBy("//div[@class='detail']") 22 | private String content; 23 | 24 | public static void main(String[] args) { 25 | OOSpider.create(Site.me().addStartUrl("http://www.oschina.net/question/567527_120597"), OschinaAnswer.class).run(); 26 | } 27 | 28 | @Override 29 | public void afterProcess(Page page) { 30 | 31 | } 32 | } 33 | -------------------------------------------------------------------------------- /webmagic-avalon/webmagic-avalon-common/src/main/java/us/codecraft/webmagic/model/DynamicClass.java: -------------------------------------------------------------------------------- 1 | package us.codecraft.webmagic.model; 2 | 3 | import java.util.Date; 4 | 5 | /** 6 | * @author code4crafter@gmail.com 7 | */ 8 | public class DynamicClass { 9 | 10 | private String className; 11 | 12 | private String sourceCode; 13 | 14 | private Date addTime; 15 | 16 | private Date updateTime; 17 | 18 | public String getClassName() { 19 | return className; 20 | } 21 | 22 | public void setClassName(String className) { 23 | this.className = className; 24 | } 25 | 26 | public String getSourceCode() { 27 | return sourceCode; 28 | } 29 | 30 | public void setSourceCode(String sourceCode) { 31 | this.sourceCode = sourceCode; 32 | } 33 | 34 | public Date getAddTime() { 35 | return addTime; 36 | } 37 | 38 | public void setAddTime(Date addTime) { 39 | this.addTime = addTime; 40 | } 41 | 42 | public Date getUpdateTime() { 43 | return updateTime; 44 | } 45 | 46 | public void setUpdateTime(Date updateTime) { 47 | this.updateTime = updateTime; 48 | } 49 | } 50 | -------------------------------------------------------------------------------- 
/webmagic-samples/src/main/java/us/codecraft/webmagic/samples/IteyeBlogProcessor.java: -------------------------------------------------------------------------------- 1 | package us.codecraft.webmagic.samples; 2 | 3 | import us.codecraft.webmagic.Page; 4 | import us.codecraft.webmagic.Site; 5 | import us.codecraft.webmagic.Spider; 6 | import us.codecraft.webmagic.processor.PageProcessor; 7 | 8 | /** 9 | * @author code4crafter@gmail.com
10 | */ 11 | public class IteyeBlogProcessor implements PageProcessor { 12 | 13 | private Site site; 14 | 15 | @Override 16 | public void process(Page page) { 17 | page.addTargetRequests(page.getHtml().links().regex(".*yanghaoli\\.iteye\\.com/blog/\\d+").all()); 18 | page.putField("title",page.getHtml().xpath("//title").toString()); 19 | page.putField("content",page.getHtml().smartContent().toString()); 20 | } 21 | 22 | @Override 23 | public Site getSite() { 24 | if (site == null) { 25 | site = Site.me().setDomain("yanghaoli.iteye.com").addStartUrl("http://yanghaoli.iteye.com/"); 26 | } 27 | return site; 28 | } 29 | 30 | public static void main(String[] args) { 31 | Spider.create(new IteyeBlogProcessor()).thread(5).run(); 32 | } 33 | } 34 | -------------------------------------------------------------------------------- /webmagic-extension/src/main/java/us/codecraft/webmagic/model/Extractor.java: -------------------------------------------------------------------------------- 1 | package us.codecraft.webmagic.model; 2 | 3 | import us.codecraft.webmagic.selector.Selector; 4 | 5 | /** 6 | * The object contains 'ExtractBy' information. 7 | * @author code4crafter@gmail.com
8 | * @since 0.2.0 9 | */ 10 | class Extractor { 11 | 12 | protected Selector selector; 13 | 14 | protected final Source source; 15 | 16 | protected final boolean notNull; 17 | 18 | protected final boolean multi; 19 | 20 | static enum Source {Html, Url, RawHtml} 21 | 22 | public Extractor(Selector selector, Source source, boolean notNull, boolean multi) { 23 | this.selector = selector; 24 | this.source = source; 25 | this.notNull = notNull; 26 | this.multi = multi; 27 | } 28 | 29 | Selector getSelector() { 30 | return selector; 31 | } 32 | 33 | Source getSource() { 34 | return source; 35 | } 36 | 37 | boolean isNotNull() { 38 | return notNull; 39 | } 40 | 41 | boolean isMulti() { 42 | return multi; 43 | } 44 | 45 | void setSelector(Selector selector) { 46 | this.selector = selector; 47 | } 48 | } 49 | -------------------------------------------------------------------------------- /webmagic-core/src/test/java/us/codecraft/webmagic/downloader/MockGithubDownloader.java: -------------------------------------------------------------------------------- 1 | package us.codecraft.webmagic.downloader; 2 | 3 | import org.apache.commons.io.IOUtils; 4 | import us.codecraft.webmagic.Page; 5 | import us.codecraft.webmagic.Request; 6 | import us.codecraft.webmagic.Task; 7 | import us.codecraft.webmagic.selector.PlainText; 8 | 9 | import java.io.IOException; 10 | import java.io.InputStream; 11 | 12 | /** 13 | * @author code4crafter@gmail.com 14 | */ 15 | public class MockGithubDownloader implements Downloader { 16 | 17 | @Override 18 | public Page download(Request request, Task task) { 19 | Page page = new Page(); 20 | InputStream resourceAsStream = this.getClass().getResourceAsStream("/html/mock-github.html"); 21 | try { 22 | page.setRawText(IOUtils.toString(resourceAsStream)); 23 | } catch (IOException e) { 24 | e.printStackTrace(); 25 | } 26 | page.setRequest(new Request("https://github.com/code4craft/webmagic")); 27 | page.setUrl(new 
PlainText("https://github.com/code4craft/webmagic")); 28 | return page; 29 | } 30 | 31 | @Override 32 | public void setThread(int threadNum) { 33 | } 34 | } 35 | -------------------------------------------------------------------------------- /webmagic-samples/src/main/java/us/codecraft/webmagic/samples/OschinaPageProcesser.java: -------------------------------------------------------------------------------- 1 | package us.codecraft.webmagic.samples; 2 | 3 | import us.codecraft.webmagic.Site; 4 | import us.codecraft.webmagic.Page; 5 | import us.codecraft.webmagic.processor.PageProcessor; 6 | 7 | import java.util.List; 8 | 9 | /** 10 | * @author code4crafter@gmail.com
11 | */ 12 | public class OschinaPageProcesser implements PageProcessor { 13 | 14 | @Override 15 | public void process(Page page) { 16 | List strings = page.getHtml().regex("]*href=[\"']{1}(http://www\\.oschina\\.net/question/[\\w]+)[\"']{1}").all(); 17 | page.addTargetRequests(strings); 18 | page.putField("title", page.getHtml().xpath("//div[@class='QTitle']/h1/a")); 19 | page.putField("content", page.getHtml().xpath("//div[@class='Question']//div[@class='Content']/div[@class='detail']")); 20 | } 21 | 22 | @Override 23 | public Site getSite() { 24 | return Site.me().setDomain("www.oschina.net").addStartUrl("http://www.oschina.net/"). 25 | setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31"); 26 | } 27 | } 28 | -------------------------------------------------------------------------------- /webmagic-avalon/webmagic-admin/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | webmagic-avalon 5 | us.codecraft 6 | 0.5.4-SNAPSHOT 7 | 8 | 4.0.0 9 | 10 | us.codecraft 11 | webmagic-admin 12 | war 13 | 14 | 15 | 16 | us.codecraft 17 | webmagic-avalon-common 18 | ${project.version} 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | maven-deploy-plugin 27 | 28 | true 29 | 30 | 31 | 32 | 33 | 34 | 35 | -------------------------------------------------------------------------------- /webmagic-avalon/forger/src/main/java/us/codecraft/forger/property/AnnotationPropertyLoader.java: -------------------------------------------------------------------------------- 1 | package us.codecraft.forger.property; 2 | 3 | import java.lang.reflect.Field; 4 | import java.util.ArrayList; 5 | import java.util.List; 6 | 7 | /** 8 | * @author code4crafter@gmail.com 9 | */ 10 | public class AnnotationPropertyLoader extends AbstractPropertyLoader { 11 | 12 | @Override 13 | public List getProperties(Class clazz) { 14 | Field[] fields = clazz.getDeclaredFields(); 15 | List 
properties = new ArrayList(fields.length); 16 | for (Field field : fields) { 17 | Inject inject = field.getAnnotation(Inject.class); 18 | if (inject != null) { 19 | if (!field.isAccessible()) { 20 | field.setAccessible(true); 21 | } 22 | Property property = Property.fromField(field); 23 | if (inject.value().length() > 0) { 24 | property.setName(inject.value()); 25 | } 26 | property.setObjectFormatter(getObjectFormatter(field)); 27 | properties.add(property); 28 | } 29 | } 30 | return properties; 31 | } 32 | } 33 | -------------------------------------------------------------------------------- /webmagic-avalon/webmagic-avalon-common/src/main/resources/sql/mysql/schema.sql: -------------------------------------------------------------------------------- 1 | CREATE TABLE `DynamicClass` ( 2 | `Id` int(11) unsigned NOT NULL AUTO_INCREMENT, 3 | `ClassName` varchar(200) NOT NULL, 4 | `SourceCode` text NOT NULL, 5 | `AddTime` datetime NOT NULL, 6 | `UpdateTime` datetime NOT NULL, 7 | PRIMARY KEY (`Id`), 8 | UNIQUE KEY `un_class_name` (`ClassName`) 9 | ) ENGINE=InnoDB DEFAULT CHARSET=utf8; 10 | 11 | CREATE TABLE `Spider` ( 12 | `Id` int(11) unsigned NOT NULL AUTO_INCREMENT, 13 | `PageProcessorId` int(11) unsigned NOT NULL AUTO_INCREMENT, 14 | `PipelineId` int(11) unsigned NOT NULL AUTO_INCREMENT, 15 | `SchedulerId` int(11) unsigned NOT NULL AUTO_INCREMENT, 16 | `Config` text NOT NULL, 17 | `AddTime` datetime NOT NULL, 18 | `UpdateTime` datetime NOT NULL, 19 | PRIMARY KEY (`Id`), 20 | UNIQUE KEY `un_class_name` (`ClassName`) 21 | ) ENGINE=InnoDB DEFAULT CHARSET=utf8; 22 | 23 | CREATE TABLE `PageProcessor` ( 24 | `Id` int(11) unsigned NOT NULL AUTO_INCREMENT, 25 | `ClassName` varchar(200) NOT NULL, 26 | `Params` text NOT NULL, 27 | `AddTime` datetime NOT NULL, 28 | `UpdateTime` datetime NOT NULL, 29 | PRIMARY KEY (`Id`), 30 | UNIQUE KEY `un_class_name` (`ClassName`) 31 | ) ENGINE=InnoDB DEFAULT CHARSET=utf8; 
-------------------------------------------------------------------------------- /webmagic-extension/src/main/java/us/codecraft/webmagic/model/formatter/ObjectFormatters.java: -------------------------------------------------------------------------------- 1 | package us.codecraft.webmagic.model.formatter; 2 | 3 | import java.util.Map; 4 | import java.util.concurrent.ConcurrentHashMap; 5 | 6 | /** 7 | * @author code4crafter@gmail.com 8 | * @since 0.3.2 9 | */ 10 | public class ObjectFormatters { 11 | 12 | private static Map> formatterMap = new ConcurrentHashMap>(); 13 | 14 | static { 15 | for (Class basicTypeFormatter : BasicTypeFormatter.basicTypeFormatters) { 16 | put(basicTypeFormatter); 17 | } 18 | put(DateFormatter.class); 19 | } 20 | 21 | public static void put(Class objectFormatter) { 22 | try { 23 | formatterMap.put(objectFormatter.newInstance().clazz(), objectFormatter); 24 | } catch (InstantiationException e) { 25 | e.printStackTrace(); 26 | } catch (IllegalAccessException e) { 27 | e.printStackTrace(); 28 | } 29 | } 30 | 31 | public static Class get(Class clazz){ 32 | return formatterMap.get(clazz); 33 | } 34 | } 35 | -------------------------------------------------------------------------------- /webmagic-extension/src/test/java/us/codecraft/webmagic/monitor/SpiderMonitorTest.java: -------------------------------------------------------------------------------- 1 | package us.codecraft.webmagic.monitor; 2 | 3 | import org.junit.Test; 4 | import us.codecraft.webmagic.Spider; 5 | import us.codecraft.webmagic.processor.example.GithubRepoPageProcessor; 6 | import us.codecraft.webmagic.processor.example.OschinaBlogPageProcessor; 7 | 8 | /** 9 | * @author code4crafer@gmail.com 10 | * @since 0.5.0 11 | */ 12 | public class SpiderMonitorTest { 13 | 14 | @Test 15 | public void testInherit() throws Exception { 16 | SpiderMonitor spiderMonitor = new SpiderMonitor(){ 17 | @Override 18 | protected SpiderStatusMXBean getSpiderStatusMBean(Spider spider, 
MonitorSpiderListener monitorSpiderListener) { 19 | return new CustomSpiderStatus(spider, monitorSpiderListener); 20 | } 21 | }; 22 | 23 | Spider oschinaSpider = Spider.create(new OschinaBlogPageProcessor()) 24 | .addUrl("http://my.oschina.net/flashsword/blog").thread(2); 25 | Spider githubSpider = Spider.create(new GithubRepoPageProcessor()) 26 | .addUrl("https://github.com/code4craft"); 27 | 28 | spiderMonitor.register(oschinaSpider, githubSpider); 29 | 30 | } 31 | } 32 | -------------------------------------------------------------------------------- /webmagic-avalon/webmagic-avalon-common/src/main/resources/config/spring/applicationContext-service.xml: -------------------------------------------------------------------------------- 1 | 2 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | -------------------------------------------------------------------------------- /webmagic-avalon/webmagic-avalon-common/src/main/resources/config/spring/applicationContext-component.xml: -------------------------------------------------------------------------------- 1 | 2 | 12 | 13 | 14 | 15 | 16 | 17 | web_messages 18 | 19 | 20 | 21 | 22 | 23 | 24 | -------------------------------------------------------------------------------- /webmagic-avalon/webmagic-admin/src/main/webapp/static/js/jquery.flot.resize.min.js: -------------------------------------------------------------------------------- 1 | (function(n,p,u){var w=n([]),s=n.resize=n.extend(n.resize,{}),o,l="setTimeout",m="resize",t=m+"-special-event",v="delay",r="throttleWindow";s[v]=250;s[r]=true;n.event.special[m]={setup:function(){if(!s[r]&&this[l]){return false}var a=n(this);w=w.add(a);n.data(this,t,{w:a.width(),h:a.height()});if(w.length===1){q()}},teardown:function(){if(!s[r]&&this[l]){return false}var a=n(this);w=w.not(a);a.removeData(t);if(!w.length){clearTimeout(o)}},add:function(b){if(!s[r]&&this[l]){return false}var c;function a(d,h,g){var 
f=n(this),e=n.data(this,t);e.w=h!==u?h:f.width();e.h=g!==u?g:f.height();c.apply(this,arguments)}if(n.isFunction(b)){c=b;return a}else{c=b.handler;b.handler=a}}};function q(){o=p[l](function(){w.each(function(){var d=n(this),a=d.width(),b=d.height(),c=n.data(this,t);if(a!==c.w||b!==c.h){d.trigger(m,[c.w=a,c.h=b])}});q()},s[v])}})(jQuery,this);(function(b){var a={};function c(f){function e(){var h=f.getPlaceholder();if(h.width()==0||h.height()==0){return}f.resize();f.setupGrid();f.draw()}function g(i,h){i.getPlaceholder().resize(e)}function d(i,h){i.getPlaceholder().unbind("resize",e)}f.hooks.bindEvents.push(g);f.hooks.shutdown.push(d)}b.plot.plugins.push({init:c,options:a,name:"resize",version:"1.0"})})(jQuery); -------------------------------------------------------------------------------- /webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/IteyeBlog.java: -------------------------------------------------------------------------------- 1 | package us.codecraft.webmagic.model.samples; 2 | 3 | import us.codecraft.webmagic.Site; 4 | import us.codecraft.webmagic.model.annotation.ExtractBy; 5 | import us.codecraft.webmagic.model.OOSpider; 6 | import us.codecraft.webmagic.model.annotation.TargetUrl; 7 | 8 | /** 9 | * @author code4crafter@gmail.com
10 | * Date: 13-8-2
11 | * Time: 上午7:52
12 | */ 13 | @TargetUrl("http://*.iteye.com/blog/*") 14 | public class IteyeBlog implements Blog{ 15 | 16 | @ExtractBy("//title") 17 | private String title; 18 | 19 | @ExtractBy(value = "div#blog_content",type = ExtractBy.Type.Css) 20 | private String content; 21 | 22 | @Override 23 | public String toString() { 24 | return "IteyeBlog{" + 25 | "title='" + title + '\'' + 26 | ", content='" + content + '\'' + 27 | '}'; 28 | } 29 | 30 | public static void main(String[] args) { 31 | OOSpider.create(Site.me().addStartUrl("http://flashsword20.iteye.com/blog"), IteyeBlog.class).run(); 32 | } 33 | 34 | public String getTitle() { 35 | return title; 36 | } 37 | 38 | public String getContent() { 39 | return content; 40 | } 41 | } 42 | -------------------------------------------------------------------------------- /webmagic-avalon/forger/src/main/java/us/codecraft/forger/property/format/Formatter.java: -------------------------------------------------------------------------------- 1 | package us.codecraft.forger.property.format; 2 | 3 | import java.lang.annotation.ElementType; 4 | import java.lang.annotation.Retention; 5 | import java.lang.annotation.Target; 6 | 7 | /** 8 | * Define how the result string is convert to an object for field. 9 | * 10 | * @author code4crafter@gmail.com
11 | * @since 0.3.2 12 | */ 13 | @Retention(java.lang.annotation.RetentionPolicy.RUNTIME) 14 | @Target({ElementType.FIELD}) 15 | public @interface Formatter { 16 | 17 | /** 18 | * Set formatter params. 19 | * 20 | * @return formatter params 21 | */ 22 | String[] value(); 23 | 24 | /** 25 | * Specific the class of field of class of elements in collection for field.
26 | * It is not necessary to be set because we can detect the class by class of field, 27 | * unless you use a collection as a field.
28 | * 29 | * @return the class of field 30 | */ 31 | Class subClazz() default String.class; 32 | 33 | /** 34 | * If there are more than one formatter for a class, just specify the implement. 35 | * @return implement 36 | */ 37 | Class formatter() default TypeFormatter.class; 38 | 39 | } 40 | -------------------------------------------------------------------------------- /webmagic-core/src/test/java/us/codecraft/webmagic/example/GithubRepoPageProcessorTest.java: -------------------------------------------------------------------------------- 1 | package us.codecraft.webmagic.example; 2 | 3 | import org.junit.Test; 4 | import us.codecraft.webmagic.ResultItems; 5 | import us.codecraft.webmagic.Spider; 6 | import us.codecraft.webmagic.Task; 7 | import us.codecraft.webmagic.downloader.MockGithubDownloader; 8 | import us.codecraft.webmagic.pipeline.Pipeline; 9 | import us.codecraft.webmagic.processor.example.GithubRepoPageProcessor; 10 | 11 | import static org.assertj.core.api.Assertions.assertThat; 12 | 13 | /** 14 | * @author code4crafter@gmail.com 15 | * Date: 16/1/19 16 | * Time: 上午7:27 17 | */ 18 | public class GithubRepoPageProcessorTest { 19 | 20 | @Test 21 | public void test_github() throws Exception { 22 | Spider.create(new GithubRepoPageProcessor()).addPipeline(new Pipeline() { 23 | @Override 24 | public void process(ResultItems resultItems, Task task) { 25 | assertThat(((String) resultItems.get("name")).trim()).isEqualTo("webmagic"); 26 | assertThat(((String) resultItems.get("author")).trim()).isEqualTo("code4craft"); 27 | } 28 | }).setDownloader(new MockGithubDownloader()).test("https://github.com/code4craft/webmagic"); 29 | } 30 | } 31 | -------------------------------------------------------------------------------- /webmagic-extension/src/test/java/us/codecraft/webmagic/scheduler/RedisSchedulerTest.java: -------------------------------------------------------------------------------- 1 | package us.codecraft.webmagic.scheduler; 2 | 3 | import 
org.junit.Before; 4 | import org.junit.Ignore; 5 | import org.junit.Test; 6 | import us.codecraft.webmagic.Request; 7 | import us.codecraft.webmagic.Site; 8 | import us.codecraft.webmagic.Task; 9 | 10 | /** 11 | * @author code4crafter@gmail.com
12 | */ 13 | public class RedisSchedulerTest { 14 | 15 | private RedisScheduler redisScheduler; 16 | 17 | @Before 18 | public void setUp() { 19 | redisScheduler = new RedisScheduler("localhost"); 20 | } 21 | 22 | @Ignore("environment depended") 23 | @Test 24 | public void test() { 25 | Task task = new Task() { 26 | @Override 27 | public String getUUID() { 28 | return "1"; 29 | } 30 | 31 | @Override 32 | public Site getSite() { 33 | return null; 34 | } 35 | }; 36 | Request request = new Request("http://www.ibm.com/developerworks/cn/java/j-javadev2-22/"); 37 | request.putExtra("1","2"); 38 | redisScheduler.push(request, task); 39 | Request poll = redisScheduler.poll(task); 40 | System.out.println(poll); 41 | 42 | } 43 | } 44 | -------------------------------------------------------------------------------- /webmagic-extension/src/main/java/us/codecraft/webmagic/MultiPageModel.java: -------------------------------------------------------------------------------- 1 | package us.codecraft.webmagic; 2 | 3 | import us.codecraft.webmagic.utils.Experimental; 4 | 5 | import java.util.Collection; 6 | 7 | /** 8 | * Extract an object of more than one pages, such as news and articles.
9 | * 10 | * @author code4crafter@gmail.com
11 | * @since 0.2.0 12 | */ 13 | @Experimental 14 | public interface MultiPageModel { 15 | 16 | /** 17 | * Page key is the identifier for the object. 18 | * 19 | * @return page key 20 | */ 21 | public String getPageKey(); 22 | 23 | /** 24 | * page is the identifier of a page in pages for one object. 25 | * 26 | * @return page 27 | */ 28 | public String getPage(); 29 | 30 | /** 31 | * other pages to be extracted.
32 | * It is used to judge whether an object contains more than one page, and whether the pages of the object are all extracted. 33 | * 34 | * @return other pages 35 | */ 36 | public Collection getOtherPages(); 37 | 38 | /** 39 | * Combine multiPageModels to a whole object. 40 | * 41 | * @param multiPageModel multiPageModel 42 | * @return multiPageModel combined 43 | */ 44 | public MultiPageModel combine(MultiPageModel multiPageModel); 45 | 46 | } 47 | -------------------------------------------------------------------------------- /release.properties: -------------------------------------------------------------------------------- 1 | #release configuration 2 | #Thu Jan 21 19:19:07 CST 2016 3 | scm.commentPrefix=[maven-release-plugin] 4 | pushChanges=true 5 | project.dev.us.codecraft\:webmagic-extension=0.5.4-SNAPSHOT 6 | project.scm.us.codecraft\:webmagic-parent.developerConnection=scm\:git\:git@github.com\:code4craft/webmagic.git 7 | project.rel.us.codecraft\:webmagic-core=0.5.3 8 | project.rel.us.codecraft\:webmagic-extension=0.5.3 9 | project.rel.us.codecraft\:webmagic-parent=0.5.3 10 | scm.tag=webmagic-parent-0.5.3 11 | remoteTagging=true 12 | project.scm.us.codecraft\:webmagic-parent.tag=webmagic-parent-0.5.3 13 | exec.additionalArguments=-Psonatype-oss-release -P development 14 | project.dev.us.codecraft\:webmagic-core=0.5.4-SNAPSHOT 15 | scm.url=scm\:git\:git@github.com\:code4craft/webmagic.git 16 | scm.tagNameFormat=@{project.artifactId}-@{project.version} 17 | project.scm.us.codecraft\:webmagic-extension.empty=true 18 | project.scm.us.codecraft\:webmagic-parent.url=git@github.com\:code4craft/webmagic.git 19 | preparationGoals=clean verify 20 | project.scm.us.codecraft\:webmagic-core.empty=true 21 | project.scm.us.codecraft\:webmagic-parent.connection=scm\:git\:git@github.com\:code4craft/webmagic.git 22 | exec.snapshotReleasePluginAllowed=false 23 | project.dev.us.codecraft\:webmagic-parent=0.5.4-SNAPSHOT 24 | completedPhase=end-release 25 | 
/**
 * Base class for multi-key maps. It holds the prototype {@link Map} class from
 * which the nested maps are created.
 *
 * @author yihua.huang
 */
public abstract class MultiKeyMapBase {

    /** Map implementation used when the subclass does not choose one. */
    @SuppressWarnings("rawtypes")
    protected static final Class<? extends Map> DEFAULT_CLAZZ = HashMap.class;

    /** Map implementation instantiated by {@link #newMap()}. */
    @SuppressWarnings("rawtypes")
    private final Class<? extends Map> protoMapClass;

    /** Creates a base that builds {@link HashMap} instances. */
    public MultiKeyMapBase() {
        this(DEFAULT_CLAZZ);
    }

    /**
     * Creates a base that builds instances of the given map class.
     *
     * @param protoMapClass map implementation; must have an accessible no-arg constructor
     */
    @SuppressWarnings("rawtypes")
    public MultiKeyMapBase(Class<? extends Map> protoMapClass) {
        this.protoMapClass = protoMapClass;
    }

    /**
     * Instantiates a fresh, empty map of the prototype class.
     *
     * @param <K>  key type of the new map
     * @param <V2> value type of the new map
     * @return a new empty map
     * @throws IllegalArgumentException if the prototype class cannot be instantiated;
     *                                  the reflective failure is attached as the cause
     */
    @SuppressWarnings("unchecked")
    protected <K, V2> Map<K, V2> newMap() {
        try {
            return (Map<K, V2>) protoMapClass.newInstance();
        } catch (InstantiationException e) {
            // Keep the cause so the caller can see why the class is unusable.
            throw new IllegalArgumentException("wrong proto type map " + protoMapClass, e);
        } catch (IllegalAccessException e) {
            throw new IllegalArgumentException("wrong proto type map " + protoMapClass, e);
        }
    }
}
14 | * Date: 13-8-13
15 | * Time: 上午10:13
16 | */ 17 | @TargetUrl("http://*.alpha.dp/*") 18 | public class DianpingFtlDataScanner implements AfterExtractor { 19 | 20 | @ExtractBy(value = "(DP\\.data\\(\\{.*\\}\\));", type = ExtractBy.Type.Regex, notNull = true, multi = true) 21 | private List data; 22 | 23 | public static void main(String[] args) { 24 | OOSpider.create(Site.me().addStartUrl("http://w.alpha.dp/").setSleepTime(0), DianpingFtlDataScanner.class) 25 | .thread(5).run(); 26 | } 27 | 28 | @Override 29 | public void afterProcess(Page page) { 30 | if (data.size() > 1) { 31 | System.err.println(page.getUrl()); 32 | } 33 | if (data.size() > 0 && data.get(0).length() > 100) { 34 | System.err.println(page.getUrl()); 35 | } 36 | } 37 | } 38 | -------------------------------------------------------------------------------- /webmagic-core/src/main/java/us/codecraft/webmagic/scheduler/QueueScheduler.java: -------------------------------------------------------------------------------- 1 | package us.codecraft.webmagic.scheduler; 2 | 3 | import org.apache.http.annotation.ThreadSafe; 4 | import us.codecraft.webmagic.Request; 5 | import us.codecraft.webmagic.Task; 6 | 7 | import java.util.concurrent.BlockingQueue; 8 | import java.util.concurrent.LinkedBlockingQueue; 9 | 10 | 11 | /** 12 | * Basic Scheduler implementation.
13 | * Store urls to fetch in LinkedBlockingQueue and remove duplicate urls by HashMap. 14 | * 15 | * @author code4crafter@gmail.com
16 | * @since 0.1.0 17 | */ 18 | @ThreadSafe 19 | public class QueueScheduler extends DuplicateRemovedScheduler implements MonitorableScheduler { 20 | 21 | private BlockingQueue queue = new LinkedBlockingQueue(); 22 | 23 | @Override 24 | public void pushWhenNoDuplicate(Request request, Task task) { 25 | queue.add(request); 26 | } 27 | 28 | @Override 29 | public synchronized Request poll(Task task) { 30 | return queue.poll(); 31 | } 32 | 33 | @Override 34 | public int getLeftRequestsCount(Task task) { 35 | return queue.size(); 36 | } 37 | 38 | @Override 39 | public int getTotalRequestsCount(Task task) { 40 | return getDuplicateRemover().getTotalRequestsCount(task); 41 | } 42 | } 43 | -------------------------------------------------------------------------------- /webmagic-core/src/test/java/us/codecraft/webmagic/selector/SelectorTest.java: -------------------------------------------------------------------------------- 1 | package us.codecraft.webmagic.selector; 2 | 3 | import org.junit.Test; 4 | 5 | import java.util.List; 6 | 7 | import static org.assertj.core.api.Assertions.assertThat; 8 | 9 | /** 10 | * @author code4crafter@gmail.com 11 | */ 12 | public class SelectorTest { 13 | 14 | private String html = "

"; 15 | 16 | @Test 17 | public void testChain() throws Exception { 18 | Html selectable = new Html(html); 19 | List linksWithoutChain = selectable.links().all(); 20 | Selectable xpath = selectable.xpath("//div"); 21 | List linksWithChainFirstCall = xpath.links().all(); 22 | List linksWithChainSecondCall = xpath.links().all(); 23 | assertThat(linksWithoutChain).hasSameSizeAs(linksWithChainFirstCall); 24 | assertThat(linksWithChainFirstCall).hasSameSizeAs(linksWithChainSecondCall); 25 | } 26 | 27 | @Test 28 | public void testNodes() throws Exception { 29 | Html selectable = new Html(html); 30 | List links = selectable.xpath("//a").nodes(); 31 | assertThat(links.get(0).links().get()).isEqualTo("http://whatever.com/aaa"); 32 | } 33 | } 34 | -------------------------------------------------------------------------------- /webmagic-samples/src/main/java/us/codecraft/webmagic/samples/F58PageProcesser.java: -------------------------------------------------------------------------------- 1 | package us.codecraft.webmagic.samples; 2 | 3 | import us.codecraft.webmagic.Site; 4 | import us.codecraft.webmagic.Page; 5 | import us.codecraft.webmagic.Spider; 6 | import us.codecraft.webmagic.processor.PageProcessor; 7 | import us.codecraft.webmagic.scheduler.RedisScheduler; 8 | 9 | import java.util.List; 10 | 11 | /** 12 | * @author code4crafter@gmail.com
13 | * Date: 13-4-21 14 | * Time: 下午1:48 15 | */ 16 | public class F58PageProcesser implements PageProcessor { 17 | 18 | @Override 19 | public void process(Page page) { 20 | List strings = page.getHtml().links().regex(".*/yewu/.*").all(); 21 | page.addTargetRequests(strings); 22 | page.putField("title",page.getHtml().regex("(.*)")); 23 | page.putField("body",page.getHtml().xpath("//dd")); 24 | } 25 | 26 | @Override 27 | public Site getSite() { 28 | return Site.me().setDomain("sh.58.com").addStartUrl("http://sh1.51a8.com/").setCycleRetryTimes(2); //To change body of implemented methods use File | Settings | File Templates. 29 | } 30 | 31 | public static void main(String[] args) { 32 | Spider.create(new F58PageProcesser()).setScheduler(new RedisScheduler("localhost")).run(); 33 | } 34 | } 35 | -------------------------------------------------------------------------------- /webmagic-core/src/test/java/us/codecraft/webmagic/pipeline/FilePipelineTest.java: -------------------------------------------------------------------------------- 1 | package us.codecraft.webmagic.pipeline; 2 | 3 | import org.junit.BeforeClass; 4 | import org.junit.Test; 5 | import us.codecraft.webmagic.Request; 6 | import us.codecraft.webmagic.ResultItems; 7 | import us.codecraft.webmagic.Site; 8 | import us.codecraft.webmagic.Task; 9 | 10 | import java.util.UUID; 11 | 12 | /** 13 | * Created by ywooer on 2014/5/6 0006. 
14 | */ 15 | public class FilePipelineTest { 16 | 17 | private static ResultItems resultItems; 18 | private static Task task; 19 | 20 | @BeforeClass 21 | public static void before() { 22 | resultItems = new ResultItems(); 23 | resultItems.put("content", "webmagic 爬虫工具"); 24 | Request request = new Request("http://www.baidu.com"); 25 | resultItems.setRequest(request); 26 | 27 | task = new Task() { 28 | @Override 29 | public String getUUID() { 30 | return UUID.randomUUID().toString(); 31 | } 32 | 33 | @Override 34 | public Site getSite() { 35 | return null; 36 | } 37 | }; 38 | } 39 | @Test 40 | public void testProcess() { 41 | FilePipeline filePipeline = new FilePipeline(); 42 | filePipeline.process(resultItems, task); 43 | } 44 | } 45 | -------------------------------------------------------------------------------- /webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/Formatter.java: -------------------------------------------------------------------------------- 1 | package us.codecraft.webmagic.model.annotation; 2 | 3 | import us.codecraft.webmagic.model.formatter.ObjectFormatter; 4 | 5 | import java.lang.annotation.ElementType; 6 | import java.lang.annotation.Retention; 7 | import java.lang.annotation.Target; 8 | 9 | /** 10 | * Define how the result string is convert to an object for field. 11 | * 12 | * @author code4crafter@gmail.com
13 | * @since 0.3.2 14 | */ 15 | @Retention(java.lang.annotation.RetentionPolicy.RUNTIME) 16 | @Target({ElementType.FIELD}) 17 | public @interface Formatter { 18 | 19 | /** 20 | * Set formatter params. 21 | * 22 | * @return formatter params 23 | */ 24 | String[] value() default ""; 25 | 26 | /** 27 | * Specific the class of field of class of elements in collection for field.
28 | * It is not necessary to be set because we can detect the class by class of field, 29 | * unless you use a collection as a field.
30 | * 31 | * @return the class of field 32 | */ 33 | Class subClazz() default Void.class; 34 | 35 | /** 36 | * If there are more than one formatter for a class, just specify the implement. 37 | * @return implement 38 | */ 39 | Class formatter() default ObjectFormatter.class; 40 | 41 | } 42 | -------------------------------------------------------------------------------- /webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/TargetUrl.java: -------------------------------------------------------------------------------- 1 | package us.codecraft.webmagic.model.annotation; 2 | 3 | import java.lang.annotation.ElementType; 4 | import java.lang.annotation.Retention; 5 | import java.lang.annotation.Target; 6 | 7 | /** 8 | * Define the url patterns for class.
9 | * All urls matching the pattern will be crawled and extracted for new objects.
10 | * 11 | * @author code4crafter@gmail.com
12 | * @since 0.2.0 13 | */ 14 | @Retention(java.lang.annotation.RetentionPolicy.RUNTIME) 15 | @Target({ElementType.TYPE}) 16 | public @interface TargetUrl { 17 | 18 | /** 19 | * The url patterns for class.
20 | * Use regex expression with some changes:
21 | * "." stands for the literal character "." instead of "any character".
22 | * "*" stands for any legal url character repeated 0-n times ([^"'#]*) instead of "any length".
23 | * 24 | * @return the url patterns for class 25 | */ 26 | String[] value(); 27 | 28 | /** 29 | * Define the region for url extracting.
30 | * Only support XPath.
31 | * When sourceRegion is set, the urls will be extracted only from the region instead of entire content.
32 | * 33 | * @return the region for url extracting 34 | */ 35 | String sourceRegion() default ""; 36 | 37 | } 38 | -------------------------------------------------------------------------------- /webmagic-samples/src/main/java/us/codecraft/webmagic/model/samples/BaiduNews.java: -------------------------------------------------------------------------------- 1 | package us.codecraft.webmagic.model.samples; 2 | 3 | import us.codecraft.webmagic.Site; 4 | import us.codecraft.webmagic.model.OOSpider; 5 | import us.codecraft.webmagic.model.annotation.ExtractBy; 6 | 7 | /** 8 | * @author code4crafter@gmail.com 9 | * @date 14-4-9 10 | */ 11 | public class BaiduNews { 12 | 13 | @ExtractBy("//h3[@class='c-title']/a/text()") 14 | private String name; 15 | 16 | @ExtractBy("//div[@class='c-summary']/text()") 17 | private String description; 18 | 19 | @Override 20 | public String toString() { 21 | return "BaiduNews{" + 22 | "name='" + name + '\'' + 23 | ", description='" + description + '\'' + 24 | '}'; 25 | } 26 | 27 | public static void main(String[] args) { 28 | OOSpider ooSpider = OOSpider.create(Site.me().setSleepTime(0), BaiduNews.class); 29 | //single download 30 | BaiduNews baike = ooSpider.get("http://news.baidu.com/ns?tn=news&cl=2&rn=20&ct=1&fr=bks0000&ie=utf-8&word=httpclient"); 31 | System.out.println(baike); 32 | 33 | ooSpider.close(); 34 | } 35 | 36 | public String getName() { 37 | return name; 38 | } 39 | 40 | public String getDescription() { 41 | return description; 42 | } 43 | } -------------------------------------------------------------------------------- /webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/HelpUrl.java: -------------------------------------------------------------------------------- 1 | package us.codecraft.webmagic.model.annotation; 2 | 3 | import java.lang.annotation.ElementType; 4 | import java.lang.annotation.Retention; 5 | import java.lang.annotation.Target; 6 | 7 | /** 8 | * Define the 'help' url patterns for 
class.
9 | * All urls matching the pattern will be crawled but not extracted for new objects.
10 | * 11 | * @author code4crafter@gmail.com
12 | * @since 0.2.0 13 | */ 14 | @Retention(java.lang.annotation.RetentionPolicy.RUNTIME) 15 | @Target({ElementType.TYPE}) 16 | public @interface HelpUrl { 17 | 18 | /** 19 | * The url patterns to crawl.
20 | * Use regex expression with some changes:
21 | * "." stands for the literal character "." instead of "any character".
22 | * "*" stands for any legal url character repeated 0-n times ([^"'#]*) instead of "any length".
23 | * 24 | * @return the url patterns for class 25 | */ 26 | String[] value(); 27 | 28 | /** 29 | * Define the region for url extracting.
30 | * Only support XPath.
31 | * When sourceRegion is set, the urls will be extracted only from the region instead of entire content.
32 | * 33 | * @return the region for url extracting 34 | */ 35 | String sourceRegion() default ""; 36 | } 37 | -------------------------------------------------------------------------------- /webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/ScriptEnginePool.java: -------------------------------------------------------------------------------- 1 | package us.codecraft.webmagic.scripts; 2 | 3 | import javax.script.ScriptEngine; 4 | import javax.script.ScriptEngineManager; 5 | import java.util.concurrent.LinkedBlockingQueue; 6 | import java.util.concurrent.atomic.AtomicInteger; 7 | 8 | /** 9 | * @author code4crafter@gmail.com 10 | * @since 0.4.1 11 | */ 12 | public class ScriptEnginePool { 13 | 14 | private final int size; 15 | 16 | private final AtomicInteger availableCount; 17 | 18 | private final LinkedBlockingQueue scriptEngines = new LinkedBlockingQueue(); 19 | 20 | public ScriptEnginePool(Language language,int size) { 21 | this.size = size; 22 | this.availableCount = new AtomicInteger(size); 23 | for (int i=0;i spiderMap; 24 | 25 | public Worker(int poolSize) { 26 | this.poolSize = poolSize; 27 | this.executorService = initExecutorService(); 28 | this.spiderMap = new ConcurrentHashMap(); 29 | } 30 | 31 | public Worker() { 32 | this(DEFAULT_POOL_SIZE); 33 | } 34 | 35 | protected ExecutorService initExecutorService() { 36 | return Executors.newFixedThreadPool(poolSize); 37 | } 38 | 39 | public void addSpider(Spider spider) { 40 | spider.setExecutorService(executorService); 41 | spiderMap.put(spider.getUUID(), spider); 42 | } 43 | 44 | public Spider getSpider(String uuid){ 45 | return spiderMap.get(uuid); 46 | } 47 | 48 | } 49 | -------------------------------------------------------------------------------- /webmagic-core/src/main/java/us/codecraft/webmagic/selector/ReplaceSelector.java: -------------------------------------------------------------------------------- 1 | package us.codecraft.webmagic.selector; 2 | 3 | import java.util.List; 4 | import 
java.util.regex.Matcher; 5 | import java.util.regex.Pattern; 6 | import java.util.regex.PatternSyntaxException; 7 | 8 | /** 9 | * Replace selector.
10 | * 11 | * @author code4crafter@gmail.com
/**
 * Helpers for discovering the local machine's IP address.
 *
 * @author code4crafer@gmail.com
 * @since 0.5.0
 */
public abstract class IPUtils {

    /**
     * Returns the host address of the first non-loopback IPv4 address found while walking
     * all network interfaces. If only non-loopback IPv6 addresses exist, the last one
     * encountered is returned instead.
     *
     * @return textual IP address, never {@code null}
     * @throws SocketException if the interfaces cannot be enumerated, or if the machine has
     *                         no non-loopback address at all
     */
    public static String getFirstNoLoopbackIPAddresses() throws SocketException {

        Enumeration<NetworkInterface> networkInterfaces = NetworkInterface.getNetworkInterfaces();
        if (networkInterfaces == null) {
            // getNetworkInterfaces() is documented to return null when no interface exists.
            throw new SocketException("no network interface found");
        }

        InetAddress ipv6Fallback = null;
        while (networkInterfaces.hasMoreElements()) {
            NetworkInterface networkInterface = networkInterfaces.nextElement();
            Enumeration<InetAddress> inetAddresses = networkInterface.getInetAddresses();
            while (inetAddresses.hasMoreElements()) {
                InetAddress address = inetAddresses.nextElement();
                if (address.isLoopbackAddress()) {
                    continue;
                }
                if (!(address instanceof Inet6Address)) {
                    // An IPv4 address wins immediately.
                    return address.getHostAddress();
                }
                ipv6Fallback = address;
            }
        }

        if (ipv6Fallback == null) {
            // Previously this path dereferenced null and surfaced as a NullPointerException.
            throw new SocketException("no non-loopback address found");
        }
        return ipv6Fallback.getHostAddress();
    }

}
-------------------------------------------------------------------------------- 1 | package us.codecraft.webmagic; 2 | 3 | import us.codecraft.forger.property.Inject; 4 | import us.codecraft.forger.property.format.Formatter; 5 | 6 | /** 7 | * @author code4crafter@gmail.com 8 | */ 9 | public class Foo { 10 | 11 | @Formatter("") 12 | @Inject("fooa") 13 | private String foo; 14 | 15 | public static final String SOURCE_CODE="package us.codecraft.webmagic;\n" + 16 | "\n" + 17 | "import us.codecraft.forger.property.Inject;\n" + 18 | "import us.codecraft.forger.property.format.Formatter;\n" + 19 | "\n" + 20 | "/**\n" + 21 | " * @author code4crafter@gmail.com\n" + 22 | " */\n" + 23 | "public class Foo {\n" + 24 | "\n" + 25 | " @Formatter(\"\")\n" + 26 | " @Inject(\"fooa\")\n" + 27 | " private String foo;\n" + 28 | "\n" + 29 | " public String getFoo() {\n" + 30 | " return foo;\n" + 31 | " }\n" + 32 | "\n" + 33 | " public String foo() {\n" + 34 | " return foo;\n" + 35 | " }\n" + 36 | "}"; 37 | 38 | public String getFoo() { 39 | return foo; 40 | } 41 | 42 | public String foo() { 43 | return foo; 44 | } 45 | } 46 | -------------------------------------------------------------------------------- /webmagic-core/src/main/java/us/codecraft/webmagic/utils/FilePersistentBase.java: -------------------------------------------------------------------------------- 1 | package us.codecraft.webmagic.utils; 2 | 3 | import java.io.File; 4 | 5 | /** 6 | * Base object of file persistence. 
/**
 * Base object of file persistence.
 *
 * @author code4crafter@gmail.com <br>
 * @since 0.2.0
 */
public class FilePersistentBase {

    /** Base directory, always stored with a trailing {@link #PATH_SEPERATOR}. */
    protected String path;

    /** Platform file separator; defaults to "/" when the system property is missing. */
    public static String PATH_SEPERATOR = "/";

    static {
        String property = System.getProperties().getProperty("file.separator");
        if (property != null) {
            PATH_SEPERATOR = property;
        }
    }

    /**
     * Sets the base directory, appending the separator when it is missing.
     *
     * @param path base directory
     */
    public void setPath(String path) {
        if (!path.endsWith(PATH_SEPERATOR)) {
            path += PATH_SEPERATOR;
        }
        this.path = path;
    }

    /**
     * Returns a {@link File} for the given name after making sure its parent directory exists.
     * The file itself is not created.
     *
     * @param fullName full path of the file
     * @return file handle for {@code fullName}
     */
    public File getFile(String fullName) {
        checkAndMakeParentDirecotry(fullName);
        return new File(fullName);
    }

    /**
     * Creates the directory portion of {@code fullName} (everything before the last separator)
     * if it does not exist yet.
     * <p>
     * Both "/" and the platform separator are recognised, so a name written with forward
     * slashes still gets its parent directory on platforms whose separator differs.
     * The result of {@code mkdirs()} is ignored, as before.
     *
     * @param fullName full path of the file
     */
    public void checkAndMakeParentDirecotry(String fullName) {
        int index = Math.max(fullName.lastIndexOf('/'), fullName.lastIndexOf(PATH_SEPERATOR));
        if (index > 0) {
            File directory = new File(fullName.substring(0, index));
            if (!directory.exists()) {
                directory.mkdirs();
            }
        }
    }

    /**
     * @return base directory including the trailing separator, or {@code null} if never set
     */
    public String getPath() {
        return path;
    }
}
assertThat(xpath("//a/@href").select(html)).isEqualTo("xxx"); 23 | assertThat(regex("a href=\"(.*)\"").select(html)).isEqualTo("xxx"); 24 | assertThat(regex("(a href)=\"(.*)\"", 2).select(html)).isEqualTo("xxx"); 25 | } 26 | 27 | @Test 28 | public void testCombo() { 29 | assertThat(and($("title"), regex("aa(bb)cc")).select(html2)).isEqualTo("bb"); 30 | OrSelector or = or($("div h1 a", "innerHtml"), xpath("//title")); 31 | assertThat(or.select(html)).isEqualTo("aabbcc"); 32 | assertThat(or.select(html2)).isEqualTo("aabbcc"); 33 | } 34 | } 35 | -------------------------------------------------------------------------------- /webmagic-samples/src/test/java/us/codecraft/webmagic/processor/SinablogProcessorTest.java: -------------------------------------------------------------------------------- 1 | package us.codecraft.webmagic.processor; 2 | 3 | import org.junit.Ignore; 4 | import org.junit.Test; 5 | import us.codecraft.webmagic.Spider; 6 | import us.codecraft.webmagic.pipeline.FilePipeline; 7 | import us.codecraft.webmagic.pipeline.JsonFilePipeline; 8 | import us.codecraft.webmagic.samples.SinaBlogProcessor; 9 | import us.codecraft.webmagic.scheduler.FileCacheQueueScheduler; 10 | 11 | import java.io.IOException; 12 | 13 | /** 14 | * @author code4crafter@gmail.com
15 | * Date: 13-6-9 16 | * Time: 上午8:02 17 | */ 18 | public class SinablogProcessorTest { 19 | 20 | @Ignore 21 | @Test 22 | public void test() throws IOException { 23 | SinaBlogProcessor sinaBlogProcessor = new SinaBlogProcessor(); 24 | //pipeline是抓取结束后的处理 25 | //默认放到/data/webmagic/ftl/[domain]目录下 26 | JsonFilePipeline pipeline = new JsonFilePipeline("/data/webmagic/"); 27 | //Spider.me()是简化写法,其实就是new一个啦 28 | //Spider.pipeline()设定一个pipeline,支持链式调用 29 | //ConsolePipeline输出结果到控制台 30 | //FileCacheQueueSchedular保存url,支持断点续传,临时文件输出到/data/temp/webmagic/cache目录 31 | //Spider.run()执行 32 | Spider.create(sinaBlogProcessor).pipeline(new FilePipeline()).pipeline(pipeline).scheduler(new FileCacheQueueScheduler("/data/temp/webmagic/cache/")). 33 | run(); 34 | } 35 | } 36 | -------------------------------------------------------------------------------- /webmagic-selenium/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | webmagic-parent 5 | us.codecraft 6 | 0.5.4-SNAPSHOT 7 | 8 | 4.0.0 9 | 10 | webmagic-selenium 11 | 12 | 13 | 14 | org.seleniumhq.selenium 15 | selenium-java 16 | 2.46.0 17 | 18 | 19 | us.codecraft 20 | webmagic-core 21 | ${project.version} 22 | 23 | 24 | com.github.detro 25 | phantomjsdriver 26 | 1.2.0 27 | 28 | 29 | 30 | 31 | 32 | junit 33 | junit 34 | 35 | 36 | 37 | 38 | 39 | 40 | maven-deploy-plugin 41 | 42 | true 43 | 44 | 45 | 46 | 47 | 48 | 49 | -------------------------------------------------------------------------------- /webmagic-selenium/src/test/java/us/codecraft/webmagic/samples/GooglePlayProcessor.java: -------------------------------------------------------------------------------- 1 | package us.codecraft.webmagic.samples; 2 | 3 | import us.codecraft.webmagic.Page; 4 | import us.codecraft.webmagic.Site; 5 | import us.codecraft.webmagic.Spider; 6 | import us.codecraft.webmagic.downloader.selenium.SeleniumDownloader; 7 | import us.codecraft.webmagic.pipeline.FilePipeline; 8 | import 
us.codecraft.webmagic.processor.PageProcessor; 9 | 10 | /** 11 | * 12 | * Using Selenium with PhantomJS to fetch web-page with JS
13 | * 14 | * @author bob.li.0718@gmail.com
15 | * Date: 15-7-11
16 | */ 17 | public class GooglePlayProcessor implements PageProcessor { 18 | 19 | private Site site; 20 | 21 | @Override 22 | public void process(Page page) { 23 | 24 | page.putField("whole-html", page.getHtml().toString()); 25 | 26 | } 27 | 28 | @Override 29 | public Site getSite() { 30 | if (null == site) { 31 | site = Site.me().setDomain("play.google.com").setSleepTime(300); 32 | } 33 | return site; 34 | } 35 | 36 | public static void main(String[] args) { 37 | Spider.create(new GooglePlayProcessor()) 38 | .thread(5) 39 | .addPipeline( 40 | new FilePipeline( 41 | "/Users/Bingo/Documents/workspace/webmagic/webmagic-selenium/data/")) 42 | .setDownloader(new SeleniumDownloader()) 43 | .addUrl("https://play.google.com/store/apps/details?id=com.tencent.mm") 44 | .runAsync(); 45 | } 46 | } 47 | -------------------------------------------------------------------------------- /webmagic-core/src/main/java/us/codecraft/webmagic/selector/OrSelector.java: -------------------------------------------------------------------------------- 1 | package us.codecraft.webmagic.selector; 2 | 3 | import java.util.ArrayList; 4 | import java.util.List; 5 | 6 | /** 7 | * All extractors will do extracting separately,
8 | * and the results of extractors will be combined as the final result. 9 | * @author code4crafter@gmail.com
10 | * @since 0.2.0 11 | */ 12 | public class OrSelector implements Selector { 13 | 14 | private List selectors = new ArrayList(); 15 | 16 | public OrSelector(Selector... selectors) { 17 | for (Selector selector : selectors) { 18 | this.selectors.add(selector); 19 | } 20 | } 21 | 22 | public OrSelector(List selectors) { 23 | this.selectors = selectors; 24 | } 25 | 26 | @Override 27 | public String select(String text) { 28 | for (Selector selector : selectors) { 29 | String result = selector.select(text); 30 | if (result != null) { 31 | return result; 32 | } 33 | } 34 | return null; 35 | } 36 | 37 | @Override 38 | public List selectList(String text) { 39 | List results = new ArrayList(); 40 | for (Selector selector : selectors) { 41 | List strings = selector.selectList(text); 42 | results.addAll(strings); 43 | } 44 | return results; 45 | } 46 | } 47 | -------------------------------------------------------------------------------- /webmagic-samples/src/main/java/us/codecraft/webmagic/samples/KaichibaProcessor.java: -------------------------------------------------------------------------------- 1 | package us.codecraft.webmagic.samples; 2 | 3 | import us.codecraft.webmagic.Page; 4 | import us.codecraft.webmagic.Site; 5 | import us.codecraft.webmagic.Spider; 6 | import us.codecraft.webmagic.processor.PageProcessor; 7 | 8 | /** 9 | * @author code4crafter@gmail.com
10 | * Date: 13-5-20 11 | * Time: 下午5:31 12 | */ 13 | public class KaichibaProcessor implements PageProcessor { 14 | @Override 15 | public void process(Page page) { 16 | //http://progressdaily.diandian.com/post/2013-01-24/40046867275 17 | int i = Integer.valueOf(page.getUrl().regex("shop/(\\d+)").toString()) + 1; 18 | page.addTargetRequest("http://kaichiba.com/shop/" + i); 19 | page.putField("title",page.getHtml().xpath("//Title")); 20 | page.putField("items", page.getHtml().xpath("//li[@class=\"foodTitle\"]").replace("^\\s+", "").replace("\\s+$", "").replace(".*?", "")); 21 | } 22 | 23 | @Override 24 | public Site getSite() { 25 | return Site.me().setDomain("kaichiba.com").addStartUrl("http://kaichiba.com/shop/41725781").setCharset("utf-8"). 26 | setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31"); 27 | } 28 | 29 | public static void main(String[] args) { 30 | Spider.create(new KaichibaProcessor()).run(); 31 | } 32 | } 33 | -------------------------------------------------------------------------------- /webmagic-extension/src/main/java/us/codecraft/webmagic/example/GithubRepoPageMapper.java: -------------------------------------------------------------------------------- 1 | package us.codecraft.webmagic.example; 2 | 3 | import us.codecraft.webmagic.Page; 4 | import us.codecraft.webmagic.Site; 5 | import us.codecraft.webmagic.Spider; 6 | import us.codecraft.webmagic.model.PageMapper; 7 | import us.codecraft.webmagic.processor.PageProcessor; 8 | 9 | /** 10 | * @author code4crafter@gmail.com
11 | * @since 0.3.2 12 | */ 13 | public class GithubRepoPageMapper implements PageProcessor { 14 | 15 | private Site site = Site.me().setRetryTimes(3).setSleepTime(0); 16 | 17 | private PageMapper githubRepoPageMapper = new PageMapper(GithubRepo.class); 18 | 19 | @Override 20 | public void process(Page page) { 21 | page.addTargetRequests(page.getHtml().links().regex("(https://github\\.com/\\w+/\\w+)").all()); 22 | page.addTargetRequests(page.getHtml().links().regex("(https://github\\.com/\\w+)").all()); 23 | GithubRepo githubRepo = githubRepoPageMapper.get(page); 24 | if (githubRepo == null) { 25 | page.setSkip(true); 26 | } else { 27 | page.putField("repo", githubRepo); 28 | } 29 | 30 | } 31 | 32 | @Override 33 | public Site getSite() { 34 | return site; 35 | } 36 | 37 | public static void main(String[] args) { 38 | Spider.create(new GithubRepoPageMapper()).addUrl("https://github.com/code4craft").thread(5).run(); 39 | } 40 | } -------------------------------------------------------------------------------- /webmagic-extension/src/main/java/us/codecraft/webmagic/model/annotation/ExtractByUrl.java: -------------------------------------------------------------------------------- 1 | package us.codecraft.webmagic.model.annotation; 2 | 3 | import java.lang.annotation.ElementType; 4 | import java.lang.annotation.Retention; 5 | import java.lang.annotation.Target; 6 | 7 | /** 8 | * Define a extractor to extract data in url of current page. Only regex can be used.
9 | * 10 | * @author code4crafter@gmail.com
11 | * @since 0.2.0 12 | */ 13 | @Retention(java.lang.annotation.RetentionPolicy.RUNTIME) 14 | @Target({ElementType.FIELD}) 15 | public @interface ExtractByUrl { 16 | 17 | /** 18 | * Extractor expression, only regex can be used 19 | * 20 | * @return extractor expression 21 | */ 22 | String value() default ""; 23 | 24 | /** 25 | * Define whether the field can be null.
26 | * If set to 'true' and the extractor gets no result, the entire class will be discarded.
27 | * 28 | * @return whether the field can be null 29 | */ 30 | boolean notNull() default false; 31 | 32 | /** 33 | * Define whether the extractor return more than one result. 34 | * When set to 'true', the extractor return a list of string (so you should define the field as List).
35 | * 36 | * Deprecated since 0.4.2. This option is determined automatically by the class of field. 37 | * @deprecated since 0.4.2 38 | * @return whether the extractor return more than one result 39 | */ 40 | boolean multi() default false; 41 | 42 | } 43 | -------------------------------------------------------------------------------- /webmagic-core/src/main/java/us/codecraft/webmagic/processor/SimplePageProcessor.java: -------------------------------------------------------------------------------- 1 | package us.codecraft.webmagic.processor; 2 | 3 | import us.codecraft.webmagic.Page; 4 | import us.codecraft.webmagic.Site; 5 | import us.codecraft.webmagic.utils.UrlUtils; 6 | 7 | import java.util.List; 8 | 9 | /** 10 | * A simple PageProcessor. 11 | * 12 | * @author code4crafter@gmail.com
13 | * @since 0.1.0 14 | */ 15 | public class SimplePageProcessor implements PageProcessor { 16 | 17 | private String urlPattern; 18 | 19 | private Site site; 20 | 21 | public SimplePageProcessor(String startUrl, String urlPattern) { 22 | this.site = Site.me().addStartUrl(startUrl). 23 | setDomain(UrlUtils.getDomain(startUrl)); 24 | //compile "*" expression to regex 25 | this.urlPattern = "(" + urlPattern.replace(".", "\\.").replace("*", "[^\"'#]*") + ")"; 26 | 27 | } 28 | 29 | @Override 30 | public void process(Page page) { 31 | List requests = page.getHtml().links().regex(urlPattern).all(); 32 | //add urls to fetch 33 | page.addTargetRequests(requests); 34 | //extract by XPath 35 | page.putField("title", page.getHtml().xpath("//title")); 36 | page.putField("html", page.getHtml().toString()); 37 | //extract by Readability 38 | page.putField("content", page.getHtml().smartContent()); 39 | } 40 | 41 | @Override 42 | public Site getSite() { 43 | //settings 44 | return site; 45 | } 46 | } 47 | -------------------------------------------------------------------------------- /webmagic-core/src/main/java/us/codecraft/webmagic/processor/example/OschinaBlogPageProcessor.java: -------------------------------------------------------------------------------- 1 | package us.codecraft.webmagic.processor.example; 2 | 3 | import us.codecraft.webmagic.Page; 4 | import us.codecraft.webmagic.Site; 5 | import us.codecraft.webmagic.Spider; 6 | import us.codecraft.webmagic.processor.PageProcessor; 7 | 8 | import java.util.List; 9 | 10 | /** 11 | * @author code4crafter@gmail.com
12 | */ 13 | public class OschinaBlogPageProcessor implements PageProcessor { 14 | 15 | private Site site = Site.me().setDomain("my.oschina.net"); 16 | 17 | @Override 18 | public void process(Page page) { 19 | List links = page.getHtml().links().regex("http://my\\.oschina\\.net/flashsword/blog/\\d+").all(); 20 | page.addTargetRequests(links); 21 | page.putField("title", page.getHtml().xpath("//div[@class='BlogEntity']/div[@class='BlogTitle']/h1/text()").toString()); 22 | if (page.getResultItems().get("title") == null) { 23 | //skip this page 24 | page.setSkip(true); 25 | } 26 | page.putField("content", page.getHtml().smartContent().toString()); 27 | page.putField("tags", page.getHtml().xpath("//div[@class='BlogTags']/a/text()").all()); 28 | } 29 | 30 | @Override 31 | public Site getSite() { 32 | return site; 33 | 34 | } 35 | 36 | public static void main(String[] args) { 37 | Spider.create(new OschinaBlogPageProcessor()).addUrl("http://my.oschina.net/flashsword/blog").run(); 38 | } 39 | } 40 | -------------------------------------------------------------------------------- /webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selectors.java: -------------------------------------------------------------------------------- 1 | package us.codecraft.webmagic.selector; 2 | 3 | /** 4 | * Convenient methods for selectors.
5 | * 6 | * @author code4crafter@gmail.com
7 | * @since 0.2.1 8 | */ 9 | public abstract class Selectors { 10 | 11 | public static RegexSelector regex(String expr) { 12 | return new RegexSelector(expr); 13 | } 14 | 15 | public static RegexSelector regex(String expr, int group) { 16 | return new RegexSelector(expr,group); 17 | } 18 | 19 | public static SmartContentSelector smartContent() { 20 | return new SmartContentSelector(); 21 | } 22 | 23 | public static CssSelector $(String expr) { 24 | return new CssSelector(expr); 25 | } 26 | 27 | public static CssSelector $(String expr, String attrName) { 28 | return new CssSelector(expr, attrName); 29 | } 30 | 31 | public static XpathSelector xpath(String expr) { 32 | return new XpathSelector(expr); 33 | } 34 | 35 | /** 36 | * @deprecated use {@link #xpath(String)} instead; both build an XpathSelector from the same expression 37 | * @see #xpath(String) 38 | * @param expr expr 39 | * @return new selector 40 | */ 41 | public static XpathSelector xsoup(String expr) { 42 | return new XpathSelector(expr); 43 | } 44 | 45 | public static AndSelector and(Selector... selectors) { 46 | return new AndSelector(selectors); 47 | } 48 | 49 | public static OrSelector or(Selector... 
selectors) { 50 | return new OrSelector(selectors); 51 | } 52 | 53 | } -------------------------------------------------------------------------------- /webmagic-avalon/forger/src/test/java/us/codecraft/forger/Foo.java: -------------------------------------------------------------------------------- 1 | package us.codecraft.forger; 2 | 3 | import us.codecraft.forger.property.Inject; 4 | import us.codecraft.forger.property.format.Formatter; 5 | 6 | /** 7 | * @author code4crafter@gmail.com 8 | */ 9 | public class Foo implements Fooable{ 10 | 11 | @Formatter("") 12 | @Inject("fooa") 13 | private String foo; 14 | 15 | public static final String SOURCE_CODE="import us.codecraft.forger.*;\n" + 16 | "import us.codecraft.forger.property.Inject;\n" + 17 | "import us.codecraft.forger.property.Inject;\n" + 18 | "import us.codecraft.forger.property.format.Formatter;\n" + 19 | "\n" + 20 | "/**\n" + 21 | " * @author code4crafter@gmail.com\n" + 22 | " */\n" + 23 | "public class Foo implements Fooable{\n" + 24 | "\n" + 25 | " @Formatter(\"\")\n" + 26 | " @Inject(\"fooa\")\n" + 27 | " private String foo;\n" + 28 | "\n" + 29 | " public String getFoo() {\n" + 30 | " return foo;\n" + 31 | " }\n" + 32 | "\n" + 33 | " @Override\n" + 34 | " public String foo() {\n" + 35 | " return foo;\n" + 36 | " }\n" + 37 | "}"; 38 | 39 | public String getFoo() { 40 | return foo; 41 | } 42 | 43 | @Override 44 | public String foo() { 45 | return foo; 46 | } 47 | } 48 | -------------------------------------------------------------------------------- /webmagic-core/src/main/java/us/codecraft/webmagic/selector/BaseElementSelector.java: -------------------------------------------------------------------------------- 1 | package us.codecraft.webmagic.selector; 2 | 3 | import org.jsoup.Jsoup; 4 | import org.jsoup.nodes.Element; 5 | 6 | import java.util.ArrayList; 7 | import java.util.List; 8 | 9 | /** 10 | * @author code4crafter@gmail.com 11 | * @since 0.3.0 12 | */ 13 | public abstract class 
BaseElementSelector implements Selector, ElementSelector { 14 | 15 | @Override 16 | public String select(String text) { 17 | if (text != null) { 18 | return select(Jsoup.parse(text)); 19 | } 20 | return null; 21 | } 22 | 23 | @Override 24 | public List selectList(String text) { 25 | if (text != null) { 26 | return selectList(Jsoup.parse(text)); 27 | } else { 28 | return new ArrayList(); 29 | } 30 | } 31 | 32 | public Element selectElement(String text) { 33 | if (text != null) { 34 | return selectElement(Jsoup.parse(text)); 35 | } 36 | return null; 37 | } 38 | 39 | public List selectElements(String text) { 40 | if (text != null) { 41 | return selectElements(Jsoup.parse(text)); 42 | } else { 43 | return new ArrayList(); 44 | } 45 | } 46 | 47 | public abstract Element selectElement(Element element); 48 | 49 | public abstract List selectElements(Element element); 50 | 51 | public abstract boolean hasAttribute(); 52 | 53 | } 54 | -------------------------------------------------------------------------------- /webmagic-avalon/webmagic-admin/src/main/webapp/static/css/jquery.cleditor.css: -------------------------------------------------------------------------------- 1 | .cleditorMain {border:1px solid #999; padding:0 1px 1px; background-color:white} 2 | .cleditorMain iframe {border:none; margin:0; padding:0} 3 | .cleditorMain textarea {border:none; margin:0; padding:0; overflow-y:scroll; font:10pt Arial,Verdana; resize:none; outline:none /* webkit grip focus */} 4 | .cleditorToolbar {background: url('../img/toolbar.gif') repeat} 5 | .cleditorGroup {float:left; height:26px} 6 | .cleditorButton {float:left; width:24px; height:24px; margin:1px 0 1px 0; background: url('../img/buttons.gif')} 7 | .cleditorDisabled {opacity:0.3; filter:alpha(opacity=30)} 8 | .cleditorDivider {float:left; width:1px; height:23px; margin:1px 0 1px 0; background:#CCC} 9 | .cleditorPopup {border:solid 1px #999; background-color:white; position:absolute; font:10pt Arial,Verdana; cursor:default; 
z-index:10000} 10 | .cleditorList div {padding:2px 4px 2px 4px} 11 | .cleditorList p, 12 | .cleditorList h1, 13 | .cleditorList h2, 14 | .cleditorList h3, 15 | .cleditorList h4, 16 | .cleditorList h5, 17 | .cleditorList h6, 18 | .cleditorList font {padding:0; margin:0; background-color:Transparent} 19 | .cleditorColor {width:150px; padding:1px 0 0 1px} 20 | .cleditorColor div {float:left; width:14px; height:14px; margin:0 1px 1px 0} 21 | .cleditorPrompt {background-color:#F6F7F9; padding:4px; font-size:8.5pt} 22 | .cleditorPrompt input, 23 | .cleditorPrompt textarea {font:8.5pt Arial,Verdana;} 24 | .cleditorMsg {background-color:#FDFCEE; width:150px; padding:4px; font-size:8.5pt} 25 | -------------------------------------------------------------------------------- /webmagic-scripts/src/test/java/us/codecraft/webmagic/scripts/ScriptProcessorTest.java: -------------------------------------------------------------------------------- 1 | package us.codecraft.webmagic.scripts; 2 | 3 | import org.junit.Ignore; 4 | import org.junit.Test; 5 | import us.codecraft.webmagic.Spider; 6 | 7 | /** 8 | * @author code4crafter@gmail.com 9 | * @since 0.4.1 10 | */ 11 | @Ignore 12 | public class ScriptProcessorTest { 13 | 14 | @Test 15 | public void testJavaScriptProcessor() { 16 | ScriptProcessor pageProcessor = ScriptProcessorBuilder.custom().language(Language.JavaScript).scriptFromClassPathFile("js/oschina.js").build(); 17 | pageProcessor.getSite().setSleepTime(0); 18 | Spider.create(pageProcessor).addUrl("http://my.oschina.net/flashsword/blog").setSpawnUrl(false).run(); 19 | } 20 | 21 | @Test 22 | public void testRubyProcessor() { 23 | ScriptProcessor pageProcessor = ScriptProcessorBuilder.custom().language(Language.JRuby).scriptFromClassPathFile("ruby/oschina.rb").build(); 24 | pageProcessor.getSite().setSleepTime(0); 25 | Spider.create(pageProcessor).addUrl("http://my.oschina.net/flashsword/blog").setSpawnUrl(false).run(); 26 | } 27 | 28 | 29 | @Test 30 | public void 
testPythonProcessor() { 31 | ScriptProcessor pageProcessor = ScriptProcessorBuilder.custom().language(Language.Jython).scriptFromClassPathFile("python/oschina.py").build(); 32 | pageProcessor.getSite().setSleepTime(0); 33 | Spider.create(pageProcessor).addUrl("http://my.oschina.net/flashsword/blog").setSpawnUrl(false).run(); 34 | } 35 | } 36 | -------------------------------------------------------------------------------- /webmagic-avalon/forger/src/main/java/us/codecraft/forger/property/Property.java: -------------------------------------------------------------------------------- 1 | package us.codecraft.forger.property; 2 | 3 | import us.codecraft.forger.property.format.ObjectFormatter; 4 | 5 | import java.lang.reflect.Field; 6 | 7 | /** 8 | * @author code4crafter@gmail.com 9 | */ 10 | public class Property { 11 | 12 | private String name; 13 | 14 | private PropertyType type; 15 | 16 | private Field field; 17 | 18 | private ObjectFormatter objectFormatter; 19 | 20 | public ObjectFormatter getObjectFormatter() { 21 | return objectFormatter; 22 | } 23 | 24 | public Property setObjectFormatter(ObjectFormatter objectFormatter) { 25 | this.objectFormatter = objectFormatter; 26 | return this; 27 | } 28 | 29 | public String getName() { 30 | return name; 31 | } 32 | 33 | public Property setName(String name) { 34 | this.name = name; 35 | return this; 36 | } 37 | 38 | public PropertyType getType() { 39 | return type; 40 | } 41 | 42 | public Property setType(PropertyType type) { 43 | this.type = type; 44 | return this; 45 | } 46 | 47 | public Field getField() { 48 | return field; 49 | } 50 | 51 | public Property setField(Field field) { 52 | this.field = field; 53 | return this; 54 | } 55 | 56 | public static Property fromField(Field field) { 57 | return new Property().setName(field.getName()).setType(PropertyType.from(field.getType())).setField(field); 58 | } 59 | 60 | } 61 | -------------------------------------------------------------------------------- 
/webmagic-avalon/webmagic-avalon-common/src/main/resources/config/mapper/DynamicClass.xml: -------------------------------------------------------------------------------- 1 | 2 | 17 | 19 | 20 | 21 | 22 | 23 | insert into DynamicClass (`ClassName`,`SourceCode`,`AddTime`,`UpdateTime`) 24 | values (#{className},#{sourceCode},now(),now()) 25 | 26 | 27 | 28 | insert into DynamicClass (`ClassName`,`SourceCode`,`AddTime`,`UpdateTime`) 29 | values (#{className},#{sourceCode},now(),now()) 30 | 31 | 32 | -------------------------------------------------------------------------------- /webmagic-extension/src/main/java/us/codecraft/webmagic/handler/CompositePipeline.java: -------------------------------------------------------------------------------- 1 | package us.codecraft.webmagic.handler; 2 | 3 | import us.codecraft.webmagic.ResultItems; 4 | import us.codecraft.webmagic.Task; 5 | import us.codecraft.webmagic.pipeline.Pipeline; 6 | 7 | import java.util.ArrayList; 8 | import java.util.List; 9 | 10 | /** 11 | * @author code4crafer@gmail.com 12 | */ 13 | public class CompositePipeline implements Pipeline { 14 | 15 | private List subPipelines = new ArrayList(); 16 | 17 | @Override 18 | public void process(ResultItems resultItems, Task task) { 19 | for (SubPipeline subPipeline : subPipelines) { 20 | if (subPipeline.match(resultItems.getRequest())) { 21 | RequestMatcher.MatchOther matchOtherProcessorProcessor = subPipeline.processResult(resultItems, task); 22 | if (matchOtherProcessorProcessor == null || matchOtherProcessorProcessor != RequestMatcher.MatchOther.YES) { 23 | return; 24 | } 25 | } 26 | } 27 | } 28 | 29 | public CompositePipeline addSubPipeline(SubPipeline subPipeline) { 30 | this.subPipelines.add(subPipeline); 31 | return this; 32 | } 33 | 34 | public CompositePipeline setSubPipeline(SubPipeline... 
subPipelines) { 35 | this.subPipelines = new ArrayList(); 36 | for (SubPipeline subPipeline : subPipelines) { 37 | this.subPipelines.add(subPipeline); 38 | } 39 | return this; 40 | } 41 | 42 | } 43 | -------------------------------------------------------------------------------- /webmagic-extension/src/main/java/us/codecraft/webmagic/model/FieldExtractor.java: -------------------------------------------------------------------------------- 1 | package us.codecraft.webmagic.model; 2 | 3 | import us.codecraft.webmagic.model.formatter.ObjectFormatter; 4 | import us.codecraft.webmagic.selector.Selector; 5 | 6 | import java.lang.reflect.Field; 7 | import java.lang.reflect.Method; 8 | 9 | /** 10 | * Wrapper of field and extractor. 11 | * @author code4crafter@gmail.com
12 | * @since 0.2.0 13 | */ 14 | class FieldExtractor extends Extractor { 15 | 16 | private final Field field; 17 | 18 | private Method setterMethod; 19 | 20 | private ObjectFormatter objectFormatter; 21 | 22 | public FieldExtractor(Field field, Selector selector, Source source, boolean notNull, boolean multi) { 23 | super(selector, source, notNull, multi); 24 | this.field = field; 25 | } 26 | 27 | Field getField() { 28 | return field; 29 | } 30 | 31 | Selector getSelector() { 32 | return selector; 33 | } 34 | 35 | Source getSource() { 36 | return source; 37 | } 38 | 39 | void setSetterMethod(Method setterMethod) { 40 | this.setterMethod = setterMethod; 41 | } 42 | 43 | Method getSetterMethod() { 44 | return setterMethod; 45 | } 46 | 47 | boolean isNotNull() { 48 | return notNull; 49 | } 50 | 51 | ObjectFormatter getObjectFormatter() { 52 | return objectFormatter; 53 | } 54 | 55 | void setObjectFormatter(ObjectFormatter objectFormatter) { 56 | this.objectFormatter = objectFormatter; 57 | } 58 | } 59 | -------------------------------------------------------------------------------- /webmagic-core/src/main/java/us/codecraft/webmagic/processor/example/GithubRepoPageProcessor.java: -------------------------------------------------------------------------------- 1 | package us.codecraft.webmagic.processor.example; 2 | 3 | import us.codecraft.webmagic.Page; 4 | import us.codecraft.webmagic.Site; 5 | import us.codecraft.webmagic.Spider; 6 | import us.codecraft.webmagic.processor.PageProcessor; 7 | 8 | /** 9 | * @author code4crafter@gmail.com
10 | * @since 0.3.2 11 | */ 12 | public class GithubRepoPageProcessor implements PageProcessor { 13 | 14 | private Site site = Site.me().setRetryTimes(3).setSleepTime(0); 15 | /** Queues the repository and profile links found on the page, then stores the author (from the URL), the repository name and the readme; a page without a repository name is marked as skipped. */ 16 | @Override 17 | public void process(Page page) { 18 | /* the trailing '+' is required: without it only one character of the repository name is captured and the queued URL is truncated */ page.addTargetRequests(page.getHtml().links().regex("(https://github\\.com/[\\w\\-]+/[\\w\\-]+)").all()); 19 | page.addTargetRequests(page.getHtml().links().regex("(https://github\\.com/[\\w\\-]+)").all()); 20 | /* same character class as the link patterns, so hyphenated account names are captured whole */ page.putField("author", page.getUrl().regex("https://github\\.com/([\\w\\-]+)/.*").toString()); 21 | page.putField("name", page.getHtml().xpath("//h1[@class='entry-title public']/strong/a/text()").toString()); 22 | if (page.getResultItems().get("name")==null){ 23 | /* skip this page */ 24 | page.setSkip(true); 25 | } 26 | page.putField("readme", page.getHtml().xpath("//div[@id='readme']/tidyText()")); 27 | } 28 | 29 | @Override 30 | public Site getSite() { 31 | return site; 32 | } 33 | 34 | public static void main(String[] args) { 35 | Spider.create(new GithubRepoPageProcessor()).addUrl("https://github.com/code4craft").thread(5).run(); 36 | } 37 | } 38 | -------------------------------------------------------------------------------- /webmagic-core/src/main/java/us/codecraft/webmagic/selector/XpathSelector.java: -------------------------------------------------------------------------------- 1 | package us.codecraft.webmagic.selector; 2 | 3 | import org.apache.commons.collections.CollectionUtils; 4 | import org.jsoup.nodes.Element; 5 | import us.codecraft.xsoup.XPathEvaluator; 6 | import us.codecraft.xsoup.Xsoup; 7 | 8 | import java.util.List; 9 | 10 | /** 11 | * XPath selector based on Xsoup.
12 | * 13 | * @author code4crafter@gmail.com
14 | * @since 0.3.0 15 | */ 16 | public class XpathSelector extends BaseElementSelector { 17 | 18 | private XPathEvaluator xPathEvaluator; 19 | 20 | public XpathSelector(String xpathStr) { 21 | this.xPathEvaluator = Xsoup.compile(xpathStr); 22 | } 23 | 24 | @Override 25 | public String select(Element element) { 26 | return xPathEvaluator.evaluate(element).get(); 27 | } 28 | 29 | @Override 30 | public List selectList(Element element) { 31 | return xPathEvaluator.evaluate(element).list(); 32 | } 33 | 34 | @Override 35 | public Element selectElement(Element element) { 36 | List elements = selectElements(element); 37 | if (CollectionUtils.isNotEmpty(elements)){ 38 | return elements.get(0); 39 | } 40 | return null; 41 | } 42 | 43 | @Override 44 | public List selectElements(Element element) { 45 | return xPathEvaluator.evaluate(element).getElements(); 46 | } 47 | 48 | @Override 49 | public boolean hasAttribute() { 50 | return xPathEvaluator.hasAttribute(); 51 | } 52 | } 53 | -------------------------------------------------------------------------------- /webmagic-extension/src/test/java/us/codecraft/webmagic/processor/GithubRepoProcessor.java: -------------------------------------------------------------------------------- 1 | package us.codecraft.webmagic.processor; 2 | 3 | import junit.framework.Assert; 4 | import org.junit.Test; 5 | import us.codecraft.webmagic.*; 6 | import us.codecraft.webmagic.downloader.MockGithubDownloader; 7 | import us.codecraft.webmagic.model.OOSpider; 8 | import us.codecraft.webmagic.pipeline.Pipeline; 9 | 10 | /** 11 | * @author code4crafter@gmail.com 12 | */ 13 | public class GithubRepoProcessor implements PageProcessor { 14 | @Override 15 | public void process(Page page) { 16 | page.putField("star",page.getHtml().xpath("//ul[@class='pagehead-actions']/li[2]//a[@class='social-count js-social-count']/text()").toString()); 17 | 
page.putField("fork",page.getHtml().xpath("//ul[@class='pagehead-actions']/li[3]//a[@class='social-count']/text()").toString()); 18 | } 19 | 20 | @Override 21 | public Site getSite() { 22 | return Site.me().addStartUrl("https://github.com/code4craft/webmagic"); 23 | } 24 | 25 | @Test 26 | public void test() { 27 | OOSpider.create(new GithubRepoProcessor()).addPipeline(new Pipeline() { 28 | @Override 29 | public void process(ResultItems resultItems, Task task) { 30 | Assert.assertEquals("78",((String)resultItems.get("star")).trim()); 31 | Assert.assertEquals("65",((String)resultItems.get("fork")).trim()); 32 | } 33 | }).setDownloader(new MockGithubDownloader()).test("https://github.com/code4craft/webmagic"); 34 | } 35 | 36 | } 37 | -------------------------------------------------------------------------------- /webmagic-extension/src/main/java/us/codecraft/webmagic/example/AppStore.java: -------------------------------------------------------------------------------- 1 | package us.codecraft.webmagic.example; 2 | 3 | import us.codecraft.webmagic.Site; 4 | import us.codecraft.webmagic.model.OOSpider; 5 | import us.codecraft.webmagic.model.annotation.ExtractBy; 6 | import us.codecraft.webmagic.utils.Experimental; 7 | 8 | import java.util.List; 9 | 10 | /** 11 | * @author code4crafter@gmail.com 12 | * @since 0.4.1 13 | */ 14 | @Experimental 15 | public class AppStore { 16 | 17 | @ExtractBy(type = ExtractBy.Type.JsonPath, value = "$..trackName") 18 | private String trackName; 19 | 20 | @ExtractBy(type = ExtractBy.Type.JsonPath, value = "$..description") 21 | private String description; 22 | 23 | @ExtractBy(type = ExtractBy.Type.JsonPath, value = "$..userRatingCount") 24 | private int userRatingCount; 25 | 26 | @ExtractBy(type = ExtractBy.Type.JsonPath, value = "$..screenshotUrls") 27 | private List screenshotUrls; 28 | 29 | @ExtractBy(type = ExtractBy.Type.JsonPath, value = "$..supportedDevices") 30 | private List supportedDevices; 31 | 32 | public static void 
main(String[] args) { 33 | AppStore appStore = OOSpider.create(Site.me(), AppStore.class).get("http://itunes.apple.com/lookup?id=653350791&country=cn&entity=software"); 34 | System.out.println(appStore.trackName); 35 | System.out.println(appStore.description); 36 | System.out.println(appStore.userRatingCount); 37 | System.out.println(appStore.screenshotUrls); 38 | System.out.println(appStore.supportedDevices); 39 | } 40 | } 41 | -------------------------------------------------------------------------------- /webmagic-samples/src/main/java/us/codecraft/webmagic/samples/QzoneBlogProcessor.java: -------------------------------------------------------------------------------- 1 | package us.codecraft.webmagic.samples; 2 | 3 | import us.codecraft.webmagic.Page; 4 | import us.codecraft.webmagic.Site; 5 | import us.codecraft.webmagic.processor.PageProcessor; 6 | 7 | import java.util.List; 8 | 9 | /** 10 | * @author code4crafter@gmail.com
11 | */ 12 | public class QzoneBlogProcessor implements PageProcessor { 13 | @Override 14 | public void process(Page page) { 15 | //http://progressdaily.diandian.com/post/2013-01-24/40046867275 16 | 17 | //http://b1.cnc.qzone.qq.com/cgi-bin/blognew/get_abs?hostUin=233017404&uin=233017404&blogType=0&statYear=2013&source=0&statYear=2013&g_tk=291639571&g_tk=291639571&reqInfo=7&pos=0&num=15&source=0&rand=0.46480297949165106 18 | // &cateName=&cateHex=&statYear=2013&reqInfo=7&pos=0&num=15&sortType=0&source=0&rand=0.46480297949165106&g_tk=291639571&verbose=1&ref=qzone 19 | List requests = page.getHtml().regex("]*href=[\"']{1}(http://17dujingdian\\.com/post/[^#]*?)[\"']{1}").all(); 20 | page.addTargetRequests(requests); 21 | page.putField("title",page.getHtml().xpath("//div[@id='content']//h2/a")); 22 | page.putField("content",page.getHtml().smartContent()); 23 | } 24 | 25 | @Override 26 | public Site getSite() { 27 | return Site.me().setDomain("www.diandian.com").addStartUrl("http://17dujingdian.com/"). 
28 | setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31"); 29 | } 30 | } 31 | -------------------------------------------------------------------------------- /webmagic-avalon/webmagic-avalon-common/src/main/resources/config/spring/applicationContext-myBatis.xml: -------------------------------------------------------------------------------- 1 | 2 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | sqlserver 15 | db2 16 | oracle 17 | mysql 18 | h2 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | -------------------------------------------------------------------------------- /webmagic-avalon/webmagic-avalon-common/src/main/resources/config/spring/applicationContext-webmvc.xml: -------------------------------------------------------------------------------- 1 | 2 | 12 | 13 | 14 | 15 | 16 | 18 | 19 | 20 | text/html;charset=UTF-8 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | -------------------------------------------------------------------------------- /webmagic-avalon/webmagic-avalon-common/src/main/java/us/codecraft/webmagic/service/impl/DynamicClassServiceImpl.java: -------------------------------------------------------------------------------- 1 | package us.codecraft.webmagic.service.impl; 2 | 3 | import org.codehaus.groovy.control.CompilationFailedException; 4 | import org.springframework.beans.factory.annotation.Autowired; 5 | import org.springframework.stereotype.Service; 6 | import us.codecraft.forger.Forger; 7 | import us.codecraft.forger.ForgerFactory; 8 | import us.codecraft.webmagic.dao.DynamicClassDao; 9 | import us.codecraft.webmagic.exception.DynamicClassCompileException; 10 | import us.codecraft.webmagic.model.DynamicClass; 11 | import us.codecraft.webmagic.service.DynamicClassService; 12 | 13 | /** 14 | * @author code4crafter@gmail.com 15 | */ 16 | @Service 17 | public class DynamicClassServiceImpl implements DynamicClassService 
{ 18 | 19 | @Autowired 20 | private DynamicClassDao dynamicClassDao; 21 | 22 | @Autowired 23 | private ForgerFactory forgerFactory; 24 | 25 | @Override 26 | public Class compileAndSave(String sourceCode) throws DynamicClassCompileException { 27 | Forger forger; 28 | try { 29 | forger = forgerFactory.compile(sourceCode); 30 | } catch (CompilationFailedException e) { 31 | throw new DynamicClassCompileException(e.getMessage(),e); 32 | } 33 | String className = forger.getClazz().getCanonicalName(); 34 | DynamicClass dynamicClass = new DynamicClass(); 35 | dynamicClass.setClassName(className); 36 | dynamicClass.setSourceCode(sourceCode); 37 | dynamicClassDao.add(dynamicClass); 38 | return forger.getClazz(); 39 | } 40 | 41 | } 42 | -------------------------------------------------------------------------------- /webmagic-avalon/webmagic-admin/src/main/webapp/static/css/fullcalendar.print.css: -------------------------------------------------------------------------------- 1 | /* 2 | * FullCalendar v1.5.3 Print Stylesheet 3 | * 4 | * Include this stylesheet on your page to get a more printer-friendly calendar. 5 | * When including this stylesheet, use the media='print' attribute of the tag. 6 | * Make sure to include this stylesheet IN ADDITION to the regular fullcalendar.css. 7 | * 8 | * Copyright (c) 2011 Adam Shaw 9 | * Dual licensed under the MIT and GPL licenses, located in 10 | * MIT-LICENSE.txt and GPL-LICENSE.txt respectively. 
11 | * 12 | * Date: Mon Feb 6 22:40:40 2012 -0800 13 | * 14 | */ 15 | 16 | 17 | /* Events 18 | -----------------------------------------------------*/ 19 | 20 | .fc-event-skin { 21 | background: none !important; 22 | color: #000 !important; 23 | } 24 | 25 | /* horizontal events */ 26 | 27 | .fc-event-hori { 28 | border-width: 0 0 1px 0 !important; 29 | border-bottom-style: dotted !important; 30 | border-bottom-color: #000 !important; 31 | padding: 1px 0 0 0 !important; 32 | } 33 | 34 | .fc-event-hori .fc-event-inner { 35 | border-width: 0 !important; 36 | padding: 0 1px !important; 37 | } 38 | 39 | /* vertical events */ 40 | 41 | .fc-event-vert { 42 | border-width: 0 0 0 1px !important; 43 | border-left-style: dotted !important; 44 | border-left-color: #000 !important; 45 | padding: 0 1px 0 0 !important; 46 | } 47 | 48 | .fc-event-vert .fc-event-inner { 49 | border-width: 0 !important; 50 | padding: 1px 0 !important; 51 | } 52 | 53 | .fc-event-bg { 54 | display: none !important; 55 | } 56 | 57 | .fc-event .ui-resizable-handle { 58 | display: none !important; 59 | } 60 | 61 | 62 | -------------------------------------------------------------------------------- /webmagic-samples/src/main/java/us/codecraft/webmagic/samples/InfoQMiniBookProcessor.java: -------------------------------------------------------------------------------- 1 | package us.codecraft.webmagic.samples; 2 | 3 | import org.apache.commons.collections.CollectionUtils; 4 | import us.codecraft.webmagic.Page; 5 | import us.codecraft.webmagic.Site; 6 | import us.codecraft.webmagic.Spider; 7 | import us.codecraft.webmagic.processor.PageProcessor; 8 | 9 | import java.util.List; 10 | 11 | /** 12 | * @author code4crafter@gmail.com
13 | */ 14 | public class InfoQMiniBookProcessor implements PageProcessor { 15 | 16 | private Site site; 17 | 18 | @Override 19 | public void process(Page page) { 20 | page.addTargetRequests(page.getHtml().links().regex("http://www\\.infoq\\.com/cn/minibooks/.*").all()); 21 | List all = page.getHtml().links().regex(".*\\.pdf").all(); 22 | if (CollectionUtils.isNotEmpty(all)) { 23 | page.putField("pdf", all); 24 | } else { 25 | page.getResultItems().setSkip(true); 26 | } 27 | } 28 | 29 | @Override 30 | public Site getSite() { 31 | if (site == null) { 32 | site = Site.me().setDomain("www.infoq.com").addStartUrl("http://www.infoq.com/cn/minibooks").addCookie("RegisteredUserCookie", "sDDDc8dIAgZSq67uJSXhtpQaHEi1XDOH"). 33 | setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31"); 34 | } 35 | return site; 36 | } 37 | 38 | public static void main(String[] args) { 39 | Spider.create(new InfoQMiniBookProcessor()) 40 | .thread(5) 41 | .run(); 42 | } 43 | } 44 | -------------------------------------------------------------------------------- /webmagic-saxon/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | webmagic-parent 5 | us.codecraft 6 | 0.5.4-SNAPSHOT 7 | 8 | 4.0.0 9 | 10 | webmagic-saxon 11 | 12 | 13 | 14 | us.codecraft 15 | webmagic-core 16 | ${project.version} 17 | 18 | 19 | net.sourceforge.htmlcleaner 20 | htmlcleaner 21 | 2.5 22 | 23 | 24 | net.sf.saxon 25 | Saxon-HE 26 | 9.5.1-1 27 | 28 | 29 | junit 30 | junit 31 | 32 | 33 | 34 | 35 | 36 | 37 | maven-deploy-plugin 38 | 39 | true 40 | 41 | 42 | 43 | 44 | 45 | 46 | -------------------------------------------------------------------------------- /webmagic-extension/src/main/java/us/codecraft/webmagic/model/PageModelCollectorPipeline.java: -------------------------------------------------------------------------------- 1 | package us.codecraft.webmagic.model; 2 | 3 | import 
us.codecraft.webmagic.ResultItems; 4 | import us.codecraft.webmagic.Task; 5 | import us.codecraft.webmagic.model.annotation.ExtractBy; 6 | import us.codecraft.webmagic.pipeline.CollectorPageModelPipeline; 7 | import us.codecraft.webmagic.pipeline.CollectorPipeline; 8 | 9 | import java.lang.annotation.Annotation; 10 | import java.util.List; 11 | 12 | /** 13 | * @author code4crafter@gmail.com 14 | * @since 0.4.0 15 | */ 16 | class PageModelCollectorPipeline implements CollectorPipeline { 17 | 18 | private final CollectorPageModelPipeline classPipeline = new CollectorPageModelPipeline(); 19 | 20 | private final Class clazz; 21 | 22 | PageModelCollectorPipeline(Class clazz) { 23 | this.clazz = clazz; 24 | } 25 | 26 | @Override 27 | public List getCollected() { 28 | return classPipeline.getCollected(); 29 | } 30 | 31 | @Override 32 | public synchronized void process(ResultItems resultItems, Task task) { 33 | Object o = resultItems.get(clazz.getCanonicalName()); 34 | if (o != null) { 35 | Annotation annotation = clazz.getAnnotation(ExtractBy.class); 36 | if (annotation == null || !((ExtractBy) annotation).multi()) { 37 | classPipeline.process((T) o, task); 38 | } else { 39 | List list = (List) o; 40 | for (Object o1 : list) { 41 | classPipeline.process((T) o1, task); 42 | } 43 | } 44 | } 45 | } 46 | } 47 | -------------------------------------------------------------------------------- /webmagic-extension/src/main/java/us/codecraft/webmagic/pipeline/JsonFilePipeline.java: -------------------------------------------------------------------------------- 1 | package us.codecraft.webmagic.pipeline; 2 | 3 | import com.alibaba.fastjson.JSON; 4 | import org.apache.commons.codec.digest.DigestUtils; 5 | import org.slf4j.Logger; 6 | import org.slf4j.LoggerFactory; 7 | import us.codecraft.webmagic.ResultItems; 8 | import us.codecraft.webmagic.Task; 9 | import us.codecraft.webmagic.utils.FilePersistentBase; 10 | 11 | import java.io.FileWriter; 12 | import java.io.IOException; 13 | 
import java.io.PrintWriter; 14 | 15 | /** 16 | * Store results to files in JSON format.
17 | * 18 | * @author code4crafter@gmail.com
19 | * @since 0.2.0 20 | */ 21 | public class JsonFilePipeline extends FilePersistentBase implements Pipeline { 22 | 23 | private Logger logger = LoggerFactory.getLogger(getClass()); 24 | 25 | /** 26 | * new JsonFilePageModelPipeline with default path "/data/webmagic/" 27 | */ 28 | public JsonFilePipeline() { 29 | setPath("/data/webmagic"); 30 | } 31 | 32 | public JsonFilePipeline(String path) { 33 | setPath(path); 34 | } 35 | 36 | @Override 37 | public void process(ResultItems resultItems, Task task) { 38 | String path = this.path + PATH_SEPERATOR + task.getUUID() + PATH_SEPERATOR; 39 | try { 40 | PrintWriter printWriter = new PrintWriter(new FileWriter(getFile(path + DigestUtils.md5Hex(resultItems.getRequest().getUrl()) + ".json"))); 41 | printWriter.write(JSON.toJSONString(resultItems.getAll())); 42 | printWriter.close(); 43 | } catch (IOException e) { 44 | logger.warn("write file error", e); 45 | } 46 | } 47 | } 48 | -------------------------------------------------------------------------------- /webmagic-selenium/src/test/java/us/codecraft/webmagic/downloader/SeleniumTest.java: -------------------------------------------------------------------------------- 1 | package us.codecraft.webmagic.downloader; 2 | 3 | import org.junit.Ignore; 4 | import org.junit.Test; 5 | import org.openqa.selenium.By; 6 | import org.openqa.selenium.WebDriver; 7 | import org.openqa.selenium.WebElement; 8 | import org.openqa.selenium.chrome.ChromeDriver; 9 | import org.openqa.selenium.remote.DesiredCapabilities; 10 | 11 | import java.util.Arrays; 12 | import java.util.HashMap; 13 | import java.util.Map; 14 | 15 | /** 16 | * @author code4crafter@gmail.com
17 | * Date: 13-7-26
18 | * Time: 12:27 PM
19 | */ 20 | public class SeleniumTest { 21 | 22 | @Ignore("need chrome driver") 23 | @Test 24 | public void testSelenium() { 25 | System.getProperties().setProperty("webdriver.chrome.driver", "/Users/yihua/Downloads/chromedriver"); 26 | Map contentSettings = new HashMap(); 27 | contentSettings.put("images", 2); 28 | 29 | Map preferences = new HashMap(); 30 | preferences.put("profile.default_content_settings", contentSettings); 31 | 32 | DesiredCapabilities caps = DesiredCapabilities.chrome(); 33 | caps.setCapability("chrome.prefs", preferences); 34 | caps.setCapability("chrome.switches", Arrays.asList("--user-data-dir=/Users/yihua/temp/chrome")); 35 | WebDriver webDriver = new ChromeDriver(caps); 36 | webDriver.get("http://huaban.com/"); 37 | WebElement webElement = webDriver.findElement(By.xpath("/html")); 38 | System.out.println(webElement.getAttribute("outerHTML")); 39 | webDriver.close(); 40 | } 41 | } 42 | -------------------------------------------------------------------------------- /webmagic-samples/src/main/java/us/codecraft/webmagic/samples/GithubRepoPageProcessor.java: -------------------------------------------------------------------------------- 1 | package us.codecraft.webmagic.samples; 2 | 3 | import us.codecraft.webmagic.Page; 4 | import us.codecraft.webmagic.Site; 5 | import us.codecraft.webmagic.Spider; 6 | import us.codecraft.webmagic.processor.PageProcessor; 7 | 8 | /** 9 | * @author code4crafter@gmail.com
10 | * @since 0.5.1 11 | */ 12 | public class GithubRepoPageProcessor implements PageProcessor { 13 | 14 | private Site site = Site.me().setRetryTimes(3).setSleepTime(0); 15 | 16 | @Override 17 | public void process(Page page) { 18 | page.addTargetRequests(page.getHtml().links().regex("(https://github\\.com/\\w+/\\w+)").all()); 19 | page.addTargetRequests(page.getHtml().links().regex("(https://github\\.com/\\w+)").all()); 20 | GithubRepo githubRepo = new GithubRepo(); 21 | githubRepo.setAuthor(page.getUrl().regex("https://github\\.com/(\\w+)/.*").toString()); 22 | githubRepo.setName(page.getHtml().xpath("//h1[contains(@class, 'entry-title') and contains(@class, 'public')]/strong/a/text()").toString()); 23 | githubRepo.setReadme(page.getHtml().xpath("//div[@id='readme']/tidyText()").toString()); 24 | if (githubRepo.getName() == null) { 25 | //skip this page 26 | page.setSkip(true); 27 | } else { 28 | page.putField("repo", githubRepo); 29 | } 30 | } 31 | 32 | @Override 33 | public Site getSite() { 34 | return site; 35 | } 36 | 37 | public static void main(String[] args) { 38 | Spider.create(new GithubRepoPageProcessor()).addUrl("https://github.com/code4craft").thread(5).run(); 39 | } 40 | } 41 | --------------------------------------------------------------------------------