├── .gitignore ├── .gitmessage ├── .mvn └── jvm.config ├── CONTRIBUTING.md ├── CONTRIBUTING_zh_CN.md ├── LICENSE ├── README.md ├── README_zh_CN.md ├── documents ├── crawler-development-tutorial_en.md ├── crawler-development-tutorial_zh_CN.md ├── crawler-example-tutorial_zh_CN.md ├── database-design.md ├── develop-process.md ├── home-page.png ├── hotcrawler-homepage-v2-mobile.jpg ├── hotcrawler-homepage-v2-pc.png ├── system-design.md ├── techniques-list_en.md ├── techniques-list_zh_CN.md ├── update_log.md ├── websites-list_en.md └── websites-list_zh_CN.md ├── pom.xml └── src ├── main ├── java │ └── com │ │ └── taogen │ │ └── hotcrawler │ │ ├── App.java │ │ ├── api │ │ ├── exception │ │ │ └── DataNotFoundException.java │ │ ├── service │ │ │ ├── BaseService.java │ │ │ └── InfoService.java │ │ └── web │ │ │ ├── controller │ │ │ ├── AbstractApiController.java │ │ │ ├── BaseV1Controller.java │ │ │ └── InfoController.java │ │ │ └── model │ │ │ ├── ResponseModel.java │ │ │ └── response │ │ │ └── GenericResponseModel.java │ │ ├── commons │ │ ├── aspect │ │ │ └── DataHandlerAspect.java │ │ ├── config │ │ │ ├── RedisConfig.java │ │ │ ├── SiteProperties.java │ │ │ └── SpringFoxConfig.java │ │ ├── constant │ │ │ └── RequestMethod.java │ │ ├── crawler │ │ │ ├── APIHotProcessor.java │ │ │ ├── AbstractHotProcessor.java │ │ │ ├── DocumentHotProcessor.java │ │ │ ├── HotProcessor.java │ │ │ ├── MultipleAPIHotProcessor.java │ │ │ ├── SimpleAPIHotProcessor.java │ │ │ ├── SimpleDocumentHotProcessor.java │ │ │ ├── handler │ │ │ │ ├── DataHandler.java │ │ │ │ ├── HandlerCenter.java │ │ │ │ └── impl │ │ │ │ │ └── DeduplicationDataHandler.java │ │ │ └── impl │ │ │ │ ├── abroad │ │ │ │ ├── BBCNewsHotProcessor.java │ │ │ │ ├── EconomistHotProcessor.java │ │ │ │ ├── HackernewsHotProcessor.java │ │ │ │ ├── LobstersHotProcessor.java │ │ │ │ ├── MediumHotProcessor.java │ │ │ │ ├── RedditHotProcessor.java │ │ │ │ ├── TechmemeHotProcessor.java │ │ │ │ ├── TheNewYorkTimesHotProcessor.java │ │ │ │ 
└── YouTubeHotProcessor.java │ │ │ │ ├── news │ │ │ │ ├── GeekParkHotProcessor.java │ │ │ │ ├── HuxiuHotProcessor.java │ │ │ │ ├── IfanrHotProcessor.java │ │ │ │ ├── NatureHotProcessor.java │ │ │ │ ├── ReadhubHotProcessor.java │ │ │ │ └── SolidotHotProcessor.java │ │ │ │ ├── slack │ │ │ │ ├── DoubanHotProcessor.java │ │ │ │ ├── DoubanTopicsHotProcessor.java │ │ │ │ ├── HupuHotProcessor.java │ │ │ │ ├── JiandanHotProcessor.java │ │ │ │ ├── SspaiHotProcessor.java │ │ │ │ ├── TianyaHotProcessor.java │ │ │ │ ├── V2exHotProcessor.java │ │ │ │ ├── WeiboHotProcessor.java │ │ │ │ └── ZhihuHotProcessor.java │ │ │ │ ├── stream │ │ │ │ ├── BilibiliHotProcessor.java │ │ │ │ └── CloudmusicHotProcessor.java │ │ │ │ └── technique │ │ │ │ ├── AliyunHotProcessor.java │ │ │ │ ├── DeveloperHotProcessor.java │ │ │ │ ├── DzoneHotProcessor.java │ │ │ │ ├── GithubHotProcessor.java │ │ │ │ ├── InfoqHotProcessor.java │ │ │ │ ├── InfoqcomHotProcessor.java │ │ │ │ ├── JAXenterHotProcessor.java │ │ │ │ ├── JavaWorldHotProcessor.java │ │ │ │ ├── JuejinHotProcessor.java │ │ │ │ └── SegmentFaultHotProcessor.java │ │ ├── entity │ │ │ ├── Info.java │ │ │ ├── InfoCate.java │ │ │ ├── InfoType.java │ │ │ └── UserVisitStat.java │ │ ├── repository │ │ │ └── InfoRepository.java │ │ ├── task │ │ │ └── CrawlerTask.java │ │ ├── util │ │ │ ├── ClassUtils.java │ │ │ └── OsUtils.java │ │ └── vo │ │ │ └── HttpRequest.java │ │ └── frontend │ │ └── Controller │ │ └── IndexController.java └── resources │ ├── application-dev.yml │ ├── application-prod.yml │ ├── application-test.yml │ ├── application.yml │ ├── logback-spring.xml │ ├── sites.properties │ ├── static │ ├── css │ │ ├── index.css │ │ └── style.css │ ├── favicon.ico │ ├── img │ │ ├── const520-logo_200x200.png │ │ └── const520.ico │ └── js │ │ ├── index.js │ │ ├── index2.js │ │ ├── index3.js │ │ └── jquery-3.4.1-min.js │ ├── templates │ ├── index.html │ ├── index2.html │ └── index3.html │ └── webdrivermanager.properties └── test └── java └── com └── 
taogen └── hotcrawler ├── AppTest.java ├── api ├── service │ └── InfoServiceTest.java └── web │ └── controller │ └── InfoControllerTest.java └── commons ├── config └── SitePropertiesTest.java ├── crawler ├── HotProcessorTest.java └── impl │ ├── abroad │ ├── BBCNewsHotProcessorTest.java │ ├── EconomistHotProcessorTest.java │ ├── HackernewsHotProcessorTest.java │ ├── LobstersHotProcessorTest.java │ ├── MediumHotProcessorTest.java │ └── TheNewYorkTimesHotProcessorTest.java │ ├── news │ ├── GeekParkHotProcessorTest.java │ ├── HuxiuHotProcessorTest.java │ ├── IfanrHotProcessorTest.java │ ├── NatureHotProcessorTest.java │ ├── ReadhubHotProcessorTest.java │ ├── SolidotHotProcessorTest.java │ └── TechmemeHotProcessorTest.java │ ├── slack │ ├── DoubanHotProcessorTest.java │ ├── DoubanTopicsHotProcessorTest.java │ ├── HupuHotProcessorTest.java │ ├── JiandanHotProcessorTest.java │ ├── SspaiHotProcessorTest.java │ ├── TianyaHotProcessorTest.java │ ├── V2exHotProcessorTest.java │ ├── WeiboHotProcessorTest.java │ └── ZhihuHotProcessorTest.java │ ├── stream │ └── CloudmusicHotProcessorTest.java │ └── technique │ ├── DeveloperHotProcessorTest.java │ ├── DzoneHotProcessorTest.java │ ├── GithubHotProcessorTest.java │ ├── InfoqHotProcessorTest.java │ ├── InfoqcomHotProcessorTest.java │ ├── JAXenterHotProcessorTest.java │ ├── JavaWorldHotProcessorTest.java │ ├── JuejinHotProcessorTest.java │ └── SegmentFaultHotProcessorTest.java └── repository └── InfoRepositoryTest.java /.gitignore: -------------------------------------------------------------------------------- 1 | # Compiled file 2 | *.class 3 | 4 | # Log file 5 | *.log 6 | 7 | # BlueJ files 8 | *.ctxt 9 | 10 | # Mobile Tools for Java (J2ME) 11 | .mtj.tmp/ 12 | 13 | # Package Files 14 | *.jar 15 | *.war 16 | *.nar 17 | *.ear 18 | *.zip 19 | *.tar.gz 20 | *.rar 21 | 22 | # ignore Maven generated target folders 23 | target 24 | 25 | # ignore eclipse files 26 | .project 27 | .classpath 28 | .settings 29 | .metadata 30 | 31 | # ignore 
idea files 32 | .idea 33 | *.iml -------------------------------------------------------------------------------- /.gitmessage: -------------------------------------------------------------------------------- 1 | # [] (If applied, this commit will...) (Max 72 char) 2 | # |<---- Preferably using up to 50 chars --->|<------------------->| 3 | # Example: 4 | # [feat] Implement automated commit messages 5 | 6 | 7 | # (Optional) Explain why this change is being made 8 | # |<---- Try To Limit Each Line to a Maximum Of 72 Characters ---->| 9 | 10 | 11 | # (Optional) Provide links or keys to any relevant tickets, articles or other resources 12 | # Example: Github issue #23 13 | 14 | 15 | # --- COMMIT END --- 16 | # Tag can be 17 | # feat (new feature) 18 | # fix (bug fix) 19 | # docs (changes to documentation) 20 | # style (formatting, missing semi colons, etc; no code change) 21 | # refactor (refactoring production code) 22 | # test (adding or refactoring tests; no production code change) 23 | # chore (updating build tasks, package manager configs, etc; no production code change) 24 | # version (version bump/new release; no production code change) 25 | 26 | # jsrXXX (Patches related to the implementation of jsrXXX, where XXX the JSR number) 27 | # jdkX (Patches related to supporting jdkX as the host VM, where X the JDK version) 28 | # dbg (Changes in debugging code/frameworks; no production code change) 29 | # license (Edits regarding licensing; no production code change) 30 | # hack (Temporary fix to make things move forward; please avoid it) 31 | # WIP (Work In Progress; for intermediate commits to keep patches reasonably sized) 32 | # defaults (changes default options) 33 | # 34 | # Note: Multiple tags can be combined, e.g. 
[fix][jsr292] Fix issue X with methodhandles 35 | # -------------------- 36 | # Remember to: 37 | # * Capitalize the subject line 38 | # * Use the imperative mood in the subject line 39 | # * Do not end the subject line with a period 40 | # * Separate subject from body with a blank line 41 | # * Use the body to explain what and why vs. how 42 | # * Can use multiple lines with "-" or "*" for bullet points in body 43 | # -------------------- -------------------------------------------------------------------------------- /.mvn/jvm.config: -------------------------------------------------------------------------------- 1 | -XX:+PrintGCDetails -Xloggc:hotcrawler_jvm.log -Xms188m -Xmx188m -XX:NewSize=70m -XX:MaxNewSize=70m -XX:MetaspaceSize=48m -XX:MaxMetaspaceSize=48m -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing to hot-crawler 2 | 3 | There are many ways to contribute to the project: 4 | 5 | - Logging bugs. 6 | - Submitting pull requests. 7 | - Reporting issues. 8 | - Creating suggestions. -------------------------------------------------------------------------------- /CONTRIBUTING_zh_CN.md: -------------------------------------------------------------------------------- 1 | # 项目贡献指南 2 | 3 | 可对本项目做以下贡献: 4 | 5 | 1. 添加有价值的网站到[网站爬取列表](documents/websites-list_zh_CN.md)中。 6 | 2. 实现新的爬虫。 7 | 3. 提交 pull request. 8 | 4. 
创建 issues。 -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 Taogen 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # hot-crawler 2 | 3 | [![GitHub issues](https://img.shields.io/github/issues/tagnja/hot-crawler)](https://github.com/tagnja/hot-crawler/issues) 4 | [![GitHub stars](https://img.shields.io/github/stars/tagnja/hot-crawler)](https://github.com/tagnja/hot-crawler/stargazers) 5 | [![GitHub forks](https://img.shields.io/github/forks/tagnja/hot-crawler)](https://github.com/tagnja/hot-crawler/network) 6 | [![Codacy Badge](https://api.codacy.com/project/badge/Grade/d25aed8571b944e6838686d96ea3873f)](https://www.codacy.com/manual/tagnja/hot-crawler?utm_source=github.com&utm_medium=referral&utm_content=tagnja/hot-crawler&utm_campaign=Badge_Grade) 7 | [![codecov](https://codecov.io/gh/tagnja/hot-crawler/branch/master/graph/badge.svg)](https://codecov.io/gh/tagnja/hot-crawler) 8 | [![Build Status](https://travis-ci.com/tagnja/hot-crawler.svg?branch=test)](https://travis-ci.com/tagnja/hot-crawler) 9 | 10 | Languages: [English](README.md) | [中文简体](README_zh_CN.md) 11 | 12 | ## What's hot-crawler 13 | 14 | **hot-crawler** is a web crawler that collects hot information from excellent websites. It can help you quickly access valuable information on many websites. It is compatible with multiple devices, such as personal computers, mobile phones, and tablets. The crawled websites are mainly in two languages, Chinese and English, as shown in the figure below. 15 | 16 | ![homepage](documents/hotcrawler-homepage-v2-pc.png) 17 | 18 | ## How to develop 19 | 20 | ### Before starting 21 | 22 | To develop Hot-Crawler, you need to install the following software: 23 | 24 | - Git 25 | - JDK 26 | - Maven 27 | - Redis 28 | 29 | We strongly suggest using [IntelliJ IDEA](https://www.jetbrains.com/idea/?fromMenu) to develop this project. 30 | 31 | ## How to run 32 | 33 | ### How to run with the Maven command line 34 | 35 | 1. 
Clone the project source to your local computer 36 | 37 | ``` 38 | $ git clone https://github.com/tagnja/hot-crawler.git 39 | ``` 40 | 41 | 2. Activate the development configuration in `src/main/resources/application.yml` 42 | 43 | ``` 44 | spring: 45 | profiles: 46 | active: dev 47 | ``` 48 | 49 | 3. Start your local Redis server before running the project. 50 | 51 | 4. Run the project with the Spring Boot Maven plugin. 52 | 53 | ``` 54 | cd hot-crawler 55 | mvn spring-boot:run 56 | ``` 57 | 58 | 5. Visit the website at http://localhost:8080 59 | 60 | ## Document 61 | 62 | For an example of crawler development, see the [crawler development tutorial](documents/crawler-development-tutorial_en.md). 63 | 64 | For the techniques used in the project, see the [use techniques list](documents/techniques-list_en.md). 65 | 66 | For the complete list of crawled websites, see the [websites crawler list](documents/websites-list_en.md). 67 | 68 | For the history of changes, see the [update log](documents/update_log.md). 69 | 70 | ## Contributing 71 | 72 | Contributions are welcome! Please check out the [Contributing to hot-crawler guide](CONTRIBUTING.md). 73 | 74 | ## License 75 | 76 | hot-crawler is released under the [MIT License](https://opensource.org/licenses/MIT). 
77 | 78 | -------------------------------------------------------------------------------- /README_zh_CN.md: -------------------------------------------------------------------------------- 1 | # hot-crawler 2 | 3 | [![GitHub issues](https://img.shields.io/github/issues/tagnja/hot-crawler)](https://github.com/tagnja/hot-crawler/issues) 4 | [![GitHub stars](https://img.shields.io/github/stars/tagnja/hot-crawler)](https://github.com/tagnja/hot-crawler/stargazers) 5 | [![GitHub forks](https://img.shields.io/github/forks/tagnja/hot-crawler)](https://github.com/tagnja/hot-crawler/network) 6 | [![Codacy Badge](https://api.codacy.com/project/badge/Grade/d25aed8571b944e6838686d96ea3873f)](https://www.codacy.com/manual/tagnja/hot-crawler?utm_source=github.com&utm_medium=referral&utm_content=tagnja/hot-crawler&utm_campaign=Badge_Grade) 7 | [![codecov](https://codecov.io/gh/tagnja/hot-crawler/branch/master/graph/badge.svg)](https://codecov.io/gh/tagnja/hot-crawler) 8 | [![Build Status](https://travis-ci.com/tagnja/hot-crawler.svg?branch=test)](https://travis-ci.com/tagnja/hot-crawler) 9 | 10 | 语言: [English](README.md) | [中文简体](README_zh_CN.md) 11 | 12 | ## What's hot-crawler 13 | 14 | > 汇集热点内容,一站式阅读体验。 15 | 16 | **hot-crawler** 是一个关于优秀网站的热点资讯爬虫。它可以帮助你快速获取多个网站有价值的信息。它兼容多个终端设备,如个人电脑,手机,和平板电脑等。爬取的网站主要由两种语言组成:中文和英文。具体样式如下图所示。 17 | 18 | ![网站首页图](documents/hotcrawler-homepage-v2-pc.png) 19 | 20 | ## How to develop 21 | 22 | ### 开始之前 23 | 24 | 开发 hot-crawler 之前,你需要安装以下软件: 25 | 26 | - Git 27 | - JDK 28 | - Maven 29 | - Redis 30 | 31 | 强烈建议使用 [IntelliJ IDEA](https://www.jetbrains.com/idea/?fromMenu) 进行开发。 32 | 33 | 34 | ## How to run 35 | ### 通过 Maven 命令行运行 36 | 37 | 1. 克隆项目到本地 38 | 39 | ``` 40 | $ git clone https://github.com/tagnja/hot-crawler.git 41 | ``` 42 | 43 | 2. 设置使用开发环境的配置,修改 `src/main/resources/application.yml` 的配置如下: 44 | 45 | ``` 46 | spring: 47 | profiles: 48 | active: dev 49 | ``` 50 | 51 | 3. 启动本地的 Redis 服务 52 | 53 | ``` 54 | $ ./redis-server 55 | ``` 56 | 57 | 4. 
使用 Maven 命令运行项目 58 | 59 | ``` 60 | $ cd hot-crawler 61 | $ mvn spring-boot:run 62 | ``` 63 | 64 | 5. 本地访问项目链接 http://localhost:8080 65 | 66 | ## Document 67 | 68 | [网站爬虫快速指南](documents/crawler-development-tutorial_zh_CN.md) 69 | 70 | [使用技术列表](documents/techniques-list_zh_CN.md) 71 | 72 | [网站爬取列表](documents/websites-list_zh_CN.md) 73 | 74 | [更新日志](documents/update_log.md) 75 | 76 | ## Contributing 77 | 78 | 欢迎大家对本开源项目进行贡献!详细说明请见[项目贡献指南](CONTRIBUTING_zh_CN.md)。 79 | 80 | ## License 81 | 82 | 本项目使用 [MIT License](https://opensource.org/licenses/MIT). -------------------------------------------------------------------------------- /documents/crawler-development-tutorial_en.md: -------------------------------------------------------------------------------- 1 | # Crawler Development Tutorial 2 | 3 | 1. Add the website's information to the configuration file `hot-crawler/src/main/resources/sites.properties`. Note that the indices x and y in cates[x].sites[y] must not duplicate those of an existing entry. 4 | 5 | ``` 6 | #example 7 | cates[0].sites[0].id = 1 8 | cates[0].sites[0].name = example 9 | cates[0].sites[0].processorName = ExampleHotProcessor 10 | cates[0].sites[0].url = https://example.com/list 11 | cates[0].sites[0].prefix = https://example.com 12 | ``` 13 | 14 | 2. Add a processor for the crawler, e.g. `hot-crawler/src/main/java/com/taogen/hotcrawler/commons/crawler/impl/ExampleHotProcessor.java` 15 | 16 | ```java 17 | @Component("ExampleHotProcessor") 18 | public class ExampleHotProcessor implements HotProcessor 19 | { 20 | @Override 21 | public List crawlHotList() 22 | { 23 | ... 24 | } 25 | } 26 | ``` 27 | 28 | Tip: You can also extend one of the abstract classes if needed. The abstract classes already implement some operations for you. 29 | 30 | 3. Test and run locally. 31 | 32 | - Start the Redis server on your computer. 33 | 34 | - Run the unit tests. 35 | 36 | ``` 37 | $ mvn test 38 | ``` 39 | 40 | - Run the project with the Spring Boot Maven plugin. 
41 | 42 | ``` 43 | $ mvn spring-boot:run 44 | ``` 45 | 46 | - Visiting http://localhost:8080 to access your created website crawler page. -------------------------------------------------------------------------------- /documents/crawler-development-tutorial_zh_CN.md: -------------------------------------------------------------------------------- 1 | # 实现一个网站爬虫快速指南 2 | 3 | 1. 添加站点信息在 hot-crawler/src/main/resources/sites.properties。注意,cates[x].sites[y] 中的序号 x, y 不能和已存在的重复。 4 | 5 | ``` 6 | #example 7 | cates[0].sites[0].id = 1 8 | cates[0].sites[0].name = example 9 | cates[0].sites[0].processorName = ExampleHotProcessor 10 | cates[0].sites[0].url = https://example.com/list 11 | cates[0].sites[0].prefix = https://example.com 12 | ``` 13 | 14 | 2. 添加热点爬取处理器,如 hot-crawler/src/main/java/com/taogen/hotcrawler/commons/crawler/impl/ExampleHotProcessor.java 15 | 16 | ```java 17 | @Component("ExampleHotProcessor") 18 | public class ExampleHotProcessor implements HotProcessor 19 | { 20 | @Override 21 | public List crawlHotList() 22 | { 23 | ... 24 | } 25 | } 26 | ``` 27 | 28 | Tips:你也可以继承某个实现 HotProcessor 的抽象类,这些抽象类帮你实现了一些操作。 29 | 30 | 3. 本地测试和运行 31 | 32 | - 运行 Redis 缓存。 33 | 34 | - 进入项目根目录,执行单元测试 35 | 36 | ``` 37 | $ mvn test 38 | ``` 39 | 40 | - 进入项目根目录,使用 maven 插件运行项目 41 | 42 | ``` 43 | $ mvn spring-boot:run 44 | ``` 45 | 46 | - 访问 http://localhost:8080 ,即可看到你添加的爬虫 example 的页面。 -------------------------------------------------------------------------------- /documents/crawler-example-tutorial_zh_CN.md: -------------------------------------------------------------------------------- 1 | # 添加一个网站爬虫快速指南 2 | 3 | 1. 添加站点信息在 hot-crawler/src/main/resources/sites.properties。注意,序号 sites[0] 不能和已存在的重复。 4 | 5 | ``` 6 | #example 7 | cates[0].sites[0].id = 1 8 | cates[0].sites[0].name = example 9 | cates[0].sites[0].processorName: ExampleHotProcessor 10 | ``` 11 | 12 | 2. 
添加热点爬取处理器,如 hot-crawler/src/main/java/com/taogen/hotcrawler/commons/crawler/impl/ExampleHotProcessor.java 13 | 14 | ```java 15 | @Component("ExampleHotProcessor") 16 | public class ExampleHotProcessor implements HotProcessor 17 | { 18 | @Override 19 | public List crawlHotList() 20 | { 21 | ... 22 | } 23 | } 24 | ``` 25 | 26 | 3. 本地测试和运行 27 | 28 | - 运行 Redis 缓存。 29 | 30 | - 进入项目根目录,执行单元测试 31 | 32 | ``` 33 | $ mvn test 34 | ``` 35 | 36 | - 进入项目根目录,使用 maven 插件运行项目 37 | 38 | ``` 39 | $ mvn spring-boot:run 40 | ``` 41 | 42 | - 访问 http://localhost:8080 ,即可看到你添加的爬虫 example 的页面。 -------------------------------------------------------------------------------- /documents/database-design.md: -------------------------------------------------------------------------------- 1 | # Database Design 2 | 3 | - t_dict 4 | - t_info_cate 5 | - t_info_type 6 | - t_info 7 | - t_info_display 8 | - t_user 9 | - t_user_info 10 | - t_user_passwd 11 | - t_user_bind 12 | 13 | 14 | 15 | ### t_dict 16 | 17 | | Name | Type | Length | NULL | Key | Description | 18 | | ----------- | ---- | ------ | ---- | ---- | ----------- | 19 | | id | | | | | | 20 | | value | | | | | | 21 | | parent_id | | | | | | 22 | | sort_num | | | | | | 23 | | is_delete | | | | | | 24 | | create_time | | | | | | 25 | | modify_time | | | | | | 26 | 27 | ### t_info_cate 28 | 29 | | Name | Type | Length | NULL | Key | Description | 30 | | ---- | ---- | ------ | ---- | ---- | ----------- | 31 | | id | | | | | | 32 | | name | | | | | | 33 | 34 | ### t_info_type 35 | 36 | | Name | Type | Length | NULL | Key | Description | 37 | | ------- | ------- | ------ | -------- | ---- | ----------- | 38 | | id | varchar | 64 | not null | P | | 39 | | name | enum | 64 | not null | | | 40 | | cate_id | | | | | | 41 | 42 | ### t_info 43 | 44 | | Name | Type | Length | NULL | Key | Description | 45 | | ----------- | --------- | ------ | -------- | ---- | ----------- | 46 | | id | varchar | 64 | not null | P | | 47 | | title | varchar | 128 | not 
null | | | 48 | | url | varchar | 255 | not null | | | 49 | | url_hash | | | | | | 50 | | info_type | | | | F | | 51 | | create_time | timestamp | | | | | 52 | 53 | index 54 | 55 | - index(info_type) 56 | 57 | ### t_info_display 58 | 59 | | Name | Type | Length | NULL | Key | Description | 60 | | ------------ | ---- | ------ | ---- | ---- | ----------- | 61 | | id | | | | | | 62 | | info_id | | | | | | 63 | | info_type | | | | | | 64 | | display_time | | | | | | 65 | | sort_num | | | | | | 66 | 67 | index 68 | 69 | - index(display_time, info_type) 70 | - //bitmap index(info_type) 71 | 72 | --- 73 | 74 | t_user 75 | 76 | | Name | Type | Length | NULL | Key | Description | 77 | | -------------- | ---- | ------ | ---- | ---- | ----------- | 78 | | id | | | | P | | 79 | | username | | | | | | 80 | | avatar_uri | | | | | | 81 | | big_avatar_uri | | | | | | 82 | | language | | | | | | 83 | | is_delete | | | | | | 84 | | create_time | | | | | | 85 | | modify_time | | | | | | 86 | 87 | index 88 | 89 | - index(username) 90 | 91 | t_user_info 92 | 93 | | Name | Type | Length | NULL | Key | Description | 94 | | ----------- | ---- | ------ | ---- | ---- | ----------- | 95 | | user_id | | | | P | | 96 | | gender | | | | | | 97 | | birth_date | | | | | | 98 | | area_id | | | | | | 99 | | address | | | | | | 100 | | is_delete | | | | | | 101 | | create_time | | | | | | 102 | | modify_time | | | | | | 103 | 104 | t_user_passwd 105 | 106 | | Name | Type | Length | NULL | Key | Description | 107 | | ----------- | ---- | ------ | ---- | ---- | ----------- | 108 | | user_id | | | | P | | 109 | | passwd | | | | | | 110 | | is_delete | | | | | | 111 | | create_time | | | | | | 112 | | modify_time | | | | | | 113 | 114 | t_user_bind 115 | 116 | | Name | Type | Length | NULL | Key | Description | 117 | | ------------ | -------------- | ------ | ---- | ---- | ----------- | 118 | | user_id | | | | P | | 119 | | platform | (email, phone) | | | | | 120 | | open_id | | | | | | 121 | | 
access_token | | | | | | 122 | | is_delete | | | | | | 123 | | create_time | | | | | | 124 | | modify_time | | | | | | 125 | 126 | 127 | 128 | -------------------------------------------------------------------------------- /documents/develop-process.md: -------------------------------------------------------------------------------- 1 | ### Development Process 2 | 3 | Development 4 | 5 | - Switch to develop branch 6 | - Write unit test code. 7 | - Write function code. 8 | 9 | Local test and commit 10 | 11 | - Check code specification by sonar. 12 | - Code format. 13 | - Pass unit test. 14 | - Check function and style on client (PC, mobile). 15 | - Commit code to develop branch. 16 | 17 | Pre-release Test 18 | 19 | - Deploy develop branch. 20 | - Pass unit test. 21 | - Check function and style on client (PC, mobile). 22 | 23 | Release 24 | 25 | - git pull master from develop branch. 26 | - Deploy master branch. 27 | - Check function and style on client (PC, mobile). -------------------------------------------------------------------------------- /documents/home-page.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tagnja/hot-crawler/89d9311844470eaba1851fc4963cff4abfaf713e/documents/home-page.png -------------------------------------------------------------------------------- /documents/hotcrawler-homepage-v2-mobile.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tagnja/hot-crawler/89d9311844470eaba1851fc4963cff4abfaf713e/documents/hotcrawler-homepage-v2-mobile.jpg -------------------------------------------------------------------------------- /documents/hotcrawler-homepage-v2-pc.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tagnja/hot-crawler/89d9311844470eaba1851fc4963cff4abfaf713e/documents/hotcrawler-homepage-v2-pc.png 
-------------------------------------------------------------------------------- /documents/system-design.md: -------------------------------------------------------------------------------- 1 | # Hot Crawler System Design 2 | 3 | Content 4 | 5 | - Requirement Analysis 6 | - System Design 7 | - Code Implementation 8 | - Test & Deployment 9 | - Optimization Function 10 | 11 | ### Main 12 | 13 | ### 1. Requirement Analysis 14 | 15 | #### 1.1 Requirements 16 | 17 | - Crawler hot content from media. 18 | - Frontend and Backend separation. 19 | - Timer Crawler. 20 | 21 | Stage 22 | 23 | - Stage 1 24 | - Write Crawler one site API interface. 25 | - Write Frontend page. 26 | - Easy Deploy. 27 | - Stage 2 28 | - Write crawler more sites API interface. 29 | - Update Frontend page. 30 | - Auto Deploy. 31 | - Stage 3 32 | - Crawler WeChat API 33 | - Storing information URL to DB. 34 | 35 | Media 36 | 37 | - Zhihu 38 | - V2EX 39 | - Weibo 40 | - Douban 41 | - Tianya 42 | - CloudMusic 43 | 44 | #### 1.2 Using Techniques 45 | 46 | - Backend 47 | - Spring Boot 2 48 | - Swagger 2 49 | - JUnit 5 50 | - Apache Nutch 51 | - Frontend 52 | - Vue.js 53 | - Test & Deployment 54 | - Docker 55 | - Jenkins 56 | - Database & Cache 57 | 58 | ### 2. System Design 59 | 60 | #### 2.1 Function Module 61 | 62 | System Function 63 | 64 | Frontend --> Backend Controller --> Cache <--> Crawler 65 | 66 | #### 2.2 Database Design 67 | 68 | t_info_type 69 | 70 | | Name | Type | Length | NULL | Key | Description | 71 | | ---- | ------- | ------ | -------- | ---- | ----------- | 72 | | id | varchar | 64 | not null | P | | 73 | | name | varchar | 64 | not null | | | 74 | 75 | t_info 76 | 77 | | Name | Type | Length | NULL | Key | Description | 78 | | ----- | ------- | ------ | -------- | ---- | ----------- | 79 | | id | varchar | 64 | not null | P | | 80 | | title | varchar | 128 | not null | | | 81 | | url | varchar | 255 | not null | | | 82 | 83 | 84 | 85 | #### 2.3 Interfaces Design 86 | 87 | 1\. 
v1/types 88 | 89 | Method: GET 90 | 91 | Data Type: JSON 92 | 93 | Parameters: null 94 | 95 | Result: 96 | 97 | ```json 98 | { 99 | ret_code: 0, 100 | ret_msg: "success", 101 | data: [ 102 | { 103 | id: 1, 104 | name: "v2ex" 105 | }, 106 | ... 107 | ] 108 | } 109 | 110 | ``` 111 | 112 | 2\. v1/types/{id}/infos 113 | 114 | Method: GET 115 | 116 | Data Type: JSON 117 | 118 | Parameters: null 119 | 120 | Result: 121 | 122 | ```json 123 | { 124 | ret_code: 0, 125 | ret_msg: "success", 126 | data: [ 127 | { 128 | id: 1, 129 | title: "", 130 | url: "" 131 | }, 132 | ... 133 | ] 134 | } 135 | ``` 136 | 137 | 138 | 139 | ### 3. Code Implementation 140 | 141 | #### 3.1 Implementation Backend 142 | 143 | ##### 3.1.1 Build Project 144 | 145 | Build Maven Project, run hello-world of spring-boot 146 | 147 | Project Directory Structure Definition 148 | 149 | ```$xslt 150 | /controller 151 | /service 152 | /dao 153 | /crawler 154 | /entity 155 | /util 156 | /conf 157 | ``` 158 | 159 | ##### 3.1.2 Integrate Swagger 160 | 161 | Visit http://localhost:8080/swagger-ui.html 162 | 163 | - Integrate with SpringFox 164 | - Fix the problem of the config bean not being scanned. 165 | - Using Swagger-UI. 166 | - Using Swagger Core annotations in controller. 167 | 168 | ##### 3.1.3 Test-Driven Development 169 | 170 | - Integrate JUnit5 171 | 172 | ##### 3.1.4 Developing Crawler 173 | 174 | Jsoup, Jsonpath 175 | 176 | ##### 3.1.5 Timer Scheduler 177 | 178 | ##### 3.1.6 Using Redis Cache. 179 | 180 | ##### 3.1.7 Implementation API. 181 | 182 | ##### 3.1.8 Add Log File 183 | 184 | 185 | 186 | #### 3.2 Implementation Frontend 187 | 188 | ### 4. Test & Deployment 189 | 190 | #### CI/CD 191 | 192 | ### 5. 
Optimization Function 193 | 194 | Notify 195 | 196 | 197 | 198 | 199 | 200 | 201 | 202 | 203 | 204 | 205 | 206 | 207 | 208 | ### References 209 | 210 | [Documenting Spring Boot REST API with Swagger and SpringFox](https://www.vojtechruzicka.com/documenting-spring-boot-rest-api-swagger-springfox/) 211 | 212 | -------------------------------------------------------------------------------- /documents/techniques-list_en.md: -------------------------------------------------------------------------------- 1 | # Use Techniques List 2 | ### Backend 3 | 4 | - Spring Boot 5 | - Spring Boot Web 6 | - Spring Boot Redis 7 | - Spring Boot Devtools 8 | - Spring Scheduling Tasks 9 | - SLF4j 10 | - RESTful API 11 | - Swagger UI/SpringFox 12 | - Lombok 13 | 14 | ### Crawl and Parse 15 | 16 | - Jsoup 17 | - Jsonpath 18 | 19 | ### Frontend 20 | 21 | - Thymeleaf 22 | - jQuery 23 | 24 | ### Cache & Storage 25 | 26 | - Redis 27 | 28 | ### Test 29 | 30 | - JUnit4 31 | 32 | ### Deployment 33 | 34 | - Nginx proxy 35 | - Spring Boot embedded server - Jetty -------------------------------------------------------------------------------- /documents/techniques-list_zh_CN.md: -------------------------------------------------------------------------------- 1 | # 使用技术 2 | 后端 3 | 4 | - Spring Boot 5 | - Spring Boot Web 6 | - Spring Boot Redis 7 | - Spring Boot Devtools 8 | - Spring Scheduling Tasks 9 | - SLF4j 10 | - RESTful API 11 | - Swagger UI/SpringFox 12 | - Lombok 13 | 14 | 页面爬取解析 15 | - Jsoup 16 | - Jsonpath 17 | 18 | 前端 19 | 20 | - Thymeleaf 21 | - jQuery 22 | 23 | 缓存 24 | - Redis 25 | 26 | 测试 27 | - JUnit4 28 | 29 | 部署 30 | 31 | - Nginx proxy 32 | - Spring Boot embedded server - Jetty -------------------------------------------------------------------------------- /documents/update_log.md: -------------------------------------------------------------------------------- 1 | # Update Log 2 | 3 | 2019.07.27 4 | 5 | - Frontend HTML page fit mobile device. 
6 | - Optimize frontend home page style. 7 | 8 | 2019.07.25 9 | 10 | - Enable HTTPS by Certbot. 11 | - Configuring Nginx HTTPS proxy. 12 | 13 | 2019.07.24 14 | 15 | - Implements crawler of GitHub, Weibo, Douban, Tianya, Cloud music, Hacker News websites. 16 | - Write README. 17 | 18 | 2019.07.23 19 | 20 | - Install server software environment. 21 | - DNS configuration and Nginx HTTP reverse proxy. 22 | - Deploy the project to the server by upload executable `jar`. 23 | 24 | 2019.07.22 25 | 26 | - Finished APIs code. 27 | - Optimized crawler functions. (ThreadPool, High scalability) 28 | - Finished frontend code. 29 | - Bought website domain. 30 | 31 | 2019.07.21 32 | 33 | - Finished the hot crawler of V2EX and Zhihu websites. 34 | - Write unit tests. 35 | - Finished scheduling crawler. 36 | - Add properties files. 37 | - Finished integrating Redis. 38 | - Update the project directory structure. 39 | 40 | 2019.07.20 41 | 42 | - Writing system design document. 43 | - Build Spring Boot Project. 44 | - Integrate Swagger-UI. 45 | - Integrate JUnit4. 
46 | -------------------------------------------------------------------------------- /documents/websites-list_en.md: -------------------------------------------------------------------------------- 1 | # Websites Crawler List 2 | 3 | ### Social Media 🐟 4 | 5 | - [x] [V2EX](https://v2ex.com) 6 | - [x] [Zhihu](https://zhihu.com) 7 | - [x] [Weibo](https://weibo.com) 8 | - [x] [Douban](https://douban.com) 9 | - [x] [Tianya](https://bbs.tianya.cn) 10 | - [x] [Jiandan](http://jandan.net) 11 | - [x] [NetEase Music](https://music.163.com) 12 | - [ ] [Guokr](https://www.guokr.com/science/category/all) 13 | 14 | ### Technology News 🌎 15 | 16 | - [x] [Geek Park](https://www.geekpark.net) 17 | - [x] [Huxiu](https://www.huxiu.com) 18 | - [x] [Techmeme](https://www.techmeme.com) 19 | - [x] [Nature](https://www.nature.com) 20 | - [x] [Solidot](https://www.solidot.org) 21 | - [x] [Readhub](https://readhub.cn) 22 | - [ ] [cnbeta](https://www.cnbeta.com/) 23 | 24 | ### Developer 🦁 25 | 26 | - [x] [GitHub](https://github.com) 27 | - [x] [Developer Top News](https://toutiao.io) 28 | - [x] [SegmentFault](https://segmentfault.com) 29 | - [x] [InfoQ.cn](https://infoq.cn) 30 | - [x] [Juejin](https://juejin.im) 31 | - [x] [InfoQ.com](https://infoq.com) 32 | - [x] [DZone](https://dzone.com) 33 | 34 | ### Scientific Surf ✈️ 35 | 36 | - [x] [BBC News](https://www.bbc.com) 37 | - [x] [The Economist](https://www.economist.com) 38 | - [x] [Hacker News](https://news.ycombinator.com) 39 | - [ ] [Medium](https://medium.com) 40 | - [ ] [Reddit](https://reddit.com) 41 | - [ ] [Bloomberg](https://www.bloomberg.com) -------------------------------------------------------------------------------- /documents/websites-list_zh_CN.md: -------------------------------------------------------------------------------- 1 | # 网站爬取列表 2 | 3 | ### 社交媒体 🐟 4 | 5 | - [x] [V2EX](https://v2ex.com) 6 | - [x] [知乎](https://zhihu.com) 7 | - [x] [微博](https://weibo.com) 8 | - [x] [豆瓣](https://douban.com) 9 | - [x] 
[天涯](https://bbs.tianya.cn) 10 | - [x] [煎蛋](http://jandan.net) 11 | - [x] [云音乐](https://music.163.com) 12 | - [ ] [果壳](https://www.guokr.com/science/category/all) 13 | 14 | ### 科技新闻 🌎 15 | 16 | - [x] [极客公园](https://www.geekpark.net) 17 | - [x] [虎嗅](https://www.huxiu.com) 18 | - [x] [Techmeme](https://www.techmeme.com) 19 | - [x] [Nature](https://www.nature.com) 20 | - [x] [Solidot](https://www.solidot.org) 21 | - [x] [Readhub](https://readhub.cn) 22 | - [ ] [cnbeta](https://www.cnbeta.com/) 23 | 24 | ### 开发者 🦁 25 | 26 | - [x] [GitHub](https://github.com) 27 | - [x] [开发者头条](https://toutiao.io) 28 | - [x] [SegmentFault](https://segmentfault.com) 29 | - [x] [InfoQ.cn](https://infoq.cn) 30 | - [x] [掘金](https://juejin.im) 31 | - [x] [InfoQ.com](https://infoq.com) 32 | - [x] [DZone](https://dzone.com) 33 | 34 | ### 科学上网 ✈️ 35 | 36 | - [x] [BBC News](https://www.bbc.com) 37 | - [x] [The Economist](https://www.economist.com) 38 | - [x] [Hacker News](https://news.ycombinator.com) 39 | - [ ] [Medium](https://medium.com) 40 | - [ ] [Reddit](https://reddit.com) 41 | - [ ] [Bloomberg](https://www.bloomberg.com) -------------------------------------------------------------------------------- /src/main/java/com/taogen/hotcrawler/App.java: -------------------------------------------------------------------------------- 1 | package com.taogen.hotcrawler; 2 | 3 | import com.ulisesbocchio.jasyptspringboot.annotation.EnableEncryptableProperties; 4 | import io.swagger.annotations.Api; 5 | import org.springframework.boot.SpringApplication; 6 | import org.springframework.boot.autoconfigure.SpringBootApplication; 7 | 8 | /** 9 | * Hello world! 
10 | */ 11 | @SpringBootApplication 12 | @Api("This is a hello API") 13 | @EnableEncryptableProperties 14 | public class App 15 | { 16 | public static void main(String[] args) 17 | { 18 | SpringApplication.run(App.class, args); 19 | } 20 | } 21 | -------------------------------------------------------------------------------- /src/main/java/com/taogen/hotcrawler/api/exception/DataNotFoundException.java: -------------------------------------------------------------------------------- 1 | package com.taogen.hotcrawler.api.exception; 2 | 3 | import org.slf4j.Logger; 4 | import org.slf4j.LoggerFactory; 5 | 6 | public class DataNotFoundException extends RuntimeException 7 | { 8 | private static final long serialVersionUID = 674487573480637090L; 9 | private static final Logger log = LoggerFactory.getLogger(DataNotFoundException.class); 10 | 11 | public DataNotFoundException() {} 12 | 13 | public DataNotFoundException(String errorMessage) 14 | { 15 | log.error(errorMessage); 16 | } 17 | 18 | public DataNotFoundException(String message, Throwable cause) 19 | { 20 | log.error(message, cause); 21 | } 22 | } 23 | -------------------------------------------------------------------------------- /src/main/java/com/taogen/hotcrawler/api/service/BaseService.java: -------------------------------------------------------------------------------- 1 | package com.taogen.hotcrawler.api.service; 2 | 3 | import org.springframework.beans.factory.annotation.Autowired; 4 | import org.springframework.context.ApplicationContext; 5 | import org.springframework.stereotype.Service; 6 | 7 | @Service 8 | public class BaseService 9 | { 10 | @Autowired 11 | private ApplicationContext context; 12 | 13 | public Object getBean(String beanName) 14 | { 15 | return context.getBean(beanName); 16 | } 17 | public Object getBean(Class clazz) 18 | { 19 | return context.getBean(clazz); 20 | } 21 | 22 | } 23 | -------------------------------------------------------------------------------- 
/src/main/java/com/taogen/hotcrawler/api/service/InfoService.java: -------------------------------------------------------------------------------- 1 | package com.taogen.hotcrawler.api.service; 2 | 3 | import com.taogen.hotcrawler.commons.entity.Info; 4 | import com.taogen.hotcrawler.commons.repository.InfoRepository; 5 | import org.slf4j.Logger; 6 | import org.slf4j.LoggerFactory; 7 | import org.springframework.beans.factory.annotation.Autowired; 8 | import org.springframework.stereotype.Service; 9 | 10 | import javax.servlet.http.HttpServletRequest; 11 | import java.text.DateFormat; 12 | import java.text.SimpleDateFormat; 13 | import java.util.Date; 14 | import java.util.List; 15 | 16 | @Service 17 | public class InfoService extends BaseService 18 | { 19 | public static final Logger log = LoggerFactory.getLogger(InfoService.class); 20 | 21 | @Autowired 22 | private InfoRepository infoRepository; 23 | 24 | private final DateFormat dateFormatter = new SimpleDateFormat("yyyyMMdd"); 25 | 26 | public List findListByTypeId(String code) 27 | { 28 | return infoRepository.findByTypeId(code); 29 | } 30 | 31 | 32 | public void statVisitUser(HttpServletRequest request) 33 | { 34 | String ip = getRealIpAddress(request); 35 | String today = dateFormatter.format(new Date()); 36 | infoRepository.statVisitUser(ip, today); 37 | } 38 | 39 | public long countVisitUser() 40 | { 41 | String today = dateFormatter.format(new Date()); 42 | return infoRepository.countVisitUser(today); 43 | } 44 | 45 | public static String getIpAddr(HttpServletRequest request) 46 | { 47 | String ip = request.getHeader("x-forwarded-for"); 48 | if(ip == null || ip.length() == 0 || "unknown".equalsIgnoreCase(ip)) 49 | { 50 | ip = request.getHeader("Proxy-Client-IP"); 51 | } 52 | if(ip == null || ip.length() == 0 || "unknown".equalsIgnoreCase(ip)) 53 | { 54 | ip = request.getHeader("WL-Proxy-Client-IP"); 55 | } 56 | if(ip == null || ip.length() == 0 || "unknown".equalsIgnoreCase(ip)) 57 | { 58 | ip = 
request.getRemoteAddr(); 59 | } 60 | return ip; 61 | } 62 | 63 | public static String getRealIpAddress(HttpServletRequest request) 64 | { 65 | String ip = getIpAddr(request); 66 | int ipIndex = ip.indexOf(IP_HEADER); 67 | if (ipIndex >= 0) 68 | { 69 | return IP_ADDRESS; 70 | } 71 | return ip; 72 | } 73 | 74 | private static final String IP_ADDRESS = "58.212.237.176"; 75 | private static final String IP_HEADER = "192.168."; 76 | 77 | } 78 | -------------------------------------------------------------------------------- /src/main/java/com/taogen/hotcrawler/api/web/controller/AbstractApiController.java: -------------------------------------------------------------------------------- 1 | package com.taogen.hotcrawler.api.web.controller; 2 | 3 | import com.taogen.hotcrawler.api.exception.DataNotFoundException; 4 | import com.taogen.hotcrawler.api.web.model.ResponseModel; 5 | import org.slf4j.Logger; 6 | import org.slf4j.LoggerFactory; 7 | import org.springframework.core.annotation.Order; 8 | import org.springframework.http.HttpStatus; 9 | import org.springframework.web.bind.annotation.ExceptionHandler; 10 | import org.springframework.web.bind.annotation.ResponseBody; 11 | import org.springframework.web.bind.annotation.ResponseStatus; 12 | 13 | import javax.servlet.http.HttpServletRequest; 14 | 15 | public abstract class AbstractApiController 16 | { 17 | protected static final Logger log = LoggerFactory.getLogger(AbstractApiController.class); 18 | 19 | @ResponseBody 20 | @ResponseStatus(HttpStatus.NOT_FOUND) 21 | @ExceptionHandler({DataNotFoundException.class}) 22 | @Order(Integer.MIN_VALUE) 23 | public ResponseModel dataNotFoundExceptionHandler(HttpServletRequest request) 24 | { 25 | return getResponseModel(/*request, */404, "无法找到指定的数据。"); 26 | } 27 | 28 | private ResponseModel getResponseModel(/*HttpServletRequest request,*/ int code, String message) 29 | { 30 | ResponseModel result = new ResponseModel(); 31 | result.setErrCode(Integer.valueOf(code)); 32 | 
result.setErrMsg(message); 33 | return result; 34 | } 35 | } 36 | -------------------------------------------------------------------------------- /src/main/java/com/taogen/hotcrawler/api/web/controller/BaseV1Controller.java: -------------------------------------------------------------------------------- 1 | package com.taogen.hotcrawler.api.web.controller; 2 | 3 | import org.springframework.stereotype.Controller; 4 | import org.springframework.web.bind.annotation.RequestMapping; 5 | 6 | @Controller 7 | @RequestMapping({"/api/v1"}) 8 | public class BaseV1Controller extends AbstractApiController 9 | { 10 | } 11 | -------------------------------------------------------------------------------- /src/main/java/com/taogen/hotcrawler/api/web/controller/InfoController.java: -------------------------------------------------------------------------------- 1 | package com.taogen.hotcrawler.api.web.controller; 2 | 3 | import com.taogen.hotcrawler.commons.config.SiteProperties; 4 | import com.taogen.hotcrawler.api.exception.DataNotFoundException; 5 | import com.taogen.hotcrawler.api.service.InfoService; 6 | import com.taogen.hotcrawler.api.web.model.response.GenericResponseModel; 7 | import com.taogen.hotcrawler.commons.entity.Info; 8 | import com.taogen.hotcrawler.commons.entity.InfoCate; 9 | import io.swagger.annotations.Api; 10 | import io.swagger.annotations.ApiOperation; 11 | import org.slf4j.Logger; 12 | import org.slf4j.LoggerFactory; 13 | import org.springframework.beans.factory.annotation.Autowired; 14 | import org.springframework.web.bind.annotation.*; 15 | 16 | import java.util.List; 17 | 18 | @Api("Information API") 19 | @RestController("InfoController") 20 | @RequestMapping({"/api/v1"}) 21 | //@CrossOrigin 22 | public class InfoController extends BaseV1Controller 23 | { 24 | private static final Logger log = LoggerFactory.getLogger(InfoController.class); 25 | public static final String PRODUCES_JSON = "application/json;charset=UTF-8"; 26 | 27 | @Autowired 28 | 
private SiteProperties siteProperties; 29 | 30 | @Autowired 31 | private InfoService infoService; 32 | 33 | 34 | @GetMapping(value = "/types", produces = PRODUCES_JSON) 35 | @ApiOperation("Get All Type of Information.") 36 | public GenericResponseModel getTypes() 37 | { 38 | GenericResponseModel result = new GenericResponseModel(); 39 | result.setData(siteProperties.convertToInfoCateList()); 40 | return result; 41 | } 42 | 43 | @GetMapping(value = "/cates/{cid}/types/{tid}/infos", produces = PRODUCES_JSON) 44 | @ApiOperation("Get All Information of specified type.") 45 | public GenericResponseModel getTypeInfos(@PathVariable(value = "cid") String cateId, @PathVariable(value = "tid") String code) 46 | { 47 | log.debug("cateId is {}, code is {}", cateId, code); 48 | GenericResponseModel result = new GenericResponseModel(); 49 | List infoList = infoService.findListByTypeId(code); 50 | if (infoList != null) 51 | { 52 | result.setData(infoList); 53 | } 54 | else 55 | { 56 | throw new DataNotFoundException("无法找到指定数据"); 57 | } 58 | return result; 59 | } 60 | } 61 | -------------------------------------------------------------------------------- /src/main/java/com/taogen/hotcrawler/api/web/model/ResponseModel.java: -------------------------------------------------------------------------------- 1 | package com.taogen.hotcrawler.api.web.model; 2 | 3 | import lombok.Data; 4 | 5 | @Data 6 | public class ResponseModel 7 | { 8 | private static final long serialVersionUID = 5413727785722549217L; 9 | protected Integer errCode; 10 | protected String errMsg; 11 | // protected String requestId; 12 | } 13 | -------------------------------------------------------------------------------- /src/main/java/com/taogen/hotcrawler/api/web/model/response/GenericResponseModel.java: -------------------------------------------------------------------------------- 1 | package com.taogen.hotcrawler.api.web.model.response; 2 | 3 | import com.taogen.hotcrawler.api.web.model.ResponseModel; 4 | import 
lombok.Data; 5 | 6 | @Data 7 | public class GenericResponseModel extends ResponseModel 8 | { 9 | private static final long serialVersionUID = 7100791756352030649L; 10 | private T data; 11 | 12 | // public GenericResponseModel(){} 13 | 14 | // public GenericResponseModel(String requestId) 15 | // { 16 | // this.requestId = requestId; 17 | // } 18 | } 19 | -------------------------------------------------------------------------------- /src/main/java/com/taogen/hotcrawler/commons/aspect/DataHandlerAspect.java: -------------------------------------------------------------------------------- 1 | package com.taogen.hotcrawler.commons.aspect; 2 | 3 | 4 | import org.aspectj.lang.JoinPoint; 5 | import org.aspectj.lang.annotation.After; 6 | import org.aspectj.lang.annotation.Aspect; 7 | import org.aspectj.lang.annotation.Pointcut; 8 | import org.slf4j.Logger; 9 | import org.slf4j.LoggerFactory; 10 | import org.springframework.context.annotation.Configuration; 11 | 12 | @Aspect 13 | @Configuration 14 | public class DataHandlerAspect 15 | { 16 | private Logger log = LoggerFactory.getLogger(DataHandlerAspect.class); 17 | 18 | @Pointcut("execution(* com.taogen.hotcrawler.commons.crawler.handler.*.handleRequest(..))") 19 | public void dataHandlerPointCut() 20 | { 21 | // define PointCut not need method body 22 | } 23 | 24 | @After("dataHandlerPointCut()") 25 | public void after(JoinPoint joinPoint) 26 | { 27 | String className = joinPoint.getSignature().getDeclaringType().getSimpleName(); 28 | log.debug("Data handle by {}", className); 29 | } 30 | 31 | 32 | } 33 | -------------------------------------------------------------------------------- /src/main/java/com/taogen/hotcrawler/commons/config/RedisConfig.java: -------------------------------------------------------------------------------- 1 | package com.taogen.hotcrawler.commons.config; 2 | 3 | import org.springframework.boot.autoconfigure.data.redis.RedisProperties; 4 | import 
org.springframework.cache.annotation.CachingConfigurerSupport; 5 | import org.springframework.cache.annotation.EnableCaching; 6 | import org.springframework.context.annotation.Bean; 7 | import org.springframework.context.annotation.Configuration; 8 | import org.springframework.context.annotation.Primary; 9 | import org.springframework.data.redis.connection.RedisStandaloneConfiguration; 10 | import org.springframework.data.redis.connection.jedis.JedisConnectionFactory; 11 | import org.springframework.data.redis.core.RedisTemplate; 12 | import org.springframework.data.redis.repository.configuration.EnableRedisRepositories; 13 | 14 | @Configuration 15 | @EnableCaching 16 | @EnableRedisRepositories 17 | public class RedisConfig extends CachingConfigurerSupport 18 | { 19 | @Bean 20 | @Primary 21 | public RedisProperties redisProperties() { 22 | return new RedisProperties(); 23 | } 24 | @Bean 25 | JedisConnectionFactory jedisConnectionFactory() 26 | { 27 | RedisProperties properties = redisProperties(); 28 | RedisStandaloneConfiguration configuration = new RedisStandaloneConfiguration(); 29 | configuration.setHostName(properties.getHost()); 30 | configuration.setPort(properties.getPort()); 31 | configuration.setPassword(properties.getPassword()); 32 | configuration.setDatabase(properties.getDatabase()); 33 | return new JedisConnectionFactory(configuration); 34 | } 35 | 36 | @Bean(value = "redisTemplate") 37 | public RedisTemplate redisTemplate() { 38 | RedisTemplate template = new RedisTemplate<>(); 39 | template.setConnectionFactory(jedisConnectionFactory()); 40 | return template; 41 | } 42 | } 43 | -------------------------------------------------------------------------------- /src/main/java/com/taogen/hotcrawler/commons/config/SpringFoxConfig.java: -------------------------------------------------------------------------------- 1 | package com.taogen.hotcrawler.commons.config; 2 | 3 | import org.springframework.context.annotation.Bean; 4 | import 
org.springframework.context.annotation.Configuration; 5 | import springfox.documentation.builders.PathSelectors; 6 | import springfox.documentation.builders.RequestHandlerSelectors; 7 | import springfox.documentation.spi.DocumentationType; 8 | import springfox.documentation.spring.web.plugins.Docket; 9 | import springfox.documentation.swagger2.annotations.EnableSwagger2; 10 | 11 | @Configuration 12 | @EnableSwagger2 13 | public class SpringFoxConfig 14 | { 15 | @Bean 16 | public Docket api() 17 | { 18 | return new Docket(DocumentationType.SWAGGER_2) 19 | .host("localhost:8080") 20 | .select() 21 | .apis(RequestHandlerSelectors.any()) 22 | .paths(PathSelectors.any()) 23 | .build(); 24 | } 25 | } 26 | -------------------------------------------------------------------------------- /src/main/java/com/taogen/hotcrawler/commons/constant/RequestMethod.java: -------------------------------------------------------------------------------- 1 | package com.taogen.hotcrawler.commons.constant; 2 | 3 | public enum RequestMethod { 4 | GET, POST 5 | } 6 | -------------------------------------------------------------------------------- /src/main/java/com/taogen/hotcrawler/commons/crawler/APIHotProcessor.java: -------------------------------------------------------------------------------- 1 | package com.taogen.hotcrawler.commons.crawler; 2 | 3 | import com.jayway.jsonpath.PathNotFoundException; 4 | import com.taogen.hotcrawler.commons.entity.Info; 5 | 6 | import java.util.ArrayList; 7 | import java.util.List; 8 | import java.util.Map; 9 | 10 | public abstract class APIHotProcessor extends AbstractHotProcessor { 11 | 12 | protected abstract List getInfoDataByJson(String json); 13 | 14 | @Override 15 | public List crawlHotList() { 16 | return crawlHotListFromAPI(); 17 | } 18 | 19 | private List crawlHotListFromAPI(){ 20 | List infoList = new ArrayList<>(); 21 | String json = getJson(this.httpRequest); 22 | if (json != null){ 23 | try{ 24 | infoList = getInfoDataByJson(json); 25 | 
}catch (PathNotFoundException e){ 26 | log.error("Json path error!", e); 27 | } 28 | } 29 | log.debug("crawl hot list from {}, list size is {}", this.name, infoList.size()); 30 | return handlerCenter.handleData(infoList); 31 | } 32 | 33 | @Override 34 | protected Map generateHeader(){ 35 | // Basic implementation, most situation not need header 36 | return null; 37 | } 38 | 39 | @Override 40 | protected String generateRequestBody(){ 41 | // Basic implementation, most situation not need request body 42 | return null; 43 | } 44 | } 45 | -------------------------------------------------------------------------------- /src/main/java/com/taogen/hotcrawler/commons/crawler/DocumentHotProcessor.java: -------------------------------------------------------------------------------- 1 | package com.taogen.hotcrawler.commons.crawler; 2 | 3 | import com.taogen.hotcrawler.commons.entity.Info; 4 | import org.jsoup.nodes.Document; 5 | import org.jsoup.select.Elements; 6 | 7 | import java.util.ArrayList; 8 | import java.util.List; 9 | import java.util.Map; 10 | 11 | public abstract class DocumentHotProcessor extends AbstractHotProcessor { 12 | 13 | protected abstract Elements getElements(Document document); 14 | protected abstract List getInfoDataByElements(Elements elements); 15 | 16 | @Override 17 | public List crawlHotList() { 18 | return crawlHotListFromDoc(); 19 | } 20 | 21 | private List crawlHotListFromDoc(){ 22 | List infoList = new ArrayList<>(); 23 | Document document = getDocument(this.httpRequest); 24 | if (document != null){ 25 | Elements elements = getElements(document); 26 | if (elements != null){ 27 | log.debug("elements size is {}", elements.size()); 28 | infoList = getInfoDataByElements(elements); 29 | } 30 | } 31 | log.debug("crawl hot list from {}, list size is {}", this.name, infoList.size()); 32 | return handlerCenter.handleData(infoList); 33 | } 34 | 35 | @Override 36 | protected Map generateHeader(){ 37 | // Basic implementation, most situation not need 
header 38 | return null; 39 | } 40 | 41 | @Override 42 | protected String generateRequestBody(){ 43 | // Basic implementation, most situation not need request body 44 | return null; 45 | } 46 | } 47 | -------------------------------------------------------------------------------- /src/main/java/com/taogen/hotcrawler/commons/crawler/HotProcessor.java: -------------------------------------------------------------------------------- 1 | package com.taogen.hotcrawler.commons.crawler; 2 | 3 | import com.taogen.hotcrawler.commons.entity.Info; 4 | 5 | import java.util.List; 6 | 7 | public interface HotProcessor { 8 | List crawlHotList(); 9 | } 10 | -------------------------------------------------------------------------------- /src/main/java/com/taogen/hotcrawler/commons/crawler/MultipleAPIHotProcessor.java: -------------------------------------------------------------------------------- 1 | package com.taogen.hotcrawler.commons.crawler; 2 | 3 | import com.taogen.hotcrawler.commons.entity.Info; 4 | import com.taogen.hotcrawler.commons.vo.HttpRequest; 5 | import lombok.Data; 6 | 7 | import java.util.ArrayList; 8 | import java.util.List; 9 | 10 | @Data 11 | public abstract class MultipleAPIHotProcessor extends AbstractHotProcessor{ 12 | 13 | protected List httpRequestList; 14 | 15 | protected abstract List getInfoListByJson(String json, int index); 16 | 17 | @Override 18 | public List crawlHotList() { 19 | List infoList = getHotInfoListByHttpRequestParams(getHttpRequestList()); 20 | return getHandlerCenter().handleData(infoList); 21 | } 22 | 23 | protected List getHotInfoListByHttpRequestParams(List httpRequestList){ 24 | List returnInfoList = new ArrayList<>(); 25 | if (httpRequestList != null) { 26 | for (int i = 0; i < httpRequestList.size(); i++) { 27 | String json = getJson(httpRequestList.get(i)); 28 | List infoList = getInfoListByJson(json, i); 29 | returnInfoList.addAll(infoList); 30 | } 31 | } 32 | return returnInfoList; 33 | } 34 | } 35 | 
-------------------------------------------------------------------------------- /src/main/java/com/taogen/hotcrawler/commons/crawler/SimpleAPIHotProcessor.java: -------------------------------------------------------------------------------- 1 | package com.taogen.hotcrawler.commons.crawler; 2 | 3 | import com.jayway.jsonpath.JsonPath; 4 | import com.jayway.jsonpath.PathNotFoundException; 5 | import com.taogen.hotcrawler.commons.entity.Info; 6 | import lombok.Data; 7 | 8 | import java.util.ArrayList; 9 | import java.util.List; 10 | 11 | @Data 12 | public abstract class SimpleAPIHotProcessor extends APIHotProcessor { 13 | private List titleJsonPaths; 14 | private List urlJsonPaths; 15 | 16 | @Override 17 | protected List getInfoDataByJson(String json) { 18 | List infoList = new ArrayList<>(); 19 | if (json != null && checkJsonPathList(this.titleJsonPaths, this.urlJsonPaths)){ 20 | for (int i = 0; i < titleJsonPaths.size(); i++){ 21 | try { 22 | List titles = JsonPath.read(json, this.titleJsonPaths.get(i)); 23 | List urls = JsonPath.read(json, this.urlJsonPaths.get(i)); 24 | infoList.addAll(getInfoListByTitlesAndUrls(titles, urls)); 25 | }catch(PathNotFoundException e){ 26 | log.error("Json path error!", e); 27 | } 28 | } 29 | } 30 | return infoList; 31 | } 32 | 33 | private boolean checkJsonPathList(List titleJsonPaths, List urlJsonPaths){ 34 | return titleJsonPaths != null && urlJsonPaths != null && titleJsonPaths.size() == urlJsonPaths.size(); 35 | } 36 | 37 | } 38 | -------------------------------------------------------------------------------- /src/main/java/com/taogen/hotcrawler/commons/crawler/SimpleDocumentHotProcessor.java: -------------------------------------------------------------------------------- 1 | package com.taogen.hotcrawler.commons.crawler; 2 | 3 | import com.taogen.hotcrawler.commons.entity.Info; 4 | import org.jsoup.nodes.Document; 5 | import org.jsoup.nodes.Element; 6 | import org.jsoup.select.Elements; 7 | 8 | import java.util.ArrayList; 9 
| import java.util.List; 10 | 11 | public abstract class SimpleDocumentHotProcessor extends DocumentHotProcessor { 12 | protected String elementClass; 13 | 14 | protected abstract Info getInfoByElement(Element element); 15 | 16 | @Override 17 | protected Elements getElements(Document document) { 18 | return document.getElementsByClass(elementClass); 19 | } 20 | 21 | @Override 22 | protected List getInfoDataByElements(Elements elements) { 23 | List list = new ArrayList<>(); 24 | if (elements != null) { 25 | int i = 0; 26 | for (Element element : elements) { 27 | try { 28 | Info info = getInfoByElement(element); 29 | list.add(info); 30 | } catch (IndexOutOfBoundsException | NullPointerException e) { 31 | log.error("Can't find attribute in element {}!", i, e); 32 | } 33 | i++; 34 | } 35 | } 36 | return list; 37 | } 38 | } 39 | -------------------------------------------------------------------------------- /src/main/java/com/taogen/hotcrawler/commons/crawler/handler/DataHandler.java: -------------------------------------------------------------------------------- 1 | package com.taogen.hotcrawler.commons.crawler.handler; 2 | 3 | import com.taogen.hotcrawler.commons.entity.Info; 4 | import lombok.Data; 5 | 6 | import java.util.List; 7 | 8 | @Data 9 | public abstract class DataHandler { 10 | protected DataHandler nextDataHandler; 11 | 12 | public abstract List handleRequest(List infoList); 13 | 14 | public void setNext(DataHandler datahandler){ 15 | this.nextDataHandler = datahandler; 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /src/main/java/com/taogen/hotcrawler/commons/crawler/handler/HandlerCenter.java: -------------------------------------------------------------------------------- 1 | package com.taogen.hotcrawler.commons.crawler.handler; 2 | 3 | import com.taogen.hotcrawler.api.service.BaseService; 4 | import com.taogen.hotcrawler.commons.crawler.handler.impl.DeduplicationDataHandler; 5 | import 
com.taogen.hotcrawler.commons.entity.Info; 6 | import org.springframework.beans.factory.annotation.Autowired; 7 | import org.springframework.stereotype.Component; 8 | 9 | import javax.annotation.PostConstruct; 10 | import java.util.ArrayList; 11 | import java.util.Arrays; 12 | import java.util.List; 13 | 14 | @Component 15 | public class HandlerCenter { 16 | private DataHandler firstDataHandler; 17 | 18 | @Autowired 19 | private BaseService baseService; 20 | 21 | private final List dataHandlers = new ArrayList<>(Arrays.asList( 22 | DeduplicationDataHandler.class 23 | )); 24 | 25 | @PostConstruct 26 | private void initialize(){ 27 | setHandlerChain(); 28 | } 29 | 30 | private void setHandlerChain(){ 31 | if (! dataHandlers.isEmpty()){ 32 | firstDataHandler = (DataHandler) baseService.getBean(dataHandlers.get(0)); 33 | DataHandler dataHandler = firstDataHandler; 34 | for (int i = 1; i < dataHandlers.size(); i++){ 35 | dataHandler.nextDataHandler = (DataHandler) baseService.getBean(dataHandlers.get(i)); 36 | dataHandler = dataHandler.nextDataHandler; 37 | } 38 | } 39 | } 40 | 41 | public List handleData(List infoList){ 42 | DataHandler dataHandler = firstDataHandler; 43 | while(dataHandler != null){ 44 | infoList = dataHandler.handleRequest(infoList); 45 | dataHandler = dataHandler.getNextDataHandler(); 46 | } 47 | return infoList; 48 | } 49 | 50 | 51 | } 52 | -------------------------------------------------------------------------------- /src/main/java/com/taogen/hotcrawler/commons/crawler/handler/impl/DeduplicationDataHandler.java: -------------------------------------------------------------------------------- 1 | package com.taogen.hotcrawler.commons.crawler.handler.impl; 2 | 3 | import com.taogen.hotcrawler.commons.crawler.handler.DataHandler; 4 | import com.taogen.hotcrawler.commons.entity.Info; 5 | import org.slf4j.Logger; 6 | import org.slf4j.LoggerFactory; 7 | import org.springframework.stereotype.Component; 8 | 9 | import java.util.ArrayList; 10 | import 
java.util.HashSet; 11 | import java.util.List; 12 | import java.util.Set; 13 | 14 | @Component 15 | public class DeduplicationDataHandler extends DataHandler { 16 | 17 | Logger log = LoggerFactory.getLogger(getClass()); 18 | 19 | @Override 20 | public List handleRequest(List infoList) { 21 | List resultList = new ArrayList<>(); 22 | if (infoList == null) 23 | { 24 | return resultList; 25 | } 26 | Set infoUrlSet = new HashSet<>(); 27 | int subtract = 0; 28 | for (int i = 0; i < infoList.size(); i++) 29 | { 30 | Info info = infoList.get(i); 31 | if (info != null) { 32 | if (infoUrlSet.contains(info.getUrl())) { 33 | subtract++; 34 | continue; 35 | } 36 | infoUrlSet.add(info.getUrl()); 37 | info.setId(String.valueOf(i + 1 - subtract)); 38 | resultList.add(info); 39 | } 40 | } 41 | return resultList; 42 | } 43 | } 44 | -------------------------------------------------------------------------------- /src/main/java/com/taogen/hotcrawler/commons/crawler/impl/abroad/BBCNewsHotProcessor.java: -------------------------------------------------------------------------------- 1 | package com.taogen.hotcrawler.commons.crawler.impl.abroad; 2 | 3 | import com.taogen.hotcrawler.commons.config.SiteProperties; 4 | import com.taogen.hotcrawler.commons.constant.RequestMethod; 5 | import com.taogen.hotcrawler.commons.crawler.DocumentHotProcessor; 6 | import com.taogen.hotcrawler.commons.entity.Info; 7 | import org.jsoup.nodes.Document; 8 | import org.jsoup.nodes.Element; 9 | import org.jsoup.select.Elements; 10 | import org.slf4j.LoggerFactory; 11 | import org.springframework.beans.factory.annotation.Autowired; 12 | import org.springframework.context.ApplicationContext; 13 | import org.springframework.stereotype.Component; 14 | 15 | import javax.annotation.PostConstruct; 16 | import java.util.ArrayList; 17 | import java.util.List; 18 | 19 | @Component("BBCNewsHotProcessor") 20 | public class BBCNewsHotProcessor extends DocumentHotProcessor 21 | { 22 | public static final String ITEM_KEY 
= "gs-c-promo-heading"; 23 | 24 | @Autowired 25 | private SiteProperties siteProperties; 26 | 27 | @Autowired 28 | private ApplicationContext context; 29 | 30 | @Override 31 | @PostConstruct 32 | protected void initialize(){ 33 | RequestMethod requestMethod = RequestMethod.GET; 34 | setFieldsByProperties(siteProperties, requestMethod, generateHeader(),generateRequestBody()); 35 | injectBeansByContext(context); 36 | setLog(LoggerFactory.getLogger(getClass())); 37 | } 38 | 39 | @Override 40 | protected Elements getElements(Document document) { 41 | return document.getElementsByClass("nw-c-top-stories--standard").get(0).getElementsByClass(ITEM_KEY); 42 | } 43 | 44 | @Override 45 | protected List getInfoDataByElements(Elements elements) { 46 | List list = new ArrayList<>(); 47 | if (elements != null) { 48 | int i = 0; 49 | for (Element element : elements) { 50 | try { 51 | String infoTitle = element.getElementsByClass("gs-c-promo-heading__title").html(); 52 | StringBuilder infoUrl = new StringBuilder(); 53 | infoUrl.append(this.prefix); 54 | infoUrl.append(element.attr("href")); 55 | String id = String.valueOf(++i); 56 | list.add(new Info(id, infoTitle, infoUrl.toString())); 57 | } catch (IndexOutOfBoundsException e) { 58 | log.error("Can't find attribute!", e); 59 | } 60 | } 61 | } 62 | return list; 63 | } 64 | } 65 | -------------------------------------------------------------------------------- /src/main/java/com/taogen/hotcrawler/commons/crawler/impl/abroad/EconomistHotProcessor.java: -------------------------------------------------------------------------------- 1 | package com.taogen.hotcrawler.commons.crawler.impl.abroad; 2 | 3 | import com.taogen.hotcrawler.commons.config.SiteProperties; 4 | import com.taogen.hotcrawler.commons.constant.RequestMethod; 5 | import com.taogen.hotcrawler.commons.crawler.SimpleDocumentHotProcessor; 6 | import com.taogen.hotcrawler.commons.entity.Info; 7 | import org.jsoup.nodes.Element; 8 | import org.slf4j.LoggerFactory; 9 | 
import org.springframework.beans.factory.annotation.Autowired; 10 | import org.springframework.context.ApplicationContext; 11 | import org.springframework.stereotype.Component; 12 | 13 | import javax.annotation.PostConstruct; 14 | 15 | @Component("EconomistHotProcessor") 16 | public class EconomistHotProcessor extends SimpleDocumentHotProcessor 17 | { 18 | public static final String ITEM_KEY = "teaser"; 19 | 20 | @Autowired 21 | private SiteProperties siteProperties; 22 | 23 | @Autowired 24 | private ApplicationContext context; 25 | 26 | @Override 27 | @PostConstruct 28 | protected void initialize(){ 29 | RequestMethod requestMethod = RequestMethod.GET; 30 | setFieldsByProperties(siteProperties, requestMethod, generateHeader(),generateRequestBody()); 31 | injectBeansByContext(context); 32 | setLog(LoggerFactory.getLogger(getClass())); 33 | this.elementClass = ITEM_KEY; 34 | } 35 | 36 | @Override 37 | protected Info getInfoByElement(Element element) { 38 | Element item = element.getElementsByClass("teaser__link").get(0); 39 | String infoTitle = item.attr("aria-label"); 40 | StringBuilder infoUrl = new StringBuilder(); 41 | infoUrl.append(this.prefix); 42 | infoUrl.append(item.attr("href")); 43 | Info info = new Info(); 44 | info.setTitle(infoTitle); 45 | info.setUrl(infoUrl.toString()); 46 | return info; 47 | } 48 | } 49 | -------------------------------------------------------------------------------- /src/main/java/com/taogen/hotcrawler/commons/crawler/impl/abroad/HackernewsHotProcessor.java: -------------------------------------------------------------------------------- 1 | package com.taogen.hotcrawler.commons.crawler.impl.abroad; 2 | 3 | import com.taogen.hotcrawler.commons.config.SiteProperties; 4 | import com.taogen.hotcrawler.commons.constant.RequestMethod; 5 | import com.taogen.hotcrawler.commons.crawler.DocumentHotProcessor; 6 | import com.taogen.hotcrawler.commons.entity.Info; 7 | import org.jsoup.nodes.Document; 8 | import org.jsoup.select.Elements; 9 | 
import org.slf4j.LoggerFactory; 10 | import org.springframework.beans.factory.annotation.Autowired; 11 | import org.springframework.context.ApplicationContext; 12 | import org.springframework.stereotype.Component; 13 | 14 | import javax.annotation.PostConstruct; 15 | import java.util.ArrayList; 16 | import java.util.List; 17 | import java.util.Map; 18 | 19 | @Component("HackernewsHotProcessor") 20 | public class HackernewsHotProcessor extends DocumentHotProcessor 21 | { 22 | private static final String ITEM_KEY = "storylink"; 23 | private Elements titleElements; 24 | private Elements urlElements; 25 | 26 | @Autowired 27 | private SiteProperties siteProperties; 28 | 29 | @Autowired 30 | private ApplicationContext context; 31 | 32 | @Override 33 | @PostConstruct 34 | protected void initialize(){ 35 | RequestMethod requestMethod = RequestMethod.GET; 36 | setFieldsByProperties(siteProperties, requestMethod, generateHeader(),generateRequestBody()); 37 | injectBeansByContext(context); 38 | setLog(LoggerFactory.getLogger(getClass())); 39 | } 40 | 41 | @Override 42 | protected Elements getElements(Document document) { 43 | this.titleElements = document.getElementsByClass(ITEM_KEY); 44 | this.urlElements = document.getElementsByClass("subtext"); 45 | return null; 46 | } 47 | 48 | @Override 49 | protected List getInfoDataByElements(Elements elements) { 50 | List list = new ArrayList<>(); 51 | if (this.titleElements != null && this.urlElements != null) { 52 | for (int i = 0; i < titleElements.size(); i++) { 53 | String id = String.valueOf(i + 1); 54 | Elements aElements = urlElements.get(i).getElementsByTag("a"); 55 | String infoUrl = aElements.get(aElements.size() - 1).attr("href"); 56 | infoUrl = this.prefix + "/" + infoUrl; 57 | String infoTitle = titleElements.get(i).html(); 58 | list.add(new Info(id, infoTitle, infoUrl)); 59 | } 60 | } 61 | return list; 62 | } 63 | 64 | @Override 65 | protected Map generateHeader() { 66 | return getBasicHeaders(); 67 | } 68 | 69 | 
@Override 70 | protected String generateRequestBody() { 71 | return null; 72 | } 73 | } 74 | -------------------------------------------------------------------------------- /src/main/java/com/taogen/hotcrawler/commons/crawler/impl/abroad/LobstersHotProcessor.java: -------------------------------------------------------------------------------- 1 | package com.taogen.hotcrawler.commons.crawler.impl.abroad; 2 | 3 | import com.taogen.hotcrawler.commons.config.SiteProperties; 4 | import com.taogen.hotcrawler.commons.constant.RequestMethod; 5 | import com.taogen.hotcrawler.commons.crawler.SimpleDocumentHotProcessor; 6 | import com.taogen.hotcrawler.commons.entity.Info; 7 | import org.jsoup.nodes.Element; 8 | import org.slf4j.LoggerFactory; 9 | import org.springframework.beans.factory.annotation.Autowired; 10 | import org.springframework.context.ApplicationContext; 11 | import org.springframework.stereotype.Component; 12 | 13 | import javax.annotation.PostConstruct; 14 | 15 | @Component("LobstersHotProcessor") 16 | public class LobstersHotProcessor extends SimpleDocumentHotProcessor { 17 | public static final String ITEM_KEY = "story"; 18 | 19 | @Autowired 20 | private SiteProperties siteProperties; 21 | 22 | @Autowired 23 | private ApplicationContext context; 24 | 25 | @Override 26 | @PostConstruct 27 | protected void initialize(){ 28 | RequestMethod requestMethod = RequestMethod.GET; 29 | setFieldsByProperties(siteProperties, requestMethod, generateHeader(),generateRequestBody()); 30 | injectBeansByContext(context); 31 | setLog(LoggerFactory.getLogger(getClass())); 32 | this.elementClass = ITEM_KEY; 33 | } 34 | 35 | @Override 36 | protected Info getInfoByElement(Element element) { 37 | String infoTitle = element.getElementsByClass("u-url").get(0).html(); 38 | StringBuilder infoUrl = new StringBuilder(); 39 | infoUrl.append(this.prefix); 40 | infoUrl.append(element.getElementsByClass("comments_label").get(0).getElementsByTag("a").get(0).attr("href")); 41 | Info info = 
new Info(); 42 | info.setTitle(infoTitle); 43 | info.setUrl(infoUrl.toString()); 44 | return info; 45 | } 46 | } 47 | -------------------------------------------------------------------------------- /src/main/java/com/taogen/hotcrawler/commons/crawler/impl/abroad/RedditHotProcessor.java: -------------------------------------------------------------------------------- 1 | package com.taogen.hotcrawler.commons.crawler.impl.abroad;//package com.taogen.hotcrawler.commons.crawler.impl.abroad; 2 | // 3 | //import com.taogen.hotcrawler.commons.crawler.HotProcessor; 4 | //import com.taogen.hotcrawler.commons.entity.Info; 5 | //import org.jsoup.nodes.Document; 6 | //import org.jsoup.nodes.Element; 7 | //import org.jsoup.select.Elements; 8 | //import org.slf4j.Logger; 9 | //import org.slf4j.LoggerFactory; 10 | //import org.springframework.beans.factory.annotation.Autowired; 11 | //import org.springframework.stereotype.Component; 12 | // 13 | //import java.util.ArrayList; 14 | //import java.util.List; 15 | // 16 | //@Component("RedditHotProcessor") 17 | //public class RedditHotProcessor implements HotProcessor 18 | //{ 19 | // private static final Logger log = LoggerFactory.getLogger(RedditHotProcessor.class); 20 | // 21 | // @Autowired 22 | // private BaseHotProcessor baseHotProcessor; 23 | // 24 | // public static final String DOMAIN = "https://www.reddit.com"; 25 | // public static final String HOT_PAGE_URL = "https://www.reddit.com/hot/"; 26 | // public static final String ITEM_KEY = "_1oQyIsiPHYt6nx7VOmd1sz"; // title: ._eYtD2XCVieq6emjKBH3m , url: SQnoC3ObvgnGjWt90zD9Z 27 | // 28 | // @Override 29 | // public List crawlHotList() 30 | // { 31 | // List list = new ArrayList<>(); 32 | // 33 | // // document 34 | // Document doc = baseHotProcessor.getDoc(HOT_PAGE_URL, null, log); 35 | // if (doc == null) 36 | // { 37 | // return list; 38 | // } 39 | // 40 | // // elements 41 | // Elements elements = doc.getElementsByClass(ITEM_KEY); 42 | // 43 | // int i = 0; 44 | // for 
(Element element : elements) 45 | // { 46 | // try 47 | // { 48 | // String infoTitle = element.getElementsByClass("_eYtD2XCVieq6emjKBH3m").get(0).html(); 49 | // StringBuilder infoUrl = new StringBuilder(); 50 | // infoUrl.append(DOMAIN); 51 | // infoUrl.append(element.getElementsByClass("SQnoC3ObvgnGjWt90zD9Z").get(0).attr("href")); 52 | // String id = String.valueOf(++i); 53 | // list.add(new Info(id, infoTitle, infoUrl.toString())); 54 | // } 55 | // catch(IndexOutOfBoundsException e) 56 | // { 57 | // log.error("Can't find attribute!", e); 58 | // } 59 | // } 60 | // 61 | // return baseHotProcessor.handleData(list); 62 | // } 63 | //} 64 | -------------------------------------------------------------------------------- /src/main/java/com/taogen/hotcrawler/commons/crawler/impl/abroad/TechmemeHotProcessor.java: -------------------------------------------------------------------------------- 1 | package com.taogen.hotcrawler.commons.crawler.impl.abroad; 2 | 3 | import com.taogen.hotcrawler.commons.config.SiteProperties; 4 | import com.taogen.hotcrawler.commons.constant.RequestMethod; 5 | import com.taogen.hotcrawler.commons.crawler.SimpleDocumentHotProcessor; 6 | import com.taogen.hotcrawler.commons.entity.Info; 7 | import org.jsoup.nodes.Element; 8 | import org.slf4j.LoggerFactory; 9 | import org.springframework.beans.factory.annotation.Autowired; 10 | import org.springframework.context.ApplicationContext; 11 | import org.springframework.stereotype.Component; 12 | 13 | import javax.annotation.PostConstruct; 14 | 15 | @Component("TechmemeHotProcessor") 16 | public class TechmemeHotProcessor extends SimpleDocumentHotProcessor 17 | { 18 | public static final String ITEM_KEY = "item"; 19 | 20 | @Autowired 21 | private SiteProperties siteProperties; 22 | 23 | @Autowired 24 | private ApplicationContext context; 25 | 26 | @Override 27 | @PostConstruct 28 | protected void initialize(){ 29 | RequestMethod requestMethod = RequestMethod.GET; 30 | 
setFieldsByProperties(siteProperties, requestMethod, generateHeader(),generateRequestBody()); 31 | injectBeansByContext(context); 32 | setLog(LoggerFactory.getLogger(getClass())); 33 | this.elementClass = ITEM_KEY; 34 | } 35 | 36 | @Override 37 | protected Info getInfoByElement(Element element) { 38 | Element item = element.getElementsByClass("ourh").get(0); 39 | String infoTitle = item.html(); 40 | StringBuilder infoUrl = new StringBuilder(); 41 | infoUrl.append(item.attr("href")); 42 | Info info = new Info(); 43 | info.setTitle(infoTitle); 44 | info.setUrl(infoUrl.toString()); 45 | return info; 46 | } 47 | } 48 | -------------------------------------------------------------------------------- /src/main/java/com/taogen/hotcrawler/commons/crawler/impl/abroad/TheNewYorkTimesHotProcessor.java: -------------------------------------------------------------------------------- 1 | //package com.taogen.hotcrawler.commons.crawler.impl.abroad; 2 | // 3 | //import com.taogen.hotcrawler.commons.config.SiteProperties; 4 | //import com.taogen.hotcrawler.commons.constant.RequestMethod; 5 | //import com.taogen.hotcrawler.commons.crawler.DocumentHotProcessor; 6 | //import com.taogen.hotcrawler.commons.entity.Info; 7 | //import org.jsoup.nodes.Document; 8 | //import org.jsoup.nodes.Element; 9 | //import org.jsoup.select.Elements; 10 | //import org.slf4j.LoggerFactory; 11 | //import org.springframework.beans.factory.annotation.Autowired; 12 | //import org.springframework.context.ApplicationContext; 13 | //import org.springframework.stereotype.Component; 14 | // 15 | //import javax.annotation.PostConstruct; 16 | //import java.util.ArrayList; 17 | //import java.util.List; 18 | // 19 | //@Component("TheNewYorkTimesHotProcessor") 20 | //public class TheNewYorkTimesHotProcessor extends DocumentHotProcessor { 21 | // 22 | // public static final String FIRST_ITEM_KEY = "css-2pep1h"; 23 | // public static final String OTHER_ITEM_KEY = "css-1iski2w"; 24 | // 25 | // @Autowired 26 | // 
private SiteProperties siteProperties; 27 | // 28 | // @Autowired 29 | // private ApplicationContext context; 30 | // 31 | // @Override 32 | // @PostConstruct 33 | // protected void initialize(){ 34 | // RequestMethod requestMethod = RequestMethod.GET; 35 | // setFieldsByProperties(siteProperties, requestMethod, generateHeader(),generateRequestBody()); 36 | // injectBeansByContext(context); 37 | // setLog(LoggerFactory.getLogger(getClass())); 38 | // } 39 | // 40 | // @Override 41 | // protected Elements getElements(Document document) { 42 | // Elements firstElements = document.getElementsByClass(FIRST_ITEM_KEY); 43 | // Elements otherElements = document.getElementsByClass(OTHER_ITEM_KEY); 44 | // Elements resultElements = new Elements(); 45 | // resultElements.addAll(firstElements); 46 | // resultElements.addAll(otherElements); 47 | // return resultElements; 48 | // } 49 | // 50 | // @Override 51 | // protected List getInfoDataByElements(Elements elements) { 52 | // List infoList = new ArrayList<>(); 53 | // String firstTitle = elements.get(0).getElementsByTag("a").attr("href"); 54 | // String firstUrl = elements.get(0).getElementsByClass("css-x01ngn").get(0).html(); 55 | // infoList.add(new Info(firstTitle, firstUrl)); 56 | // for (int i = 1; i < elements.size(); i++){ 57 | // Element element = elements.get(i).getElementsByTag("a").get(0); 58 | // String infoUrl = element.attr("href"); 59 | // String infoTitle = element.getElementsByClass("css-14kabif").get(0).html(); 60 | // infoList.add(new Info(infoTitle, infoUrl)); 61 | // } 62 | // return infoList; 63 | // } 64 | //} 65 | -------------------------------------------------------------------------------- /src/main/java/com/taogen/hotcrawler/commons/crawler/impl/abroad/YouTubeHotProcessor.java: -------------------------------------------------------------------------------- 1 | package com.taogen.hotcrawler.commons.crawler.impl.abroad;//package com.taogen.hotcrawler.commons.crawler.impl.abroad; 2 | // 3 | 
//import com.taogen.hotcrawler.commons.crawler.HotProcessor; 4 | //import com.taogen.hotcrawler.commons.entity.Info; 5 | //import org.jsoup.Connection; 6 | //import org.jsoup.Jsoup; 7 | //import org.jsoup.nodes.Document; 8 | //import org.jsoup.nodes.Element; 9 | //import org.jsoup.select.Elements; 10 | //import org.slf4j.Logger; 11 | //import org.slf4j.LoggerFactory; 12 | //import org.springframework.beans.factory.annotation.Autowired; 13 | //import org.springframework.stereotype.Component; 14 | // 15 | //import java.io.IOException; 16 | //import java.util.ArrayList; 17 | //import java.util.List; 18 | // 19 | //@Component("YouTubeHotProcessor") 20 | //public class YouTubeHotProcessor implements HotProcessor 21 | //{ 22 | // private static final Logger log = LoggerFactory.getLogger(YouTubeHotProcessor.class); 23 | // 24 | // @Autowired 25 | // private BaseHotProcessor baseHotProcessor; 26 | // 27 | // public static final String DOMAIN = "https://www.youtube.com"; 28 | // public static final String HOT_PAGE_URL = "https://www.youtube.com/feed/trending"; 29 | // public static final String ITEM_KEY = "title-and-badge"; 30 | // 31 | // @Override 32 | // public List crawlHotList() 33 | // { 34 | // List list = new ArrayList<>(); 35 | // 36 | // // document 37 | // Document doc = null; 38 | // Connection connection = Jsoup.connect(HOT_PAGE_URL); 39 | // try 40 | // { 41 | // doc = connection.timeout(10 * 1000).get(); 42 | // } 43 | // catch (IOException e) 44 | // { 45 | // log.error("Fail to connect!", e); 46 | // } 47 | // if (doc == null) 48 | // { 49 | // return list; 50 | // } 51 | // 52 | // // elements 53 | // Elements elements = doc.getElementsByClass(ITEM_KEY); 54 | // 55 | // int i = 0; 56 | // for (Element element : elements) 57 | // { 58 | // try 59 | // { 60 | // Element item = element.getElementsByTag("a").get(0); 61 | // String infoTitle = item.html(); 62 | // StringBuilder infoUrl = new StringBuilder(); 63 | // infoUrl.append(DOMAIN); 64 | // 
infoUrl.append(item.attr("href")); 65 | // String id = String.valueOf(++i); 66 | // list.add(new Info(id, infoTitle, infoUrl.toString())); 67 | // } 68 | // catch(IndexOutOfBoundsException e) 69 | // { 70 | // log.error("Can't find attribute!", e); 71 | // } 72 | // } 73 | // 74 | // return baseHotProcessor.handleData(list); 75 | // } 76 | //} 77 | -------------------------------------------------------------------------------- /src/main/java/com/taogen/hotcrawler/commons/crawler/impl/news/GeekParkHotProcessor.java: -------------------------------------------------------------------------------- 1 | package com.taogen.hotcrawler.commons.crawler.impl.news; 2 | 3 | import com.taogen.hotcrawler.commons.config.SiteProperties; 4 | import com.taogen.hotcrawler.commons.constant.RequestMethod; 5 | import com.taogen.hotcrawler.commons.crawler.SimpleDocumentHotProcessor; 6 | import com.taogen.hotcrawler.commons.entity.Info; 7 | import org.jsoup.nodes.Element; 8 | import org.slf4j.LoggerFactory; 9 | import org.springframework.beans.factory.annotation.Autowired; 10 | import org.springframework.context.ApplicationContext; 11 | import org.springframework.stereotype.Component; 12 | 13 | import javax.annotation.PostConstruct; 14 | 15 | @Component("GeekParkHotProcessor") 16 | public class GeekParkHotProcessor extends SimpleDocumentHotProcessor 17 | { 18 | public static final String ITEM_KEY = "article-item"; 19 | 20 | @Autowired 21 | private SiteProperties siteProperties; 22 | 23 | @Autowired 24 | private ApplicationContext context; 25 | 26 | @Override 27 | @PostConstruct 28 | protected void initialize(){ 29 | RequestMethod requestMethod = RequestMethod.GET; 30 | setFieldsByProperties(siteProperties, requestMethod, generateHeader(),generateRequestBody()); 31 | injectBeansByContext(context); 32 | setLog(LoggerFactory.getLogger(getClass())); 33 | this.elementClass = ITEM_KEY; 34 | } 35 | 36 | @Override 37 | protected Info getInfoByElement(Element element) { 38 | Element item = 
element.getElementsByClass("img-cover-wrap").get(0); 39 | String infoTitle = item.attr("data-event-label"); 40 | int index = infoTitle.indexOf("/news"); 41 | if (index != -1) { 42 | infoTitle = infoTitle.substring(0, index); 43 | } 44 | StringBuilder infoUrl = new StringBuilder(); 45 | infoUrl.append(this.prefix); 46 | infoUrl.append(item.attr("href")); 47 | Info info = new Info(); 48 | info.setTitle(infoTitle); 49 | info.setUrl(infoUrl.toString()); 50 | return info; 51 | } 52 | 53 | } 54 | -------------------------------------------------------------------------------- /src/main/java/com/taogen/hotcrawler/commons/crawler/impl/news/HuxiuHotProcessor.java: -------------------------------------------------------------------------------- 1 | package com.taogen.hotcrawler.commons.crawler.impl.news; 2 | 3 | import com.taogen.hotcrawler.commons.config.SiteProperties; 4 | import com.taogen.hotcrawler.commons.constant.RequestMethod; 5 | import com.taogen.hotcrawler.commons.crawler.SimpleDocumentHotProcessor; 6 | import com.taogen.hotcrawler.commons.entity.Info; 7 | import org.jsoup.nodes.Element; 8 | import org.slf4j.LoggerFactory; 9 | import org.springframework.beans.factory.annotation.Autowired; 10 | import org.springframework.context.ApplicationContext; 11 | import org.springframework.stereotype.Component; 12 | 13 | import javax.annotation.PostConstruct; 14 | 15 | @Component("HuxiuHotProcessor") 16 | public class HuxiuHotProcessor extends SimpleDocumentHotProcessor 17 | { 18 | public static final String ITEM_KEY = "article-items"; 19 | 20 | @Autowired 21 | private SiteProperties siteProperties; 22 | 23 | @Autowired 24 | private ApplicationContext context; 25 | 26 | @Override 27 | @PostConstruct 28 | protected void initialize(){ 29 | RequestMethod requestMethod = RequestMethod.GET; 30 | setFieldsByProperties(siteProperties, requestMethod, generateHeader(),generateRequestBody()); 31 | injectBeansByContext(context); 32 | setLog(LoggerFactory.getLogger(getClass())); 33 | 
this.elementClass = ITEM_KEY; 34 | } 35 | 36 | @Override 37 | protected Info getInfoByElement(Element element) { 38 | Element titleItem = element.getElementsByClass("article-item__content__title").get(0); 39 | String infoTitle = titleItem.html(); 40 | Element urlItem = element.getElementsByClass("article-item__img").get(0).parent(); 41 | StringBuilder infoUrl = new StringBuilder(this.prefix); 42 | infoUrl.append(urlItem.attr("href")); 43 | Info info = new Info(); 44 | info.setTitle(infoTitle); 45 | info.setUrl(infoUrl.toString()); 46 | return info; 47 | } 48 | } 49 | -------------------------------------------------------------------------------- /src/main/java/com/taogen/hotcrawler/commons/crawler/impl/news/IfanrHotProcessor.java: -------------------------------------------------------------------------------- 1 | package com.taogen.hotcrawler.commons.crawler.impl.news; 2 | 3 | import com.taogen.hotcrawler.commons.config.SiteProperties; 4 | import com.taogen.hotcrawler.commons.constant.RequestMethod; 5 | import com.taogen.hotcrawler.commons.crawler.SimpleDocumentHotProcessor; 6 | import com.taogen.hotcrawler.commons.entity.Info; 7 | import org.jsoup.nodes.Element; 8 | import org.slf4j.LoggerFactory; 9 | import org.springframework.beans.factory.annotation.Autowired; 10 | import org.springframework.context.ApplicationContext; 11 | import org.springframework.stereotype.Component; 12 | 13 | import javax.annotation.PostConstruct; 14 | 15 | @Component("IfanrHotProcessor") 16 | public class IfanrHotProcessor extends SimpleDocumentHotProcessor { 17 | public static final String ITEM_KEY = "article-item"; 18 | 19 | @Autowired 20 | private SiteProperties siteProperties; 21 | 22 | @Autowired 23 | private ApplicationContext context; 24 | 25 | @Override 26 | @PostConstruct 27 | protected void initialize(){ 28 | RequestMethod requestMethod = RequestMethod.GET; 29 | setFieldsByProperties(siteProperties, requestMethod, generateHeader(),generateRequestBody()); 30 | 
injectBeansByContext(context); 31 | setLog(LoggerFactory.getLogger(getClass())); 32 | this.elementClass = ITEM_KEY; 33 | } 34 | 35 | @Override 36 | protected Info getInfoByElement(Element element) { 37 | element = element.getElementsByClass("js-title-transform").get(0); 38 | String infoUrl = element.attr("href"); 39 | String infoTitle = element.html(); 40 | return new Info(infoTitle, infoUrl); 41 | } 42 | } 43 | -------------------------------------------------------------------------------- /src/main/java/com/taogen/hotcrawler/commons/crawler/impl/news/NatureHotProcessor.java: -------------------------------------------------------------------------------- 1 | package com.taogen.hotcrawler.commons.crawler.impl.news; 2 | 3 | import com.taogen.hotcrawler.commons.config.SiteProperties; 4 | import com.taogen.hotcrawler.commons.constant.RequestMethod; 5 | import com.taogen.hotcrawler.commons.crawler.SimpleDocumentHotProcessor; 6 | import com.taogen.hotcrawler.commons.entity.Info; 7 | import org.jsoup.nodes.Element; 8 | import org.slf4j.LoggerFactory; 9 | import org.springframework.beans.factory.annotation.Autowired; 10 | import org.springframework.context.ApplicationContext; 11 | import org.springframework.stereotype.Component; 12 | 13 | import javax.annotation.PostConstruct; 14 | 15 | @Component("NatureHotProcessor") 16 | public class NatureHotProcessor extends SimpleDocumentHotProcessor 17 | { 18 | public static final String ITEM_KEY = "c-article-item__wrapper"; 19 | 20 | @Autowired 21 | private SiteProperties siteProperties; 22 | 23 | @Autowired 24 | private ApplicationContext context; 25 | 26 | @Override 27 | @PostConstruct 28 | protected void initialize(){ 29 | RequestMethod requestMethod = RequestMethod.GET; 30 | setFieldsByProperties(siteProperties, requestMethod, generateHeader(),generateRequestBody()); 31 | injectBeansByContext(context); 32 | setLog(LoggerFactory.getLogger(getClass())); 33 | this.elementClass = ITEM_KEY; 34 | } 35 | 36 | @Override 37 | 
protected Info getInfoByElement(Element element) { 38 | String infoTitle = element.getElementsByClass("c-article-item__title").get(0).html(); 39 | StringBuilder infoUrl = new StringBuilder(); 40 | infoUrl.append(this.prefix); 41 | infoUrl.append(element.getElementsByTag("a").get(0).attr("href")); 42 | Info info = new Info(); 43 | info.setTitle(infoTitle); 44 | info.setUrl(infoUrl.toString()); 45 | return info; 46 | } 47 | } 48 | -------------------------------------------------------------------------------- /src/main/java/com/taogen/hotcrawler/commons/crawler/impl/news/ReadhubHotProcessor.java: -------------------------------------------------------------------------------- 1 | package com.taogen.hotcrawler.commons.crawler.impl.news; 2 | 3 | import com.jayway.jsonpath.JsonPath; 4 | import com.taogen.hotcrawler.commons.config.SiteProperties; 5 | import com.taogen.hotcrawler.commons.constant.RequestMethod; 6 | import com.taogen.hotcrawler.commons.crawler.APIHotProcessor; 7 | import com.taogen.hotcrawler.commons.entity.Info; 8 | import org.slf4j.LoggerFactory; 9 | import org.springframework.beans.factory.annotation.Autowired; 10 | import org.springframework.context.ApplicationContext; 11 | import org.springframework.stereotype.Component; 12 | 13 | import javax.annotation.PostConstruct; 14 | import java.util.ArrayList; 15 | import java.util.List; 16 | 17 | @Component("ReadhubHotProcessor") 18 | public class ReadhubHotProcessor extends APIHotProcessor 19 | { 20 | @Autowired 21 | private SiteProperties siteProperties; 22 | 23 | @Autowired 24 | private ApplicationContext context; 25 | 26 | @Override 27 | @PostConstruct 28 | protected void initialize(){ 29 | RequestMethod requestMethod = RequestMethod.GET; 30 | setFieldsByProperties(siteProperties, requestMethod, generateHeader(),generateRequestBody()); 31 | injectBeansByContext(context); 32 | setLog(LoggerFactory.getLogger(getClass())); 33 | } 34 | 35 | @Override 36 | protected List getInfoDataByJson(String json) { 37 | 
List list = new ArrayList<>(); 38 | List titles = JsonPath.read(json, "$.data.[*].title"); 39 | List urls = JsonPath.read(json, "$.data.[*].id"); 40 | 41 | for (int i = 0; i < urls.size(); i++) 42 | { 43 | urls.set(i, new StringBuilder(this.prefix).append("/").append(urls.get(i)).toString()); 44 | } 45 | 46 | for (int i = 0; i < titles.size(); i++) 47 | { 48 | list.add(new Info(String.valueOf(i), titles.get(i), urls.get(i))); 49 | } 50 | return list; 51 | } 52 | } 53 | -------------------------------------------------------------------------------- /src/main/java/com/taogen/hotcrawler/commons/crawler/impl/news/SolidotHotProcessor.java: -------------------------------------------------------------------------------- 1 | package com.taogen.hotcrawler.commons.crawler.impl.news; 2 | 3 | import com.taogen.hotcrawler.commons.config.SiteProperties; 4 | import com.taogen.hotcrawler.commons.constant.RequestMethod; 5 | import com.taogen.hotcrawler.commons.crawler.SimpleDocumentHotProcessor; 6 | import com.taogen.hotcrawler.commons.entity.Info; 7 | import org.jsoup.nodes.Element; 8 | import org.jsoup.select.Elements; 9 | import org.slf4j.LoggerFactory; 10 | import org.springframework.beans.factory.annotation.Autowired; 11 | import org.springframework.context.ApplicationContext; 12 | import org.springframework.stereotype.Component; 13 | 14 | import javax.annotation.PostConstruct; 15 | 16 | @Component("SolidotHotProcessor") 17 | public class SolidotHotProcessor extends SimpleDocumentHotProcessor 18 | { 19 | public static final String ITEM_KEY = "block_m"; 20 | 21 | @Autowired 22 | private SiteProperties siteProperties; 23 | 24 | @Autowired 25 | private ApplicationContext context; 26 | 27 | @Override 28 | @PostConstruct 29 | protected void initialize(){ 30 | RequestMethod requestMethod = RequestMethod.GET; 31 | setFieldsByProperties(siteProperties, requestMethod, generateHeader(),generateRequestBody()); 32 | injectBeansByContext(context); 33 | 
setLog(LoggerFactory.getLogger(getClass())); 34 | this.elementClass = ITEM_KEY; 35 | } 36 | 37 | @Override 38 | protected Info getInfoByElement(Element element) { 39 | Elements elements = element.getElementsByClass("bg_htit").get(0).getElementsByTag("h2").get(0).getElementsByTag("a"); 40 | Element item = elements.get(elements.size() - 1); 41 | String infoTitle = item.html(); 42 | StringBuilder infoUrl = new StringBuilder(); 43 | infoUrl.append(this.prefix); 44 | infoUrl.append(item.attr("href")); 45 | Info info = new Info(); 46 | info.setTitle(infoTitle); 47 | info.setUrl(infoUrl.toString()); 48 | return info; 49 | } 50 | } 51 | -------------------------------------------------------------------------------- /src/main/java/com/taogen/hotcrawler/commons/crawler/impl/slack/DoubanHotProcessor.java: -------------------------------------------------------------------------------- 1 | package com.taogen.hotcrawler.commons.crawler.impl.slack; 2 | 3 | import com.taogen.hotcrawler.commons.config.SiteProperties; 4 | import com.taogen.hotcrawler.commons.constant.RequestMethod; 5 | import com.taogen.hotcrawler.commons.crawler.SimpleDocumentHotProcessor; 6 | import com.taogen.hotcrawler.commons.entity.Info; 7 | import org.jsoup.nodes.Element; 8 | import org.slf4j.LoggerFactory; 9 | import org.springframework.beans.factory.annotation.Autowired; 10 | import org.springframework.context.ApplicationContext; 11 | import org.springframework.stereotype.Component; 12 | 13 | import javax.annotation.PostConstruct; 14 | 15 | @Component("DoubanHotProcessor") 16 | public class DoubanHotProcessor extends SimpleDocumentHotProcessor 17 | { 18 | public static final String ITEM_KEY = "channel-item"; 19 | 20 | @Autowired 21 | private SiteProperties siteProperties; 22 | 23 | @Autowired 24 | private ApplicationContext context; 25 | 26 | @Override 27 | @PostConstruct 28 | protected void initialize(){ 29 | RequestMethod requestMethod = RequestMethod.GET; 30 | setFieldsByProperties(siteProperties, 
requestMethod, generateHeader(),generateRequestBody()); 31 | injectBeansByContext(context); 32 | setLog(LoggerFactory.getLogger(getClass())); 33 | this.elementClass = ITEM_KEY; 34 | } 35 | 36 | @Override 37 | protected Info getInfoByElement(Element element) { 38 | element = element.getElementsByClass("bd").get(0).getElementsByTag("a").get(0); 39 | String infoUrl = element.attr("href"); 40 | String infoTitle = element.html(); 41 | Info info = new Info(); 42 | info.setTitle(infoTitle); 43 | info.setUrl(infoUrl); 44 | return info; 45 | } 46 | } 47 | -------------------------------------------------------------------------------- /src/main/java/com/taogen/hotcrawler/commons/crawler/impl/slack/DoubanTopicsHotProcessor.java: -------------------------------------------------------------------------------- 1 | package com.taogen.hotcrawler.commons.crawler.impl.slack; 2 | 3 | import com.taogen.hotcrawler.commons.config.SiteProperties; 4 | import com.taogen.hotcrawler.commons.constant.RequestMethod; 5 | import com.taogen.hotcrawler.commons.crawler.DocumentHotProcessor; 6 | import com.taogen.hotcrawler.commons.entity.Info; 7 | import org.jsoup.nodes.Document; 8 | import org.jsoup.nodes.Element; 9 | import org.jsoup.select.Elements; 10 | import org.slf4j.LoggerFactory; 11 | import org.springframework.beans.factory.annotation.Autowired; 12 | import org.springframework.context.ApplicationContext; 13 | import org.springframework.stereotype.Component; 14 | 15 | import javax.annotation.PostConstruct; 16 | import java.util.ArrayList; 17 | import java.util.List; 18 | 19 | @Component 20 | public class DoubanTopicsHotProcessor extends DocumentHotProcessor { 21 | 22 | @Autowired 23 | private SiteProperties siteProperties; 24 | 25 | @Autowired 26 | private ApplicationContext context; 27 | 28 | @Override 29 | @PostConstruct 30 | protected void initialize(){ 31 | RequestMethod requestMethod = RequestMethod.GET; 32 | setFieldsByProperties(siteProperties, requestMethod, 
generateHeader(),generateRequestBody()); 33 | injectBeansByContext(context); 34 | setLog(LoggerFactory.getLogger(getClass())); 35 | } 36 | 37 | @Override 38 | protected Elements getElements(Document document) { 39 | return document.getElementsByClass("trend").get(0).getElementsByTag("li"); 40 | } 41 | 42 | @Override 43 | protected List getInfoDataByElements(Elements elements) { 44 | List infoList = new ArrayList<>(); 45 | for (Element element : elements){ 46 | String infoTitle = element.getElementsByTag("a").get(0).text(); 47 | String infoUrl = element.getElementsByTag("a").get(0).attr("href"); 48 | infoList.add(new Info(infoTitle, infoUrl)); 49 | } 50 | return infoList; 51 | } 52 | } 53 | -------------------------------------------------------------------------------- /src/main/java/com/taogen/hotcrawler/commons/crawler/impl/slack/HupuHotProcessor.java: -------------------------------------------------------------------------------- 1 | package com.taogen.hotcrawler.commons.crawler.impl.slack; 2 | 3 | import com.taogen.hotcrawler.commons.config.SiteProperties; 4 | import com.taogen.hotcrawler.commons.constant.RequestMethod; 5 | import com.taogen.hotcrawler.commons.crawler.DocumentHotProcessor; 6 | import com.taogen.hotcrawler.commons.entity.Info; 7 | import org.jsoup.nodes.Document; 8 | import org.jsoup.nodes.Element; 9 | import org.jsoup.select.Elements; 10 | import org.slf4j.LoggerFactory; 11 | import org.springframework.beans.factory.annotation.Autowired; 12 | import org.springframework.context.ApplicationContext; 13 | import org.springframework.stereotype.Component; 14 | 15 | import javax.annotation.PostConstruct; 16 | import java.util.ArrayList; 17 | import java.util.List; 18 | 19 | @Component 20 | public class HupuHotProcessor extends DocumentHotProcessor { 21 | public static final String ITEM_KEY = "indexs"; 22 | 23 | @Autowired 24 | private SiteProperties siteProperties; 25 | 26 | @Autowired 27 | private ApplicationContext context; 28 | 29 | @Override 30 | 
@PostConstruct 31 | protected void initialize(){ 32 | RequestMethod requestMethod = RequestMethod.GET; 33 | setFieldsByProperties(siteProperties, requestMethod, generateHeader(),generateRequestBody()); 34 | injectBeansByContext(context); 35 | setLog(LoggerFactory.getLogger(getClass())); 36 | } 37 | 38 | 39 | @Override 40 | protected Elements getElements(Document document) { 41 | return document.getElementsByClass("list").get(0).getElementsByClass("textSpan"); 42 | } 43 | 44 | @Override 45 | protected List getInfoDataByElements(Elements elements) { 46 | List infoList = new ArrayList<>(); 47 | for (Element element : elements) { 48 | StringBuilder infoUrl = new StringBuilder(); 49 | infoUrl.append(this.prefix); 50 | infoUrl.append(element.getElementsByTag("a").get(0).attr("href")); 51 | String infoTitle = element.getElementsByTag("a").get(0).text(); 52 | infoList.add(new Info(infoTitle, infoUrl.toString())); 53 | } 54 | return infoList; 55 | } 56 | } 57 | -------------------------------------------------------------------------------- /src/main/java/com/taogen/hotcrawler/commons/crawler/impl/slack/JiandanHotProcessor.java: -------------------------------------------------------------------------------- 1 | package com.taogen.hotcrawler.commons.crawler.impl.slack; 2 | 3 | import com.taogen.hotcrawler.commons.config.SiteProperties; 4 | import com.taogen.hotcrawler.commons.constant.RequestMethod; 5 | import com.taogen.hotcrawler.commons.crawler.SimpleDocumentHotProcessor; 6 | import com.taogen.hotcrawler.commons.entity.Info; 7 | import org.jsoup.nodes.Element; 8 | import org.slf4j.LoggerFactory; 9 | import org.springframework.beans.factory.annotation.Autowired; 10 | import org.springframework.context.ApplicationContext; 11 | import org.springframework.stereotype.Component; 12 | 13 | import javax.annotation.PostConstruct; 14 | 15 | @Component("JiandanHotProcessor") 16 | public class JiandanHotProcessor extends SimpleDocumentHotProcessor 17 | { 18 | public static final 
String ITEM_KEY = "indexs"; 19 | 20 | @Autowired 21 | private SiteProperties siteProperties; 22 | 23 | @Autowired 24 | private ApplicationContext context; 25 | 26 | @Override 27 | @PostConstruct 28 | protected void initialize(){ 29 | RequestMethod requestMethod = RequestMethod.GET; 30 | setFieldsByProperties(siteProperties, requestMethod, generateHeader(),generateRequestBody()); 31 | injectBeansByContext(context); 32 | setLog(LoggerFactory.getLogger(getClass())); 33 | this.elementClass = ITEM_KEY; 34 | } 35 | 36 | @Override 37 | protected Info getInfoByElement(Element element) { 38 | Element item = element.getElementsByTag("a").get(0); 39 | String infoTitle = item.html(); 40 | StringBuilder infoUrl = new StringBuilder(); 41 | infoUrl.append(item.attr("href")); 42 | Info info = new Info(); 43 | info.setTitle(infoTitle); 44 | info.setUrl(infoUrl.toString()); 45 | return info; 46 | } 47 | } 48 | -------------------------------------------------------------------------------- /src/main/java/com/taogen/hotcrawler/commons/crawler/impl/slack/SspaiHotProcessor.java: -------------------------------------------------------------------------------- 1 | package com.taogen.hotcrawler.commons.crawler.impl.slack; 2 | 3 | import com.taogen.hotcrawler.commons.config.SiteProperties; 4 | import com.taogen.hotcrawler.commons.constant.RequestMethod; 5 | import com.taogen.hotcrawler.commons.crawler.SimpleDocumentHotProcessor; 6 | import com.taogen.hotcrawler.commons.entity.Info; 7 | import org.jsoup.nodes.Element; 8 | import org.jsoup.select.Elements; 9 | import org.slf4j.LoggerFactory; 10 | import org.springframework.beans.factory.annotation.Autowired; 11 | import org.springframework.context.ApplicationContext; 12 | import org.springframework.stereotype.Component; 13 | 14 | import javax.annotation.PostConstruct; 15 | 16 | @Component 17 | public class SspaiHotProcessor extends SimpleDocumentHotProcessor { 18 | public static final String ITEM_KEY = "articleCard"; 19 | 20 | @Autowired 21 
| private SiteProperties siteProperties; 22 | 23 | @Autowired 24 | private ApplicationContext context; 25 | 26 | @Override 27 | @PostConstruct 28 | protected void initialize(){ 29 | RequestMethod requestMethod = RequestMethod.GET; 30 | setFieldsByProperties(siteProperties, requestMethod, generateHeader(),generateRequestBody()); 31 | injectBeansByContext(context); 32 | setLog(LoggerFactory.getLogger(getClass())); 33 | this.elementClass = ITEM_KEY; 34 | } 35 | 36 | @Override 37 | protected Info getInfoByElement(Element element) { 38 | Elements paiElements = element.getElementsByAttributeValueContaining("class", "pai_title"); 39 | StringBuilder infoUrl = new StringBuilder(); 40 | if (paiElements != null && ! paiElements.isEmpty()){ 41 | infoUrl.append(this.prefix); 42 | infoUrl.append(paiElements.get(0).getElementsByTag("a").get(0).attr("href")); 43 | Elements infoTitleElements = paiElements.get(0).getElementsByClass("time").get(0).getElementsByTag("div"); 44 | StringBuilder infoTitle = new StringBuilder(); 45 | infoTitle.append("派日报"); 46 | infoTitle.append(" "); 47 | infoTitle.append(infoTitleElements.get(0).text()); 48 | return new Info(infoTitle.toString(), infoUrl.toString()); 49 | }else{ 50 | element = element.getElementsByClass("card_content").get(0); 51 | infoUrl.append(this.prefix); 52 | infoUrl.append(element.getElementsByTag("a").get(0).attr("href")); 53 | String infoTitle = element.getElementsByClass("title").get(0).html(); 54 | return new Info(infoTitle, infoUrl.toString()); 55 | } 56 | 57 | } 58 | } 59 | -------------------------------------------------------------------------------- /src/main/java/com/taogen/hotcrawler/commons/crawler/impl/slack/TianyaHotProcessor.java: -------------------------------------------------------------------------------- 1 | package com.taogen.hotcrawler.commons.crawler.impl.slack; 2 | 3 | import com.taogen.hotcrawler.commons.config.SiteProperties; 4 | import com.taogen.hotcrawler.commons.constant.RequestMethod; 5 | import 
com.taogen.hotcrawler.commons.crawler.SimpleDocumentHotProcessor; 6 | import com.taogen.hotcrawler.commons.entity.Info; 7 | import org.jsoup.nodes.Element; 8 | import org.slf4j.LoggerFactory; 9 | import org.springframework.beans.factory.annotation.Autowired; 10 | import org.springframework.context.ApplicationContext; 11 | import org.springframework.stereotype.Component; 12 | 13 | import javax.annotation.PostConstruct; 14 | 15 | @Component("TianyaHotProcessor") 16 | public class TianyaHotProcessor extends SimpleDocumentHotProcessor 17 | { 18 | public static final String ITEM_KEY = "td-title"; 19 | 20 | @Autowired 21 | private SiteProperties siteProperties; 22 | 23 | @Autowired 24 | private ApplicationContext context; 25 | 26 | @Override 27 | @PostConstruct 28 | protected void initialize(){ 29 | RequestMethod requestMethod = RequestMethod.GET; 30 | setFieldsByProperties(siteProperties, requestMethod, generateHeader(),generateRequestBody()); 31 | injectBeansByContext(context); 32 | setLog(LoggerFactory.getLogger(getClass())); 33 | this.elementClass = ITEM_KEY; 34 | } 35 | 36 | @Override 37 | protected Info getInfoByElement(Element element) { 38 | element = element.getElementsByTag("a").get(0); 39 | StringBuilder infoUrl = new StringBuilder(); 40 | infoUrl.append(this.prefix); 41 | infoUrl.append(element.attr("href")); 42 | String infoTitle = element.html(); 43 | Info info = new Info(); 44 | info.setTitle(infoTitle); 45 | info.setUrl(infoUrl.toString()); 46 | return info; 47 | } 48 | } 49 | -------------------------------------------------------------------------------- /src/main/java/com/taogen/hotcrawler/commons/crawler/impl/slack/V2exHotProcessor.java: -------------------------------------------------------------------------------- 1 | package com.taogen.hotcrawler.commons.crawler.impl.slack; 2 | 3 | import com.taogen.hotcrawler.commons.config.SiteProperties; 4 | import com.taogen.hotcrawler.commons.constant.RequestMethod; 5 | import 
com.taogen.hotcrawler.commons.crawler.SimpleDocumentHotProcessor; 6 | import com.taogen.hotcrawler.commons.entity.Info; 7 | import org.jsoup.nodes.Element; 8 | import org.jsoup.select.Elements; 9 | import org.slf4j.LoggerFactory; 10 | import org.springframework.beans.factory.annotation.Autowired; 11 | import org.springframework.context.ApplicationContext; 12 | import org.springframework.stereotype.Component; 13 | 14 | import javax.annotation.PostConstruct; 15 | 16 | @Component("V2exHotProcessor") 17 | public class V2exHotProcessor extends SimpleDocumentHotProcessor 18 | { 19 | private static final String ITEM_KEY = "item_title"; 20 | 21 | @Autowired 22 | private SiteProperties siteProperties; 23 | 24 | @Autowired 25 | private ApplicationContext context; 26 | 27 | @Override 28 | @PostConstruct 29 | protected void initialize(){ 30 | RequestMethod requestMethod = RequestMethod.GET; 31 | setFieldsByProperties(siteProperties, requestMethod, generateHeader(),generateRequestBody()); 32 | injectBeansByContext(context); 33 | setLog(LoggerFactory.getLogger(getClass())); 34 | this.elementClass = ITEM_KEY; 35 | } 36 | 37 | @Override 38 | protected Info getInfoByElement(Element element) { 39 | Elements elements1 = element.getElementsByTag("a"); 40 | String infoTitle = elements1.html(); 41 | StringBuilder infoUrl = new StringBuilder(); 42 | infoUrl.append(this.prefix); 43 | infoUrl.append(elements1.attr("href")); 44 | String url = infoUrl.substring(0, infoUrl.indexOf("#")); 45 | Info info = new Info(); 46 | info.setTitle(infoTitle); 47 | info.setUrl(url); 48 | return info; 49 | } 50 | } 51 | -------------------------------------------------------------------------------- /src/main/java/com/taogen/hotcrawler/commons/crawler/impl/slack/WeiboHotProcessor.java: -------------------------------------------------------------------------------- 1 | package com.taogen.hotcrawler.commons.crawler.impl.slack; 2 | 3 | import com.taogen.hotcrawler.commons.config.SiteProperties; 4 | import 
com.taogen.hotcrawler.commons.constant.RequestMethod; 5 | import com.taogen.hotcrawler.commons.crawler.DocumentHotProcessor; 6 | import com.taogen.hotcrawler.commons.entity.Info; 7 | import org.jsoup.nodes.Document; 8 | import org.jsoup.nodes.Element; 9 | import org.jsoup.select.Elements; 10 | import org.slf4j.LoggerFactory; 11 | import org.springframework.beans.factory.annotation.Autowired; 12 | import org.springframework.context.ApplicationContext; 13 | import org.springframework.stereotype.Component; 14 | 15 | import javax.annotation.PostConstruct; 16 | import java.util.ArrayList; 17 | import java.util.List; 18 | 19 | @Component("WeiboHotProcessor") 20 | public class WeiboHotProcessor extends DocumentHotProcessor 21 | { 22 | public static final String ITEM_KEY = "tr"; 23 | 24 | @Autowired 25 | private SiteProperties siteProperties; 26 | 27 | @Autowired 28 | private ApplicationContext context; 29 | 30 | @Override 31 | @PostConstruct 32 | protected void initialize(){ 33 | RequestMethod requestMethod = RequestMethod.GET; 34 | setFieldsByProperties(siteProperties, requestMethod, generateHeader(),generateRequestBody()); 35 | injectBeansByContext(context); 36 | setLog(LoggerFactory.getLogger(getClass())); 37 | } 38 | 39 | @Override 40 | protected Elements getElements(Document document) { 41 | return document.getElementsByTag(ITEM_KEY); 42 | } 43 | 44 | @Override 45 | protected List getInfoDataByElements(Elements elements) { 46 | List list = new ArrayList<>(); 47 | if (elements != null) { 48 | // remove two tr elements 49 | elements.remove(0); 50 | elements.remove(0); 51 | int i = 0; 52 | for (Element element : elements) { 53 | Element itemElement = element.getElementsByClass("td-02").get(0).getElementsByTag("a").get(0); 54 | String id = String.valueOf(++i); 55 | String infoUrl = itemElement.attr("href"); 56 | String infoTitle = itemElement.html(); 57 | infoUrl = this.prefix + infoUrl; 58 | list.add(new Info(id, infoTitle, infoUrl)); 59 | } 60 | } 61 | return list; 62 
| } 63 | } 64 | -------------------------------------------------------------------------------- /src/main/java/com/taogen/hotcrawler/commons/crawler/impl/slack/ZhihuHotProcessor.java: -------------------------------------------------------------------------------- 1 | package com.taogen.hotcrawler.commons.crawler.impl.slack; 2 | 3 | import com.jayway.jsonpath.JsonPath; 4 | import com.taogen.hotcrawler.commons.config.SiteProperties; 5 | import com.taogen.hotcrawler.commons.constant.RequestMethod; 6 | import com.taogen.hotcrawler.commons.crawler.APIHotProcessor; 7 | import com.taogen.hotcrawler.commons.entity.Info; 8 | import org.slf4j.LoggerFactory; 9 | import org.springframework.beans.factory.annotation.Autowired; 10 | import org.springframework.context.ApplicationContext; 11 | import org.springframework.stereotype.Component; 12 | 13 | import javax.annotation.PostConstruct; 14 | import java.util.ArrayList; 15 | import java.util.List; 16 | 17 | @Component("ZhihuHotProcessor") 18 | public class ZhihuHotProcessor extends APIHotProcessor 19 | { 20 | @Autowired 21 | private SiteProperties siteProperties; 22 | 23 | @Autowired 24 | private ApplicationContext context; 25 | 26 | @Override 27 | @PostConstruct 28 | protected void initialize(){ 29 | RequestMethod requestMethod = RequestMethod.GET; 30 | setFieldsByProperties(siteProperties, requestMethod, generateHeader(),generateRequestBody()); 31 | injectBeansByContext(context); 32 | setLog(LoggerFactory.getLogger(getClass())); 33 | } 34 | 35 | @Override 36 | public List getInfoDataByJson(String json) { 37 | List list = new ArrayList<>(); 38 | if (json == null) 39 | { 40 | return list; 41 | } 42 | 43 | List titles = JsonPath.read(json, "$.data.[*].target.title"); 44 | List urls = JsonPath.read(json, "$.data.[*].target.url"); 45 | 46 | for (int i = 0; i < urls.size(); i++) 47 | { 48 | urls.set(i, urls.get(i).replace("https://api.zhihu.com/questions", this.prefix + "/question")); 49 | } 50 | 51 | for (int i = 1; i < 
titles.size(); i++) 52 | { 53 | list.add(new Info(String.valueOf(i), titles.get(i), urls.get(i))); 54 | } 55 | return list; 56 | } 57 | } 58 | -------------------------------------------------------------------------------- /src/main/java/com/taogen/hotcrawler/commons/crawler/impl/stream/BilibiliHotProcessor.java: -------------------------------------------------------------------------------- 1 | package com.taogen.hotcrawler.commons.crawler.impl.stream;//package com.taogen.hotcrawler.commons.crawler.impl.stream; 2 | // 3 | //import com.taogen.hotcrawler.commons.crawler.HotProcessor; 4 | //import com.taogen.hotcrawler.commons.entity.Info; 5 | //import com.taogen.hotcrawler.commons.util.OsUtils; 6 | //import org.jsoup.nodes.Document; 7 | //import org.jsoup.nodes.Element; 8 | //import org.jsoup.select.Elements; 9 | //import org.slf4j.Logger; 10 | //import org.slf4j.LoggerFactory; 11 | //import org.springframework.beans.factory.annotation.Autowired; 12 | //import org.springframework.beans.factory.annotation.Value; 13 | //import org.springframework.stereotype.Component; 14 | // 15 | //import java.util.ArrayList; 16 | //import java.util.List; 17 | // 18 | //@Component("BilibiliHotProcessor") 19 | //public class BilibiliHotProcessor implements HotProcessor 20 | //{ 21 | // private static final Logger log = LoggerFactory.getLogger(BilibiliHotProcessor.class); 22 | // 23 | // @Autowired 24 | // private BaseHotProcessor baseHotProcessor; 25 | // 26 | // @Value("${crawler.chromeDriver.enable}") 27 | // private Boolean enable; 28 | // 29 | // @Value("${crawler.chromeDriver.linuxPath}") 30 | // private String linuxPath; 31 | // 32 | // @Value("${crawler.chromeDriver.winPath}") 33 | // private String winPath; 34 | // 35 | // public static final String DOMAIN = "https://www.bilibili.com"; 36 | // public static final String HOT_PAGE_URL = "https://www.bilibili.com/v/kichiku/guide/#/all/click/0/1/"; 37 | //// public static final String HOT_PAGE_URL = 
"https://www.bilibili.com/v/kichiku/guide/"; 38 | // public static final String ITEM_KEY = "l-item"; 39 | // 40 | // 41 | // @Override 42 | // public List crawlHotList() 43 | // { 44 | // List list = new ArrayList<>(); 45 | // if (! enable) 46 | // { 47 | // return list; 48 | // } 49 | // String osType = OsUtils.getOsType(); 50 | // log.info("os type is {}", osType); 51 | //// if (OsUtils.OS_TYPE_WINDOWS.equals(osType)) 52 | //// { 53 | //// System.setProperty("webdriver.chrome.driver", winPath); 54 | //// log.info("driver path: {}", winPath); 55 | //// } 56 | //// else 57 | //// { 58 | //// System.setProperty("webdriver.chrome.driver", linuxPath); 59 | //// log.info("driver path: {}", linuxPath); 60 | //// } 61 | // 62 | // // doc 63 | // Document doc = baseHotProcessor.getDocByWebDriver(HOT_PAGE_URL, log); 64 | // if (doc == null) 65 | // { 66 | // return list; 67 | // } 68 | // 69 | // // elements 70 | // Elements elements = doc.getElementsByClass(ITEM_KEY); 71 | // log.debug("elements size is {}", elements.size()); 72 | // 73 | // int i = 0; 74 | // for (Element element : elements) 75 | // { 76 | // Element itemElement = null; 77 | // try 78 | // { 79 | // itemElement = element.getElementsByClass("title").get(0); 80 | // } 81 | // catch (NullPointerException | IndexOutOfBoundsException e) 82 | // { 83 | // log.error("Can't found item element by attribute!", e); 84 | // continue; 85 | // } 86 | // // id 87 | // String id = String.valueOf(++i); 88 | // 89 | // // url 90 | // String infoUrl = itemElement.attr("href").substring(2); 91 | // infoUrl = "https://" + infoUrl; 92 | // 93 | // // title 94 | // String infoTitle = itemElement.html(); 95 | // 96 | // list.add(new Info(id, infoTitle, infoUrl)); 97 | // } 98 | // 99 | // return baseHotProcessor.handleData(list); 100 | // } 101 | //} 102 | -------------------------------------------------------------------------------- /src/main/java/com/taogen/hotcrawler/commons/crawler/impl/stream/CloudmusicHotProcessor.java: 
-------------------------------------------------------------------------------- 1 | package com.taogen.hotcrawler.commons.crawler.impl.stream; 2 | 3 | import com.taogen.hotcrawler.commons.config.SiteProperties; 4 | import com.taogen.hotcrawler.commons.constant.RequestMethod; 5 | import com.taogen.hotcrawler.commons.crawler.DocumentHotProcessor; 6 | import com.taogen.hotcrawler.commons.entity.Info; 7 | import org.jsoup.nodes.Document; 8 | import org.jsoup.nodes.Element; 9 | import org.jsoup.select.Elements; 10 | import org.slf4j.LoggerFactory; 11 | import org.springframework.beans.factory.annotation.Autowired; 12 | import org.springframework.context.ApplicationContext; 13 | import org.springframework.stereotype.Component; 14 | 15 | import javax.annotation.PostConstruct; 16 | import java.util.ArrayList; 17 | import java.util.HashMap; 18 | import java.util.List; 19 | import java.util.Map; 20 | 21 | @Component("CloudmusicHotProcessor") 22 | public class CloudmusicHotProcessor extends DocumentHotProcessor 23 | { 24 | public static final String ITEM_KEY = "song-list-pre-cache"; 25 | 26 | @Autowired 27 | private SiteProperties siteProperties; 28 | 29 | @Autowired 30 | private ApplicationContext context; 31 | 32 | @Override 33 | @PostConstruct 34 | protected void initialize(){ 35 | RequestMethod requestMethod = RequestMethod.GET; 36 | setFieldsByProperties(siteProperties, requestMethod, generateHeader(),generateRequestBody()); 37 | injectBeansByContext(context); 38 | setLog(LoggerFactory.getLogger(getClass())); 39 | } 40 | 41 | @Override 42 | protected Elements getElements(Document document) { 43 | Elements elements = null; 44 | Element contentElement = document.getElementById(ITEM_KEY); 45 | if (contentElement != null) 46 | { 47 | elements = contentElement.getElementsByTag("li"); 48 | } 49 | return elements; 50 | } 51 | 52 | @Override 53 | protected List getInfoDataByElements(Elements elements) { 54 | List list = new ArrayList<>(); 55 | if (elements != null) { 56 | int i 
= 0; 57 | for (Element element : elements) { 58 | Element itemElement = null; 59 | try { 60 | itemElement = element.getElementsByTag("a").get(0); 61 | String id = String.valueOf(++i); 62 | StringBuilder infoUrl = new StringBuilder(); 63 | infoUrl.append(this.prefix); 64 | infoUrl.append("#"); 65 | infoUrl.append(itemElement.attr("href")); 66 | String infoTitle = itemElement.html(); 67 | list.add(new Info(id, infoTitle, infoUrl.toString())); 68 | } catch (NullPointerException | IndexOutOfBoundsException e) { 69 | log.error("Can't found item element by attribute!", e); 70 | } 71 | } 72 | } 73 | return list; 74 | } 75 | 76 | @Override 77 | protected Map generateHeader() { 78 | Map header = new HashMap<>(); 79 | header.put("Host", "music.163.com"); 80 | header.put("Referer", "https://music.163.com/"); 81 | return header; 82 | } 83 | } 84 | -------------------------------------------------------------------------------- /src/main/java/com/taogen/hotcrawler/commons/crawler/impl/technique/AliyunHotProcessor.java: -------------------------------------------------------------------------------- 1 | package com.taogen.hotcrawler.commons.crawler.impl.technique;//package com.taogen.hotcrawler.commons.crawler.impl.technique; 2 | // 3 | //import com.taogen.hotcrawler.commons.crawler.HotProcessor; 4 | //import com.taogen.hotcrawler.commons.entity.Info; 5 | //import org.jsoup.nodes.Document; 6 | //import org.jsoup.nodes.Element; 7 | //import org.jsoup.select.Elements; 8 | //import org.slf4j.Logger; 9 | //import org.slf4j.LoggerFactory; 10 | //import org.springframework.beans.factory.annotation.Autowired; 11 | //import org.springframework.stereotype.Component; 12 | // 13 | //import java.util.ArrayList; 14 | //import java.util.List; 15 | // 16 | //@Component("AliyunHotProcessor") 17 | //public class AliyunHotProcessor implements HotProcessor 18 | //{ 19 | // private static final Logger log = LoggerFactory.getLogger(AliyunHotProcessor.class); 20 | // 21 | // @Autowired 22 | // 
private BaseHotProcessor baseHotProcessor; 23 | // 24 | // public static final String DOMAIN = "https://yq.aliyun.com"; 25 | // public static final String HOT_PAGE_URL = "https://yq.aliyun.com"; 26 | // public static final String ITEM_KEY = "normal-item"; 27 | // 28 | // @Override 29 | // public List crawlHotList() 30 | // { 31 | // List list = new ArrayList<>(); 32 | // 33 | // // document 34 | // Document doc = baseHotProcessor.getDoc(HOT_PAGE_URL, null, log); 35 | // if (doc == null) 36 | // { 37 | // return list; 38 | // } 39 | // 40 | // // elements 41 | // Elements elements = doc.getElementsByClass(ITEM_KEY); 42 | // 43 | // int i = 0; 44 | // for (Element element : elements) 45 | // { 46 | // try 47 | // { 48 | // // title 49 | // String infoTitle = element.getElementsByTag("h3").get(0).html(); 50 | // String removeTag = ""; 51 | // int index = infoTitle.indexOf(removeTag); 52 | // if (index > 0) 53 | // { 54 | // infoTitle = infoTitle.substring(index + removeTag.length()); 55 | // } 56 | // // url 57 | // StringBuilder infoUrl = new StringBuilder(); 58 | // infoUrl.append(DOMAIN); 59 | // infoUrl.append(element.getElementsByClass("alllink").get(0).attr("href")); 60 | // String id = String.valueOf(++i); 61 | // list.add(new Info(id, infoTitle, infoUrl.toString())); 62 | // } 63 | // catch(IndexOutOfBoundsException e) 64 | // { 65 | // log.error("Can't find attribute!", e); 66 | // } 67 | // } 68 | // 69 | // return baseHotProcessor.handleData(list); 70 | // } 71 | //} 72 | -------------------------------------------------------------------------------- /src/main/java/com/taogen/hotcrawler/commons/crawler/impl/technique/DeveloperHotProcessor.java: -------------------------------------------------------------------------------- 1 | package com.taogen.hotcrawler.commons.crawler.impl.technique; 2 | 3 | import com.taogen.hotcrawler.commons.config.SiteProperties; 4 | import com.taogen.hotcrawler.commons.constant.RequestMethod; 5 | import 
com.taogen.hotcrawler.commons.crawler.SimpleDocumentHotProcessor; 6 | import com.taogen.hotcrawler.commons.entity.Info; 7 | import org.jsoup.nodes.Element; 8 | import org.slf4j.LoggerFactory; 9 | import org.springframework.beans.factory.annotation.Autowired; 10 | import org.springframework.context.ApplicationContext; 11 | import org.springframework.stereotype.Component; 12 | 13 | import javax.annotation.PostConstruct; 14 | 15 | @Component("DeveloperHotProcessor") 16 | public class DeveloperHotProcessor extends SimpleDocumentHotProcessor 17 | { 18 | public static final String ITEM_KEY = "post"; 19 | 20 | @Autowired 21 | private SiteProperties siteProperties; 22 | 23 | @Autowired 24 | private ApplicationContext context; 25 | 26 | @Override 27 | @PostConstruct 28 | protected void initialize(){ 29 | RequestMethod requestMethod = RequestMethod.GET; 30 | setFieldsByProperties(siteProperties, requestMethod, generateHeader(),generateRequestBody()); 31 | injectBeansByContext(context); 32 | setLog(LoggerFactory.getLogger(getClass())); 33 | this.elementClass = ITEM_KEY; 34 | } 35 | 36 | @Override 37 | protected Info getInfoByElement(Element element) { 38 | element = element.getElementsByClass("title").get(0).getElementsByTag("a").get(0); 39 | String infoTitle = element.html(); 40 | StringBuilder infoUrl = new StringBuilder(); 41 | infoUrl.append(this.prefix); 42 | infoUrl.append(element.attr("href")); 43 | Info info = new Info(); 44 | info.setTitle(infoTitle); 45 | info.setUrl(infoUrl.toString()); 46 | return info; 47 | } 48 | } 49 | -------------------------------------------------------------------------------- /src/main/java/com/taogen/hotcrawler/commons/crawler/impl/technique/DzoneHotProcessor.java: -------------------------------------------------------------------------------- 1 | package com.taogen.hotcrawler.commons.crawler.impl.technique; 2 | 3 | import com.jayway.jsonpath.JsonPath; 4 | import com.taogen.hotcrawler.commons.config.SiteProperties; 5 | import 
com.taogen.hotcrawler.commons.constant.RequestMethod; 6 | import com.taogen.hotcrawler.commons.crawler.APIHotProcessor; 7 | import com.taogen.hotcrawler.commons.entity.Info; 8 | import org.slf4j.LoggerFactory; 9 | import org.springframework.beans.factory.annotation.Autowired; 10 | import org.springframework.context.ApplicationContext; 11 | import org.springframework.stereotype.Component; 12 | 13 | import javax.annotation.PostConstruct; 14 | import java.util.ArrayList; 15 | import java.util.List; 16 | 17 | @Component("DzoneHotProcessor") 18 | public class DzoneHotProcessor extends APIHotProcessor 19 | { 20 | @Autowired 21 | private SiteProperties siteProperties; 22 | 23 | @Autowired 24 | private ApplicationContext context; 25 | 26 | @Override 27 | @PostConstruct 28 | protected void initialize(){ 29 | RequestMethod requestMethod = RequestMethod.GET; 30 | setFieldsByProperties(siteProperties, requestMethod, generateHeader(),generateRequestBody()); 31 | injectBeansByContext(context); 32 | setLog(LoggerFactory.getLogger(getClass())); 33 | } 34 | 35 | @Override 36 | protected List getInfoDataByJson(String json) { 37 | List list = new ArrayList<>(); 38 | if (json != null && json.length() > 0) { 39 | List titles = JsonPath.read(json, "$.result.data.nodes.[*].title"); 40 | List urls = JsonPath.read(json, "$.result.data.nodes.[*].articleLink"); 41 | urls = urlsAddPrefix(this.prefix, urls); 42 | List indexInfoList = getInfoListByTitlesAndUrls(titles, urls); 43 | list.addAll(indexInfoList); 44 | log.debug("index infoList size is {}", indexInfoList.size()); 45 | } 46 | return list; 47 | } 48 | } 49 | -------------------------------------------------------------------------------- /src/main/java/com/taogen/hotcrawler/commons/crawler/impl/technique/GithubHotProcessor.java: -------------------------------------------------------------------------------- 1 | package com.taogen.hotcrawler.commons.crawler.impl.technique; 2 | 3 | import 
com.taogen.hotcrawler.commons.config.SiteProperties; 4 | import com.taogen.hotcrawler.commons.constant.RequestMethod; 5 | import com.taogen.hotcrawler.commons.crawler.SimpleDocumentHotProcessor; 6 | import com.taogen.hotcrawler.commons.entity.Info; 7 | import org.jsoup.nodes.Element; 8 | import org.slf4j.LoggerFactory; 9 | import org.springframework.beans.factory.annotation.Autowired; 10 | import org.springframework.context.ApplicationContext; 11 | import org.springframework.stereotype.Component; 12 | 13 | import javax.annotation.PostConstruct; 14 | 15 | @Component("GithubHotProcessor") 16 | public class GithubHotProcessor extends SimpleDocumentHotProcessor 17 | { 18 | public static final String ITEM_KEY = "Box-row"; 19 | 20 | @Autowired 21 | private SiteProperties siteProperties; 22 | 23 | @Autowired 24 | private ApplicationContext context; 25 | 26 | @Override 27 | @PostConstruct 28 | protected void initialize(){ 29 | RequestMethod requestMethod = RequestMethod.GET; 30 | setFieldsByProperties(siteProperties, requestMethod, generateHeader(),generateRequestBody()); 31 | injectBeansByContext(context); 32 | setLog(LoggerFactory.getLogger(getClass())); 33 | this.elementClass = ITEM_KEY; 34 | } 35 | 36 | @Override 37 | protected Info getInfoByElement(Element element) { 38 | Element urlElement = element.getElementsByTag("h1").get(0).getElementsByTag("a").get(0); 39 | Element descElement = null; 40 | if (! element.getElementsByTag("p").isEmpty()) { 41 | descElement = element.getElementsByTag("p").get(0); 42 | } 43 | String repositoryName = urlElement.attr("href"); 44 | // Title 45 | StringBuilder infoTitle = new StringBuilder(); 46 | infoTitle.append(repositoryName.substring(repositoryName.indexOf('/', 1) + 1)); 47 | infoTitle.append(". "); 48 | String desc = descElement == null ? 
"" : descElement.html(); 49 | infoTitle.append(desc); 50 | // Url 51 | StringBuilder infoUrl = new StringBuilder(); 52 | infoUrl.append(this.prefix); 53 | infoUrl.append(repositoryName); 54 | return new Info(infoTitle.toString(), infoUrl.toString()); 55 | } 56 | } 57 | -------------------------------------------------------------------------------- /src/main/java/com/taogen/hotcrawler/commons/crawler/impl/technique/InfoqcomHotProcessor.java: -------------------------------------------------------------------------------- 1 | package com.taogen.hotcrawler.commons.crawler.impl.technique; 2 | 3 | import com.taogen.hotcrawler.commons.config.SiteProperties; 4 | import com.taogen.hotcrawler.commons.constant.RequestMethod; 5 | import com.taogen.hotcrawler.commons.crawler.SimpleDocumentHotProcessor; 6 | import com.taogen.hotcrawler.commons.entity.Info; 7 | import org.jsoup.nodes.Element; 8 | import org.slf4j.LoggerFactory; 9 | import org.springframework.beans.factory.annotation.Autowired; 10 | import org.springframework.context.ApplicationContext; 11 | import org.springframework.stereotype.Component; 12 | 13 | import javax.annotation.PostConstruct; 14 | 15 | @Component("InfoqcomHotProcessor") 16 | public class InfoqcomHotProcessor extends SimpleDocumentHotProcessor 17 | { 18 | public static final String ITEM_KEY = "card__content"; 19 | 20 | @Autowired 21 | private SiteProperties siteProperties; 22 | 23 | @Autowired 24 | private ApplicationContext context; 25 | 26 | @Override 27 | @PostConstruct 28 | protected void initialize(){ 29 | RequestMethod requestMethod = RequestMethod.GET; 30 | setFieldsByProperties(siteProperties, requestMethod, generateHeader(),generateRequestBody()); 31 | injectBeansByContext(context); 32 | setLog(LoggerFactory.getLogger(getClass())); 33 | this.elementClass = ITEM_KEY; 34 | } 35 | 36 | @Override 37 | protected Info getInfoByElement(Element element) { 38 | element = element.getElementsByClass("card__title").get(0).getElementsByTag("a").get(0); 39 
| String infoTitle = element.html(); 40 | StringBuilder infoUrl = new StringBuilder(); 41 | infoUrl.append(this.prefix); 42 | infoUrl.append(element.attr("href")); 43 | Info info = new Info(); 44 | info.setTitle(infoTitle); 45 | info.setUrl(infoUrl.toString()); 46 | return info; 47 | } 48 | } 49 | -------------------------------------------------------------------------------- /src/main/java/com/taogen/hotcrawler/commons/crawler/impl/technique/JAXenterHotProcessor.java: -------------------------------------------------------------------------------- 1 | package com.taogen.hotcrawler.commons.crawler.impl.technique; 2 | 3 | import com.taogen.hotcrawler.commons.config.SiteProperties; 4 | import com.taogen.hotcrawler.commons.constant.RequestMethod; 5 | import com.taogen.hotcrawler.commons.crawler.SimpleDocumentHotProcessor; 6 | import com.taogen.hotcrawler.commons.entity.Info; 7 | import org.jsoup.nodes.Element; 8 | import org.slf4j.LoggerFactory; 9 | import org.springframework.beans.factory.annotation.Autowired; 10 | import org.springframework.context.ApplicationContext; 11 | import org.springframework.stereotype.Component; 12 | 13 | import javax.annotation.PostConstruct; 14 | 15 | @Component("JAXenterHotProcessor") 16 | public class JAXenterHotProcessor extends SimpleDocumentHotProcessor { 17 | 18 | public static final String ITEM_KEY = "post"; 19 | 20 | @Autowired 21 | private SiteProperties siteProperties; 22 | 23 | @Autowired 24 | private ApplicationContext context; 25 | 26 | @Override 27 | @PostConstruct 28 | protected void initialize(){ 29 | RequestMethod requestMethod = RequestMethod.GET; 30 | setFieldsByProperties(siteProperties, requestMethod, generateHeader(),generateRequestBody()); 31 | injectBeansByContext(context); 32 | setLog(LoggerFactory.getLogger(getClass())); 33 | this.elementClass = ITEM_KEY; 34 | } 35 | 36 | @Override 37 | protected Info getInfoByElement(Element element) { 38 | element = 
element.getElementsByClass("title").get(0).getElementsByTag("a").get(0); 39 | Info info = new Info(); 40 | info.setTitle(element.html()); 41 | info.setUrl(element.attr("href")); 42 | return info; 43 | } 44 | } 45 | -------------------------------------------------------------------------------- /src/main/java/com/taogen/hotcrawler/commons/crawler/impl/technique/JavaWorldHotProcessor.java: -------------------------------------------------------------------------------- 1 | package com.taogen.hotcrawler.commons.crawler.impl.technique; 2 | 3 | 4 | import com.taogen.hotcrawler.commons.config.SiteProperties; 5 | import com.taogen.hotcrawler.commons.constant.RequestMethod; 6 | import com.taogen.hotcrawler.commons.crawler.SimpleDocumentHotProcessor; 7 | import com.taogen.hotcrawler.commons.entity.Info; 8 | import org.jsoup.nodes.Element; 9 | import org.slf4j.LoggerFactory; 10 | import org.springframework.beans.factory.annotation.Autowired; 11 | import org.springframework.context.ApplicationContext; 12 | import org.springframework.stereotype.Component; 13 | 14 | import javax.annotation.PostConstruct; 15 | 16 | @Component("JavaWorldHotProcessor") 17 | public class JavaWorldHotProcessor extends SimpleDocumentHotProcessor { 18 | public static final String ITEM_KEY = "post-cont"; 19 | 20 | @Autowired 21 | private SiteProperties siteProperties; 22 | 23 | @Autowired 24 | private ApplicationContext context; 25 | 26 | @Override 27 | @PostConstruct 28 | protected void initialize(){ 29 | RequestMethod requestMethod = RequestMethod.GET; 30 | setFieldsByProperties(siteProperties, requestMethod, generateHeader(),generateRequestBody()); 31 | injectBeansByContext(context); 32 | setLog(LoggerFactory.getLogger(getClass())); 33 | this.elementClass = ITEM_KEY; 34 | } 35 | 36 | @Override 37 | protected Info getInfoByElement(Element element) { 38 | element = element.getElementsByTag("a").get(0); 39 | String infoTitle = element.html(); 40 | StringBuilder infoUrl = new StringBuilder(); 41 | 
infoUrl.append(this.prefix); 42 | infoUrl.append(element.attr("href")); 43 | Info info = new Info(); 44 | info.setTitle(infoTitle); 45 | info.setUrl(infoUrl.toString()); 46 | return info; 47 | } 48 | } 49 | -------------------------------------------------------------------------------- /src/main/java/com/taogen/hotcrawler/commons/crawler/impl/technique/JuejinHotProcessor.java: -------------------------------------------------------------------------------- 1 | package com.taogen.hotcrawler.commons.crawler.impl.technique; 2 | 3 | import com.taogen.hotcrawler.commons.config.SiteProperties; 4 | import com.taogen.hotcrawler.commons.constant.RequestMethod; 5 | import com.taogen.hotcrawler.commons.crawler.SimpleAPIHotProcessor; 6 | import org.slf4j.LoggerFactory; 7 | import org.springframework.beans.factory.annotation.Autowired; 8 | import org.springframework.context.ApplicationContext; 9 | import org.springframework.stereotype.Component; 10 | 11 | import javax.annotation.PostConstruct; 12 | import java.util.Arrays; 13 | import java.util.HashMap; 14 | import java.util.Map; 15 | 16 | @Component("JuejinHotProcessor") 17 | public class JuejinHotProcessor extends SimpleAPIHotProcessor 18 | { 19 | 20 | @Autowired 21 | private SiteProperties siteProperties; 22 | 23 | @Autowired 24 | private ApplicationContext context; 25 | 26 | @Override 27 | @PostConstruct 28 | protected void initialize(){ 29 | RequestMethod requestMethod = RequestMethod.POST; 30 | setFieldsByProperties(siteProperties, requestMethod, generateHeader(),generateRequestBody()); 31 | injectBeansByContext(context); 32 | setLog(LoggerFactory.getLogger(getClass())); 33 | setTitleJsonPaths(Arrays.asList("$.data.articleFeed.items.edges.[*].node.title")); 34 | setUrlJsonPaths(Arrays.asList("$.data.articleFeed.items.edges.[*].node.originalUrl")); 35 | } 36 | 37 | @Override 38 | protected Map generateHeader() { 39 | Map header = new HashMap<>(); 40 | header.put("Content-Type", "application/json"); 41 | header.put("Host", 
"web-api.juejin.im"); 42 | header.put("Origin","https://juejin.im"); 43 | header.put("Referer", "https://juejin.im/?sort=popular"); 44 | header.put("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:69.0) Gecko/20100101 Firefox/69.0"); 45 | header.put("X-Agent", "Juejin/Web"); 46 | return header; 47 | } 48 | 49 | @Override 50 | protected String generateRequestBody() { 51 | return "{\"operationName\":\"\",\"query\":\"\",\"variables\":{\"first\":20,\"after\":\"\",\"order\":\"POPULAR\"},\"extensions\":{\"query\":{\"id\":\"21207e9ddb1de777adeaca7a2fb38030\"}}}"; 52 | } 53 | } 54 | -------------------------------------------------------------------------------- /src/main/java/com/taogen/hotcrawler/commons/crawler/impl/technique/SegmentFaultHotProcessor.java: -------------------------------------------------------------------------------- 1 | package com.taogen.hotcrawler.commons.crawler.impl.technique; 2 | 3 | import com.taogen.hotcrawler.commons.config.SiteProperties; 4 | import com.taogen.hotcrawler.commons.constant.RequestMethod; 5 | import com.taogen.hotcrawler.commons.crawler.SimpleDocumentHotProcessor; 6 | import com.taogen.hotcrawler.commons.entity.Info; 7 | import org.jsoup.nodes.Element; 8 | import org.slf4j.LoggerFactory; 9 | import org.springframework.beans.factory.annotation.Autowired; 10 | import org.springframework.context.ApplicationContext; 11 | import org.springframework.stereotype.Component; 12 | 13 | import javax.annotation.PostConstruct; 14 | 15 | @Component("SegmentFaultHotProcessor") 16 | public class SegmentFaultHotProcessor extends SimpleDocumentHotProcessor 17 | { 18 | public static final String ITEM_KEY = "news-item"; 19 | 20 | @Autowired 21 | private SiteProperties siteProperties; 22 | 23 | @Autowired 24 | private ApplicationContext context; 25 | 26 | @Override 27 | @PostConstruct 28 | protected void initialize(){ 29 | RequestMethod requestMethod = RequestMethod.GET; 30 | setFieldsByProperties(siteProperties, requestMethod, 
generateHeader(),generateRequestBody()); 31 | injectBeansByContext(context); 32 | setLog(LoggerFactory.getLogger(getClass())); 33 | this.elementClass = ITEM_KEY; 34 | } 35 | 36 | @Override 37 | protected Info getInfoByElement(Element element) { 38 | String infoTitle = element.getElementsByClass("news__item-title").get(0).html(); 39 | StringBuilder infoUrl = new StringBuilder(); 40 | infoUrl.append(this.prefix); 41 | infoUrl.append(element.getElementsByTag("a").get(1).attr("href")); 42 | Info info = new Info(); 43 | info.setTitle(infoTitle); 44 | info.setUrl(infoUrl.toString()); 45 | return info; 46 | } 47 | } 48 | -------------------------------------------------------------------------------- /src/main/java/com/taogen/hotcrawler/commons/entity/Info.java: -------------------------------------------------------------------------------- 1 | package com.taogen.hotcrawler.commons.entity; 2 | 3 | import lombok.Data; 4 | 5 | import java.io.Serializable; 6 | import java.util.HashMap; 7 | import java.util.Map; 8 | 9 | @Data 10 | public class Info implements Serializable, Comparable 11 | { 12 | private static final long serialVersionUID = -3946734305303957850L; 13 | private String id; 14 | private String title; 15 | private String url; 16 | 17 | public Info() {} 18 | 19 | public Info(String id, String title, String url) 20 | { 21 | this.id = id; 22 | this.title = title; 23 | this.url = url; 24 | } 25 | 26 | public Info(String title, String url) 27 | { 28 | this.title = title; 29 | this.url = url; 30 | } 31 | 32 | @Override 33 | public int compareTo(Info info) { 34 | return Integer.valueOf(this.id).compareTo(Integer.valueOf(info.id)); 35 | } 36 | 37 | 38 | /** 39 | * @SuppressWarnings("unused") 40 | */ 41 | public Map getMap() 42 | { 43 | Map map = new HashMap<>(); 44 | map.put("id", this.id); 45 | map.put("title", this.title); 46 | map.put("url", this.url); 47 | return map; 48 | } 49 | } 50 | -------------------------------------------------------------------------------- 
/**
 * Visit statistics of one visitor on one day.
 *
 * <p>InfoRepository stores one instance per visitor in a Redis hash whose key is built from
 * {@code date} and whose field is {@code ip}. Accessors, {@code equals}, {@code hashCode}
 * and {@code toString} are generated by Lombok's {@code @Data}.
 */
@Data
public class UserVisitStat implements Serializable
{
    private static final long serialVersionUID = -3946754305303957850L;
    // Day the statistics belong to. NOTE(review): the string format is chosen by the caller - confirm.
    private String date;
    private String ip; // user_id
    // Time of the latest visit; InfoRepository fills it from System.currentTimeMillis().
    private long recentVisitTime;
    // Number of visits on that day; InfoRepository starts it at 1 and adds 1 per later visit.
    private long todayVisitTimes;

    public UserVisitStat() {}

    public UserVisitStat(String date, String ip, long recentVisitTime, long todayVisitTimes)
    {
        this.date = date;
        this.ip = ip;
        this.recentVisitTime = recentVisitTime;
        this.todayVisitTimes = todayVisitTimes;
    }
}
/src/main/java/com/taogen/hotcrawler/commons/repository/InfoRepository.java: -------------------------------------------------------------------------------- 1 | package com.taogen.hotcrawler.commons.repository; 2 | 3 | import com.taogen.hotcrawler.commons.entity.Info; 4 | import com.taogen.hotcrawler.commons.entity.UserVisitStat; 5 | import org.slf4j.Logger; 6 | import org.slf4j.LoggerFactory; 7 | import org.springframework.data.redis.core.HashOperations; 8 | import org.springframework.data.redis.core.RedisTemplate; 9 | import org.springframework.stereotype.Repository; 10 | 11 | import javax.annotation.PostConstruct; 12 | import javax.annotation.Resource; 13 | import java.util.*; 14 | import java.util.concurrent.TimeUnit; 15 | 16 | @Repository 17 | public class InfoRepository 18 | { 19 | private static final Logger log = LoggerFactory.getLogger(InfoRepository.class); 20 | 21 | @Resource 22 | private RedisTemplate redisTemplate; 23 | private HashOperations hashOps; 24 | 25 | private HashOperations userVisitCountHashOps; 26 | 27 | @PostConstruct 28 | public void init() 29 | { 30 | hashOps = redisTemplate.opsForHash(); 31 | userVisitCountHashOps = redisTemplate.opsForHash(); 32 | } 33 | 34 | public void save(Info info, String code) 35 | { 36 | 37 | String key = code; 38 | hashOps.putIfAbsent(key, info.getId(), info); 39 | } 40 | 41 | public void saveAll(List infoList, String code) 42 | { 43 | String key = code; 44 | Map map = new HashMap<>(); 45 | infoList.forEach(info -> map.put(info.getId(), info)); 46 | hashOps.putAll(key, map); 47 | } 48 | 49 | public void update(Info info, String code) 50 | { 51 | String key = code; 52 | hashOps.put(key, info.getId(), info); 53 | } 54 | 55 | public List findByTypeId(String code) 56 | { 57 | Map infoMap = new HashMap<>(); 58 | if (redisTemplate.hasKey(code)) 59 | { 60 | infoMap = hashOps.entries(code); 61 | } 62 | List infoList = new ArrayList<>(); 63 | infoMap.forEach((k, v) -> infoList.add(v)); 64 | Collections.sort(infoList); 
65 | log.debug("redis info list size is {}", infoList.size()); 66 | return infoList; 67 | } 68 | 69 | public Info findByInfoId(String code, String infoId) 70 | { 71 | Info info = null; 72 | if (redisTemplate.hasKey(code)) 73 | { 74 | info = hashOps.get(code, infoId); 75 | } 76 | return info; 77 | } 78 | 79 | public long countByTypeId(String code) 80 | { 81 | Long count = hashOps.size(code); 82 | return count == null ? 0 : count; 83 | } 84 | 85 | public void removeByTypeId(String code) 86 | { 87 | redisTemplate.delete(code); 88 | } 89 | 90 | public void statVisitUser(String ip, String today) 91 | { 92 | UserVisitStat userVisitStat = findVisitUser(ip, today); 93 | if (userVisitStat != null) 94 | { 95 | updateVisitUser(userVisitStat); 96 | } 97 | else 98 | { 99 | insertVisitUser(ip, today); 100 | } 101 | } 102 | 103 | public long countVisitUser(String date) 104 | { 105 | String key = getUserVisitCountKey(date); 106 | return userVisitCountHashOps.entries(key).size(); 107 | } 108 | 109 | private void updateVisitUser(UserVisitStat userVisitStat) 110 | { 111 | String date = userVisitStat.getDate(); 112 | String ip = userVisitStat.getIp(); 113 | userVisitCountHashOps.put(getUserVisitCountKey(date), ip, 114 | new UserVisitStat(date, ip, getCurrentTime(), userVisitStat.getTodayVisitTimes() + 1)); 115 | } 116 | 117 | private void insertVisitUser(String ip, String date) 118 | { 119 | String key = getUserVisitCountKey(date); 120 | userVisitCountHashOps.put(key, ip, 121 | new UserVisitStat(date, ip, getCurrentTime(), 1L)); 122 | userVisitCountHashOps.getOperations().expire(key, 24, TimeUnit.HOURS); 123 | } 124 | 125 | public UserVisitStat findVisitUser(String ip, String today) 126 | { 127 | String key = getUserVisitCountKey(today); 128 | return userVisitCountHashOps.get(key, ip); 129 | } 130 | 131 | public String getUserVisitCountKey(String date) 132 | { 133 | return "user_visit:" + date; 134 | } 135 | 136 | public static long getCurrentTime() 137 | { 138 | return 
System.currentTimeMillis(); 139 | } 140 | 141 | } 142 | -------------------------------------------------------------------------------- /src/main/java/com/taogen/hotcrawler/commons/task/CrawlerTask.java: -------------------------------------------------------------------------------- 1 | package com.taogen.hotcrawler.commons.task; 2 | 3 | import com.taogen.hotcrawler.api.service.BaseService; 4 | import com.taogen.hotcrawler.commons.config.SiteProperties; 5 | import com.taogen.hotcrawler.commons.crawler.HotProcessor; 6 | import com.taogen.hotcrawler.commons.entity.Info; 7 | import com.taogen.hotcrawler.commons.repository.InfoRepository; 8 | import org.slf4j.Logger; 9 | import org.slf4j.LoggerFactory; 10 | import org.springframework.beans.factory.annotation.Autowired; 11 | import org.springframework.beans.factory.annotation.Value; 12 | import org.springframework.context.annotation.Configuration; 13 | import org.springframework.scheduling.annotation.EnableScheduling; 14 | import org.springframework.scheduling.annotation.Scheduled; 15 | 16 | import java.util.List; 17 | import java.util.concurrent.ExecutorService; 18 | import java.util.concurrent.Executors; 19 | 20 | @Configuration 21 | @EnableScheduling 22 | public class CrawlerTask 23 | { 24 | private static final Logger log = LoggerFactory.getLogger(CrawlerTask.class); 25 | 26 | @Autowired 27 | private InfoRepository infoRepository; 28 | 29 | @Autowired 30 | private BaseService baseService; 31 | 32 | @Autowired 33 | private SiteProperties siteProperties; 34 | 35 | @Value("${crawler.task.enable}") 36 | private Boolean enable; 37 | 38 | @Value("${crawler.task.threadPoolNum}") 39 | private int threadPoolNum; 40 | 41 | @Scheduled(fixedRateString = "${crawler.task.fixedRate}", initialDelayString = "${crawler.task.initialDelay}") 42 | public void crawlHotList() 43 | { 44 | if (enable) 45 | { 46 | log.info("Crawler task begin..."); 47 | List sites = siteProperties.sites(); 48 | List cateList = siteProperties.getCates(); 49 
| log.info("site list: {}", sites.size()); 50 | 51 | if (cateList != null) 52 | { 53 | executeTask(cateList, sites); 54 | 55 | } 56 | } 57 | } 58 | 59 | private void executeTask(List cateList, List sites) 60 | { 61 | threadPoolNum = threadPoolNum < cateList.size() ? threadPoolNum : sites.size(); 62 | ExecutorService executorService = Executors.newFixedThreadPool(threadPoolNum); 63 | 64 | for (SiteProperties.SiteCate cate : cateList) 65 | { 66 | for (SiteProperties.SiteInfo site : cate.getSites()) { 67 | executorService.submit(() -> { 68 | try { 69 | HotProcessor hotProcessor = null; 70 | hotProcessor = (HotProcessor) baseService.getBean(Class.forName(site.getProcessorClassPath())); 71 | List infoList = hotProcessor.crawlHotList(); 72 | infoRepository.removeByTypeId(site.getCode()); 73 | infoRepository.saveAll(infoList, site.getCode()); 74 | } catch (RuntimeException | ClassNotFoundException e) { 75 | log.error(e.getMessage(), e); 76 | } 77 | }); 78 | } 79 | } 80 | } 81 | } 82 | -------------------------------------------------------------------------------- /src/main/java/com/taogen/hotcrawler/commons/util/ClassUtils.java: -------------------------------------------------------------------------------- 1 | package com.taogen.hotcrawler.commons.util; 2 | 3 | import org.slf4j.Logger; 4 | import org.slf4j.LoggerFactory; 5 | 6 | public class ClassUtils { 7 | private static Logger log = LoggerFactory.getLogger(ClassUtils.class); 8 | 9 | public static Class getClassByClassPath(String classPath){ 10 | try { 11 | return Class.forName(classPath); 12 | } catch (ClassNotFoundException e) { 13 | log.error(e.getMessage(), e); 14 | } 15 | return null; 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /src/main/java/com/taogen/hotcrawler/commons/util/OsUtils.java: -------------------------------------------------------------------------------- 1 | package com.taogen.hotcrawler.commons.util; 2 | 3 | public class OsUtils 4 | { 5 | 
public static final String OS_TYPE_LINUX = "LINUX"; 6 | public static final String OS_TYPE_WINDOWS = "WIN"; 7 | public static final String OS_TYPE_MAC = "MAC"; 8 | public static final String OS_TYPE_UNIX = "UNIX"; 9 | public static final String OS_TYPE_SOLARIS = "SOLARIS"; 10 | 11 | private static final String OS = System.getProperty("os.name").toLowerCase(); 12 | 13 | public static String getOsType() 14 | { 15 | if (isWindows()) 16 | { 17 | return OS_TYPE_WINDOWS; 18 | } 19 | else if (isMac()) 20 | { 21 | return OS_TYPE_MAC; 22 | } 23 | else if (isSolaris()) 24 | { 25 | return OS_TYPE_SOLARIS; 26 | } 27 | else if (isUnix()) 28 | { 29 | return OS_TYPE_UNIX; 30 | } 31 | else 32 | { 33 | return null; 34 | } 35 | } 36 | 37 | public static boolean isWindows() 38 | { 39 | return (OS.indexOf("win") >= 0); 40 | } 41 | 42 | public static boolean isMac() 43 | { 44 | return (OS.indexOf("mac") >= 0); 45 | } 46 | 47 | public static boolean isUnix() 48 | { 49 | return (OS.indexOf("nix") >= 0 || OS.indexOf("nux") >= 0 || OS.indexOf("aix") > 0 ); 50 | } 51 | 52 | public static boolean isSolaris() 53 | { 54 | return (OS.indexOf("sunos") >= 0); 55 | } 56 | 57 | } 58 | -------------------------------------------------------------------------------- /src/main/java/com/taogen/hotcrawler/commons/vo/HttpRequest.java: -------------------------------------------------------------------------------- 1 | package com.taogen.hotcrawler.commons.vo; 2 | 3 | import com.taogen.hotcrawler.commons.constant.RequestMethod; 4 | import lombok.Data; 5 | 6 | import java.util.Map; 7 | 8 | @Data 9 | public class HttpRequest { 10 | private String url; 11 | private RequestMethod requestMethod; 12 | private Map header; 13 | private String requestBody; 14 | } 15 | -------------------------------------------------------------------------------- /src/main/java/com/taogen/hotcrawler/frontend/Controller/IndexController.java: -------------------------------------------------------------------------------- 1 | 
package com.taogen.hotcrawler.frontend.controller; 2 | 3 | import com.taogen.hotcrawler.api.service.InfoService; 4 | import com.taogen.hotcrawler.commons.config.SiteProperties; 5 | import com.taogen.hotcrawler.commons.entity.Info; 6 | import com.taogen.hotcrawler.commons.entity.InfoCate; 7 | import org.slf4j.Logger; 8 | import org.slf4j.LoggerFactory; 9 | import org.springframework.beans.factory.annotation.Autowired; 10 | import org.springframework.beans.factory.annotation.Value; 11 | import org.springframework.stereotype.Controller; 12 | import org.springframework.ui.Model; 13 | import org.springframework.web.bind.annotation.GetMapping; 14 | import org.springframework.web.bind.annotation.RequestParam; 15 | 16 | import javax.servlet.http.HttpServletRequest; 17 | import java.util.List; 18 | 19 | @Controller 20 | public class IndexController 21 | { 22 | Logger log = LoggerFactory.getLogger(IndexController.class); 23 | 24 | @Autowired 25 | private InfoService infoService; 26 | 27 | @Autowired 28 | private SiteProperties siteProperties; 29 | 30 | @Value("${domain}") 31 | private String domain; 32 | 33 | public static final String KEY_DOMAIN = "domain"; 34 | public static final String DOMAIN_DESC = "The domain is {}"; 35 | /** 36 | * v1 37 | */ 38 | @GetMapping("/v1") 39 | public String toIndexPageV1(Model model) 40 | { 41 | log.debug(DOMAIN_DESC, domain); 42 | model.addAttribute(KEY_DOMAIN, domain); 43 | return "index"; //view 44 | } 45 | 46 | /** 47 | * v2 48 | */ 49 | @GetMapping("/") 50 | public String toIndexPageV2(@RequestParam(name = "tab", required = false) String tab, Model model, 51 | HttpServletRequest request) 52 | { 53 | log.debug(DOMAIN_DESC, domain); 54 | log.debug("tab: {}", tab); 55 | if (tab == null || tab.isEmpty()){ 56 | tab = siteProperties.getDefaultSiteInfo().getCode(); 57 | } 58 | SiteProperties.SiteInfo siteInfo = siteProperties.getSiteBySiteCode(tab); 59 | List cates = siteProperties.convertToInfoCateList(); 60 | List infos = 
infoService.findListByTypeId(siteInfo.getCode()); 61 | Long visitUserCount = infoService.countVisitUser(); 62 | 63 | model.addAttribute(KEY_DOMAIN, domain); 64 | model.addAttribute("cates", cates); 65 | model.addAttribute("infos", infos); 66 | model.addAttribute("thisSiteInfo", siteInfo); 67 | model.addAttribute("visitUserCount", visitUserCount); 68 | infoService.statVisitUser(request); 69 | log.info("Current visit by {}", InfoService.getRealIpAddress(request)); 70 | log.info("Today visited user size is {}", visitUserCount); 71 | return "index2"; //view 72 | } 73 | 74 | /** 75 | * v3 76 | */ 77 | @GetMapping("/v3") 78 | public String toIndexPageV3(Model model) 79 | { 80 | log.debug(DOMAIN_DESC, domain); 81 | model.addAttribute(KEY_DOMAIN, domain); 82 | return "index3"; //view 83 | } 84 | } 85 | -------------------------------------------------------------------------------- /src/main/resources/application-dev.yml: -------------------------------------------------------------------------------- 1 | domain: http://localhost:${server.port} 2 | #domain: http://192.168.1.15:${server.port} 3 | crawler: 4 | task: 5 | enable: true 6 | threadPoolNum: 5 7 | initialDelay: 3000 8 | fixedRate: 300000 9 | cron: "*/5 * * * * ?" 
10 | chromeDriver: 11 | enable: true 12 | linuxPath: "/hotcrawler/chromedriver" 13 | winPath: "D:/chromedriver-75.exe" 14 | #-------------------------------------------- 15 | spring: 16 | cache: 17 | type: redis 18 | redis: 19 | database: 0 20 | host: localhost 21 | port: 6379 22 | password: 23 | ssl: false 24 | jedis: 25 | pool: 26 | max-active: 7 27 | max-idle: 7 28 | min-idle: 2 29 | max-wait: -1ms 30 | logging: 31 | level: 32 | root: error 33 | org.springframework.web: info 34 | com.taogen.hotcrawler: debug 35 | org.hibernate: info 36 | server: 37 | port: 8080 -------------------------------------------------------------------------------- /src/main/resources/application-prod.yml: -------------------------------------------------------------------------------- 1 | domain: https://hot.const520.com 2 | crawler: 3 | task: 4 | enable: true 5 | threadPoolNum: 5 6 | initialDelay: 3000 7 | fixedRate: 300000 8 | cron: "*/5 * * * * ?" 9 | chromeDriver: 10 | enable: true 11 | linuxPath: "/hotcrawler/chromedriver" 12 | winPath: "D:/chromedriver-75.exe" 13 | #-------------------------------------------- 14 | spring: 15 | cache: 16 | type: redis 17 | redis: 18 | database: 0 19 | host: ENC(mr2PNDKFaXHiTr4A/DwFeLB2d/DaeQPr) 20 | port: 6379 21 | password: ENC(AB7cHmgCzhI1RvUFaNj8Is5KJ6pLqnQ95mFSB0bXsFI=) 22 | ssl: false 23 | jedis: 24 | pool: 25 | max-active: 7 26 | max-idle: 7 27 | min-idle: 2 28 | max-wait: -1ms 29 | logging: 30 | level: 31 | root: error 32 | org.springframework.web: error 33 | com.taogen.hotcrawler: info 34 | org.hibernate: error 35 | # path: /var/log/hot.const520.com 36 | # file: ${logging.path}/${spring.application.name}.log -------------------------------------------------------------------------------- /src/main/resources/application-test.yml: -------------------------------------------------------------------------------- 1 | domain: https://test.hot.const520.com 2 | crawler: 3 | task: 4 | enable: true 5 | threadPoolNum: 5 6 | initialDelay: 60000 7 | 
fixedRate: 180000 8 | cron: "*/5 * * * * ?" 9 | chromeDriver: 10 | enable: true 11 | linuxPath: "/hotcrawler/chromedriver" 12 | winPath: "D:/chromedriver-75.exe" 13 | #-------------------------------------------- 14 | spring: 15 | cache: 16 | type: redis 17 | redis: 18 | database: 1 19 | host: localhost 20 | port: 6379 21 | password: 22 | ssl: false 23 | jedis: 24 | pool: 25 | max-active: 7 26 | max-idle: 7 27 | min-idle: 2 28 | max-wait: -1ms 29 | logging: 30 | level: 31 | root: error 32 | org.springframework.web: error 33 | com.taogen.hotcrawler: debug 34 | org.hibernate: error 35 | server: 36 | port: 8081 -------------------------------------------------------------------------------- /src/main/resources/application.yml: -------------------------------------------------------------------------------- 1 | spring: 2 | profiles: 3 | active: prod 4 | # active: dev 5 | application: 6 | name: hotcrawler -------------------------------------------------------------------------------- /src/main/resources/logback-spring.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | /var/log/hot.const520.com/hotcrawler.log 8 | 9 | 10 | hotcrawler.%d{yyyy-MM-dd}.%i.log 11 | 12 | 50MB 13 | 14 | 30 15 | 16 | 17 | ${FILE_LOG_PATTERN} 18 | utf8 19 | 20 | 21 | 22 | 23 | 24 | /var/log/hot.const520.com/hotcrawler-test.log 25 | 26 | 27 | hotcrawler-test.%d{yyyy-MM-dd}.%i.log 28 | 29 | 50MB 30 | 31 | 30 32 | 33 | 34 | ${FILE_LOG_PATTERN} 35 | utf8 36 | 37 | 38 | 39 | 40 | 41 | D:/logs/hot.const520.com/hotcrawler-dev.log 42 | 43 | hotcrawler-dev.%d{yyyy-MM-dd}.%i.log 44 | 45 | 50MB 46 | 47 | 30 48 | 49 | 50 | ${FILE_LOG_PATTERN} 51 | utf8 52 | 53 | 54 | 55 | 56 | 57 | 58 | ${CONSOLE_LOG_PATTERN} 59 | utf8 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | -------------------------------------------------------------------------------- 
/src/main/resources/static/css/index.css: -------------------------------------------------------------------------------- 1 | #top{ 2 | background-color: darkseagreen; 3 | width: auto; 4 | height: 44px; 5 | padding: 0px 5px 0px 5px; 6 | } 7 | 8 | #top-content{ 9 | /*background-color: red;*/ 10 | min-width: 200px; 11 | max-width: 1100px; 12 | line-height: 44px; 13 | margin: 0 auto; 14 | padding: 0px 20px 0px 0px; 15 | font-size: 20px; 16 | font-weight: bold; 17 | } 18 | #top-content img{ 19 | height: 44px; 20 | } 21 | #top-logo { 22 | /*background-color: darkolivegreen;*/ 23 | width: 44px; 24 | height: 44px; 25 | line-height: 44px; 26 | margin-right: 10px; 27 | float: left; 28 | } 29 | #top-title{ 30 | /*background-color: darkolivegreen;*/ 31 | height: 44px; 32 | line-height: 44px; 33 | margin-right: 10px; 34 | display: inline; 35 | } 36 | #footer{ 37 | background-color: #e2e2e2; 38 | height: 100px; 39 | 40 | } 41 | 42 | /***************************************************/ 43 | 44 | 45 | #wrapper{ 46 | background-color: #e2e2e2; 47 | width: auto; 48 | line-height: 30px; 49 | padding: 5px 5px 25px 5px; 50 | } 51 | 52 | #content{ 53 | background-color: #FFFFFF; 54 | /*background-color: #009688;*/ 55 | min-width: 200px; 56 | max-width: 1100px; 57 | line-height: 30px; 58 | margin: 0 auto; 59 | padding: 0px 10px 0px 10px; 60 | } 61 | 62 | #menu-bar{ 63 | /*background-color: #009688;*/ 64 | line-height: 30px; 65 | min-height: 80px; 66 | padding: 10px 0 10px 0; 67 | } 68 | 69 | .menu-row{ 70 | /*background-color: antiquewhite;*/ 71 | width: 100%; 72 | } 73 | .menu-item, .menu-item-cate{ 74 | /*background-color: aquamarine;*/ 75 | padding: 3px 8px 3px 8px; 76 | display: inline; 77 | text-align: center; 78 | border-radius: 5px; 79 | font-size: 15px; 80 | /*font-family: FangSong;*/ 81 | font-family: "Helvetica Neue", "Luxi Sans", "DejaVu Sans", Tahoma, "Hiragino Sans GB", "Microsoft Yahei", sans-serif; 82 | /*font-weight: bold;*/ 83 | } 84 | 85 | .menu-item-split{ 86 | 
/*background-color: red;*/ 87 | width: 1%; 88 | display: inline-block; 89 | } 90 | 91 | .menu-item-selected{ 92 | /*TODO*/ 93 | border-bottom: 2px solid #009688; 94 | } 95 | 96 | #menu-split { 97 | background-color: #e2e2e2; 98 | /*width: 100%;*/ 99 | height: 3px; 100 | margin-right: -10px; 101 | margin-left: -10px; 102 | } 103 | #tip{ 104 | /*background-color: #009688;*/ 105 | color: darkgrey; 106 | margin-top: 5px; 107 | margin-bottom: -5px; 108 | } 109 | 110 | #main{ 111 | /*background-color: red;*/ 112 | min-height: 500px; 113 | padding: 10px 0 30px 0; 114 | } 115 | .info-item{ 116 | /*background-color: darkseagreen;*/ 117 | width: 100%; 118 | line-height: 23px; 119 | padding: 7px 0px 7px 0px; 120 | border-bottom: 1px solid darkgray; 121 | } 122 | 123 | 124 | -------------------------------------------------------------------------------- /src/main/resources/static/css/style.css: -------------------------------------------------------------------------------- 1 | body{ 2 | font-family: "Microsoft JhengHei"; 3 | font-size: 14px; 4 | padding: 0px; 5 | margin: 0px; 6 | /*font: 14px Helvetica Neue, Helvetica, PingFang SC, Tahoma, Arial, sans-serif;*/ 7 | } 8 | 9 | a{ 10 | color: black; 11 | text-decoration : none; 12 | cursor: pointer; 13 | /*remove mobile touch color*/ 14 | -webkit-tap-highlight-color: rgba(0,0,0,0); 15 | -webkit-tap-highlight-color: transparent; 16 | } 17 | .pointer {cursor: pointer;} 18 | a:link { 19 | color: black; 20 | } 21 | a:visited { 22 | color: black; 23 | } 24 | a:hover { 25 | color: gray; 26 | } 27 | #top a:hover{ 28 | color: #000000; 29 | } 30 | a:active { 31 | color: gray; 32 | } 33 | 34 | h1,h2,h3,h4,h5,h6{ 35 | margin: 10px 0px 10px 3%; 36 | } 37 | 38 | .no-break-word { 39 | display: inline-block; 40 | white-space:nowrap; 41 | } -------------------------------------------------------------------------------- /src/main/resources/static/favicon.ico: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/tagnja/hot-crawler/89d9311844470eaba1851fc4963cff4abfaf713e/src/main/resources/static/favicon.ico -------------------------------------------------------------------------------- /src/main/resources/static/img/const520-logo_200x200.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tagnja/hot-crawler/89d9311844470eaba1851fc4963cff4abfaf713e/src/main/resources/static/img/const520-logo_200x200.png -------------------------------------------------------------------------------- /src/main/resources/static/img/const520.ico: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tagnja/hot-crawler/89d9311844470eaba1851fc4963cff4abfaf713e/src/main/resources/static/img/const520.ico -------------------------------------------------------------------------------- /src/main/resources/static/js/index.js: -------------------------------------------------------------------------------- 1 | var getTypesUrl = domain + "/api/v1/types"; 2 | $.ajax({ 3 | url: getTypesUrl, 4 | type: "get", 5 | async: false, 6 | dataType: "json", 7 | success: function(res){ 8 | //console.log(JSON.stringify(res)); 9 | var infoCates = res.data; 10 | for (var j = 0; j < infoCates.length; j++) 11 | { 12 | var infoCate = infoCates[j]; 13 | var menuRowId = "menu-row-"+j; 14 | var menuRow = '' 15 | $("#menu-bar").append(menuRow) 16 | var infoCateCol = ''; 17 | $("#"+menuRowId).append(infoCateCol); 18 | var infoTypes = infoCate.infoTypes; 19 | for (var i = 0; i < infoTypes.length; i++) 20 | { 21 | var menuItem = ""; 22 | $("#"+ menuRowId).append(menuItem); 23 | } 24 | } 25 | /*var contentBlock = "
"; 26 | $("#content").append(contentBlock);*/ 27 | 28 | // initial get infos 29 | var hash = window.location.hash.substr(1); 30 | var cateId, code; 31 | if (hash) { 32 | cateId = hash.split('-')[0]; 33 | code = hash.split('-')[1]; 34 | } else { 35 | cateId = infoCates[0].id; 36 | code = infoCates[0].infoTypes[0].id; 37 | } 38 | // console.log("cateId: " + cateId + ", code: " + code); 39 | getInfos(cateId, code); 40 | }, 41 | error: function(res){ 42 | console.log(JSON.stringify(res)) 43 | } 44 | }); 45 | 46 | $(".menu-item").click(function () { 47 | var cateId = $(this).attr("cateId"); 48 | var code = $(this).attr("code"); 49 | window.location = window.location.href.split('#')[0] + "#" + cateId +"-"+code; 50 | window.location.reload(); 51 | //getInfos(cateId, code); 52 | //console.log(infos); 53 | }); 54 | 55 | function selected(cateId, code) 56 | { 57 | $(".menu-item").each(function() { 58 | if ($(this).attr("cateId") == cateId && $(this).attr("code") == code) { 59 | $(this).css({"background-color": "#445", "color": "#FFFFFF"}); 60 | 61 | } 62 | }); 63 | } 64 | function getInfos(cateId, code) 65 | { 66 | selected(cateId, code); 67 | var infos; 68 | var getInfoUrl = domain + "/api/v1/cates/"+cateId+"/types/" + code + "/infos"; 69 | $.ajax({ 70 | url: getInfoUrl, 71 | type: "get", 72 | async: false, 73 | dataType: "json", 74 | success: function(res){ 75 | infos = res.data; 76 | putInfos("#main", infos); 77 | }, 78 | error: function(res){ 79 | console.log(JSON.stringify(res)) 80 | } 81 | }); 82 | return infos; 83 | } 84 | 85 | function putInfos(elementId, infos) { 86 | $(elementId).empty(); 87 | if (infos.length > 0){ 88 | var infoItem; 89 | for (var j = 0; j < infos.length; j++) 90 | { 91 | if (typeof window.orientation !== 'undefined') { 92 | infoItem = ""; 93 | }else { 94 | infoItem = ""; 95 | } 96 | $(elementId).append(infoItem); 97 | } 98 | }else { 99 | var blankTip = "
该站点暂无数据!
"; 100 | $(elementId).append(blankTip); 101 | } 102 | } -------------------------------------------------------------------------------- /src/main/resources/static/js/index2.js: -------------------------------------------------------------------------------- 1 | generateMenu(); 2 | showStatData(); 3 | selectTab(); 4 | generateInfos("#main", infos); 5 | console.log(thisSiteInfo); 6 | 7 | /** 8 | **************************** 9 | * functions 10 | **************************** 11 | */ 12 | 13 | function generateMenu(){ 14 | for (var j = 0; j < infoCates.length; j++) 15 | { 16 | var infoCate = infoCates[j]; 17 | var menuRowId = "menu-row-"+j; 18 | var menuRow = '' 19 | $("#menu-bar").append(menuRow) 20 | var infoCateCol = ''; 21 | $("#"+menuRowId).append(infoCateCol); 22 | var infoTypes = infoCate.infoTypes; 23 | for (var i = 0; i < infoTypes.length; i++) 24 | { 25 | var menuItem = "" 26 | + infoTypes[i].name + ""; 27 | if (i < infoTypes.length - 1) 28 | { 29 | menuItem = menuItem + ""; 30 | } 31 | $("#"+ menuRowId).append(menuItem); 32 | } 33 | } 34 | } 35 | 36 | function showStatData() { 37 | var visitUserCountDiv = '
\n' + 38 | '
今日访问人数:'+visitUserCount+'
\n' + 39 | '
'; 40 | $("#footer").html(visitUserCountDiv); 41 | } 42 | 43 | function generateInfos(elementId, infos) { 44 | $(elementId).empty(); 45 | if (infos.length > 0){ 46 | var infoItem; 47 | for (var j = 0; j < infos.length; j++) 48 | { 49 | if (typeof window.orientation !== 'undefined') { 50 | infoItem = ""; 51 | }else { 52 | infoItem = ""; 53 | } 54 | $(elementId).append(infoItem); 55 | } 56 | }else { 57 | var blankTip = "
该站点暂无数据!
"; 58 | $(elementId).append(blankTip); 59 | } 60 | } 61 | 62 | function selectTab() 63 | { 64 | var code; 65 | if (window.location.href.split("?").length == 2) 66 | { 67 | code = window.location.href.split("?")[1].split("=")[1]; 68 | } 69 | if (! code) { 70 | code = infoCates[0].infoTypes[0].code; 71 | } 72 | console.log("code is : " + code); 73 | // console.log("selected: cateId=" + cateId + ", code=" + code) 74 | $(".menu-item").each(function() { 75 | if ($(this).attr("code") == code) { 76 | $(this).css({"background-color": "#445", "color": "#FFFFFF"}); 77 | } 78 | }); 79 | showNote(); 80 | } 81 | function showNote(){ 82 | $("#tip").empty(); 83 | if (thisSiteInfo.type == "abroad"){ 84 | console.log("thisSiteInfo type is : " + thisSiteInfo) 85 | $("#tip").html("(该网站需要科学上网!)"); 86 | } 87 | } 88 | // function checkNetwork() { 89 | // var request = $.ajax({ 90 | // url: "https://google.com", 91 | // type: "get", 92 | // cache: false, 93 | // dataType: "jsonp", 94 | // processData: false, 95 | // timeout: 10000, 96 | // complete: function (data) { 97 | // if (data.status != 200) { 98 | // $("#tip").html("访问该网站需要科学上网!"); 99 | // request.abort(); 100 | // } 101 | // } 102 | // }); 103 | // } 104 | 105 | /** 106 | ************************* 107 | * Events 108 | ************************* 109 | */ 110 | 111 | $(".menu-item").click(function () { 112 | var code = $(this).attr("code"); 113 | /* window.location = window.location.href.split('#')[0] + "#" + cateId +"-"+code; 114 | window.location.reload();*/ 115 | window.location.href = window.location.href.split('#')[0].split('?')[0] + "?tab=" +code; 116 | //getInfos(cateId, code); 117 | //console.log(infos); 118 | }); 119 | -------------------------------------------------------------------------------- /src/main/resources/static/js/index3.js: -------------------------------------------------------------------------------- 1 | var getTypesUrl = domain + "/api/v1/types"; 2 | $.ajax({ 3 | url: getTypesUrl, 4 | type: "get", 5 | 
async: false, 6 | dataType: "json", 7 | success: function(res){ 8 | //console.log(JSON.stringify(res)); 9 | var infoCates = res.data; 10 | for (var j = 0; j < infoCates.length; j++) 11 | { 12 | var infoCate = infoCates[j]; 13 | var menuRowId = "menu-row-"+j; 14 | var menuRow = '' 15 | $("#menu-bar").append(menuRow) 16 | var infoCateCol = ''; 17 | $("#"+menuRowId).append(infoCateCol); 18 | var infoTypes = infoCate.infoTypes; 19 | for (var i = 0; i < infoTypes.length; i++) 20 | { 21 | var menuItem = "" + infoTypes[i].name + ""; 22 | $("#"+ menuRowId).append(menuItem); 23 | } 24 | } 25 | /*var contentBlock = "
"; 26 | $("#content").append(contentBlock);*/ 27 | 28 | // initial get infos 29 | // var hash = window.location.hash.substr(1); 30 | // var cateId, code; 31 | // if (hash) { 32 | // cateId = hash.split('-')[0]; 33 | // code = hash.split('-')[1]; 34 | // } else { 35 | // cateId = infoCates[0].id; 36 | // code = infoCates[0].infoTypes[0].id; 37 | // } 38 | // console.log("cateId: " + cateId + ", code: " + code); 39 | getInfos(infoCates[0].id, infoCates[0].infoTypes[0].id); 40 | }, 41 | error: function(res){ 42 | console.log(JSON.stringify(res)) 43 | } 44 | }); 45 | 46 | $(".menu-item").click(function () { 47 | var cateId = $(this).attr("cateId"); 48 | var code = $(this).attr("code"); 49 | // window.location = window.location.href.split('#')[0] + "#" + cateId +"-"+code; 50 | // window.location.reload(); 51 | getInfos(cateId, code); 52 | //console.log(infos); 53 | }); 54 | 55 | function selected(cateId, code) 56 | { 57 | $(".menu-item").each(function() { 58 | if ($(this).attr("cateId") == cateId && $(this).attr("code") == code) { 59 | $(this).css({"background-color": "#445", "color": "#FFFFFF"}); 60 | }else{ 61 | $(this).css({"background-color": "#FFFFFF", "color": "#000000"}); 62 | } 63 | }); 64 | } 65 | function getInfos(cateId, code) 66 | { 67 | var infos; 68 | var getInfoUrl = domain + "/api/v1/cates/"+cateId+"/types/" + code + "/infos"; 69 | $.ajax({ 70 | url: getInfoUrl, 71 | type: "get", 72 | async: false, 73 | dataType: "json", 74 | success: function(res){ 75 | infos = res.data; 76 | putInfos("#main", infos, cateId, code); 77 | }, 78 | error: function(res){ 79 | console.log(JSON.stringify(res)) 80 | } 81 | }); 82 | return infos; 83 | } 84 | 85 | function putInfos(elementId, infos, cateId, code) { 86 | $(elementId).empty(); 87 | if (infos.length > 0){ 88 | var infoItem; 89 | for (var j = 0; j < infos.length; j++) 90 | { 91 | if (typeof window.orientation !== 'undefined') { 92 | infoItem = ""; 93 | }else { 94 | infoItem = ""; 95 | } 96 | 
$(elementId).append(infoItem); 97 | } 98 | }else { 99 | var blankTip = "
该站点暂无数据!
"; 100 | $(elementId).append(blankTip); 101 | } 102 | selected(cateId, code); 103 | } -------------------------------------------------------------------------------- /src/main/resources/templates/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | const520 8 | 9 | 10 | 11 | 12 | 13 |
14 | 15 | 21 | 22 |
23 | 24 |
25 |
26 | 27 | 28 |
29 |
30 |
31 | 32 | 33 | 34 | 35 | 36 | 40 | 41 | 42 | 43 | -------------------------------------------------------------------------------- /src/main/resources/templates/index2.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | const520 8 | 9 | 10 | 11 | 12 | 13 |
14 | 15 | 21 | 22 |
23 | 24 |
25 |
26 | 32 | 33 |
34 |
35 | 36 |
37 |
38 |
39 | 41 | 42 | 43 | 44 | 45 | 55 | 56 | 57 | 58 | -------------------------------------------------------------------------------- /src/main/resources/templates/index3.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | const520 8 | 9 | 10 | 11 | 12 | 13 |
14 | 15 | 21 | 22 |
23 | 24 |
25 |
26 | 27 | 28 |
29 |
30 |
31 | 32 | 33 | 34 | 35 | 36 | 40 | 41 | 42 | 43 | -------------------------------------------------------------------------------- /src/main/resources/webdrivermanager.properties: -------------------------------------------------------------------------------- 1 | wdm.targetPath=~/.m2/repository/webdriver 2 | wdm.forceCache=false 3 | wdm.override=false 4 | wdm.useMirror=false 5 | wdm.useBetaVersions=false 6 | wdm.avoidExport=false 7 | wdm.avoidOutputTree=false 8 | wdm.avoidAutoVersion=false 9 | wdm.avoidAutoReset=false 10 | wdm.avoidPreferences=false 11 | wdm.timeout=30 12 | wdm.serverPort=4041 13 | wdm.ttl=86400 14 | 15 | wdm.chromeDriverUrl=https://chromedriver.storage.googleapis.com/ 16 | wdm.chromeDriverMirrorUrl=http://npm.taobao.org/mirrors/chromedriver 17 | wdm.chromeDriverExport=webdriver.chrome.driver 18 | wdm.chromeDriverVersion=76 19 | #wdm.chromeDriverVersion=LATEST 20 | 21 | wdm.geckoDriverUrl=https://api.github.com/repos/mozilla/geckodriver/releases 22 | wdm.geckoDriverMirrorUrl=http://npm.taobao.org/mirrors/geckodriver 23 | wdm.geckoDriverExport=webdriver.gecko.driver 24 | 25 | wdm.operaDriverUrl=https://api.github.com/repos/operasoftware/operachromiumdriver/releases 26 | wdm.operaDriverMirrorUrl=http://npm.taobao.org/mirrors/operadriver 27 | wdm.operaDriverExport=webdriver.opera.driver 28 | 29 | wdm.phantomjsDriverUrl=https://bitbucket.org/ariya/phantomjs/downloads/ 30 | wdm.phantomjsDriverMirrorUrl=http://npm.taobao.org/mirrors/phantomjs 31 | wdm.phantomjsDriverExport=phantomjs.binary.path 32 | 33 | wdm.edgeDriverUrl=https://developer.microsoft.com/en-us/microsoft-edge/tools/webdriver/ 34 | wdm.edgeDriverExport=webdriver.edge.driver 35 | 36 | wdm.internetExplorerDriverUrl=https://selenium-release.storage.googleapis.com/ 37 | wdm.internetExplorerDriverExport=webdriver.ie.driver 38 | 39 | wdm.seleniumServerStandaloneUrl=https://selenium-release.storage.googleapis.com/ 40 | 41 | 
wdm.versionsPropertiesUrl=https://raw.githubusercontent.com/bonigarcia/webdrivermanager/master/src/main/resources/versions.properties -------------------------------------------------------------------------------- /src/test/java/com/taogen/hotcrawler/AppTest.java: -------------------------------------------------------------------------------- 1 | package com.taogen.hotcrawler; 2 | 3 | import org.junit.Assert; 4 | import org.junit.Before; 5 | import org.junit.Test; 6 | import org.junit.runner.RunWith; 7 | import org.springframework.beans.factory.annotation.Autowired; 8 | import org.springframework.boot.test.context.SpringBootTest; 9 | import org.springframework.mock.web.MockServletContext; 10 | import org.springframework.test.context.junit4.SpringRunner; 11 | import org.springframework.test.web.servlet.MockMvc; 12 | import org.springframework.test.web.servlet.MvcResult; 13 | import org.springframework.test.web.servlet.setup.MockMvcBuilders; 14 | import org.springframework.web.context.WebApplicationContext; 15 | 16 | import javax.servlet.ServletContext; 17 | 18 | import static org.springframework.test.web.servlet.request.MockMvcRequestBuilders.get; 19 | import static org.springframework.test.web.servlet.result.MockMvcResultHandlers.print; 20 | import static org.springframework.test.web.servlet.result.MockMvcResultMatchers.jsonPath; 21 | import static org.springframework.test.web.servlet.result.MockMvcResultMatchers.status; 22 | 23 | 24 | /** 25 | * Unit test for simple App. 
26 | */ 27 | @RunWith(SpringRunner.class) 28 | @SpringBootTest 29 | public class AppTest 30 | { 31 | @Autowired 32 | private WebApplicationContext wac; 33 | 34 | private MockMvc mockMvc; 35 | /* Rebuilds mockMvc from the injected WebApplicationContext before every test. */ 36 | @Before 37 | public void setup() 38 | { 39 | this.mockMvc = MockMvcBuilders.webAppContextSetup(this.wac).build(); 40 | } 41 | /* Asserts that the context exposes a MockServletContext and that a bean can be fetched under the name "InfoController". NOTE(review): Spring's default name for an annotated class InfoController would be "infoController" - presumably the controller declares an explicit bean name; confirm. */ 42 | @Test 43 | public void verifyContext() { 44 | ServletContext servletContext = wac.getServletContext(); 45 | 46 | org.junit.Assert.assertNotNull(servletContext); 47 | Assert.assertTrue(servletContext instanceof MockServletContext); 48 | Assert.assertNotNull(wac.getBean("InfoController")); 49 | } 50 | /* Performs a GET on the root path through MockMvc, prints the exchange and expects HTTP 200; the returned MvcResult is not inspected further. */ 51 | @Test 52 | public void testResponseBody() throws Exception { 53 | MvcResult mvcResult = this.mockMvc.perform(get("/")).andDo(print()) 54 | .andExpect(status().isOk()) 55 | .andReturn(); 56 | } 57 | } 58 | -------------------------------------------------------------------------------- /src/test/java/com/taogen/hotcrawler/api/service/InfoServiceTest.java: -------------------------------------------------------------------------------- 1 | package com.taogen.hotcrawler.api.service; 2 | 3 | import com.taogen.hotcrawler.commons.entity.Info; 4 | import org.junit.Test; 5 | import org.junit.runner.RunWith; 6 | import org.slf4j.Logger; 7 | import org.slf4j.LoggerFactory; 8 | import org.springframework.beans.factory.annotation.Autowired; 9 | import org.springframework.boot.test.context.SpringBootTest; 10 | import org.springframework.test.context.junit4.SpringRunner; 11 | 12 | import java.util.List; 13 | /* Spring Boot test that autowires InfoService. */ 14 | @RunWith(SpringRunner.class) 15 | @SpringBootTest 16 | public class InfoServiceTest 17 | { 18 | private static final Logger log = LoggerFactory.getLogger(InfoServiceTest.class); 19 | 20 | @Autowired 21 | private InfoService infoService; 22 | /* Calls findListByTypeId("1") and only logs the size of the result; nothing is asserted, so the test fails only if the call or infoList.size() throws. */ 23 | @Test 24 | public void findListByTypeIdTest() 25 | { 26 | List infoList = infoService.findListByTypeId("1"); 27 | log.info("info list size: " + infoList.size()); 28 | } 29 | 30 | 31 | } 32 | 
-------------------------------------------------------------------------------- /src/test/java/com/taogen/hotcrawler/api/web/controller/InfoControllerTest.java: -------------------------------------------------------------------------------- 1 | package com.taogen.hotcrawler.api.web.controller; 2 | 3 | import org.junit.Assert; 4 | import org.junit.Before; 5 | import org.junit.Test; 6 | import org.junit.runner.RunWith; 7 | import org.springframework.beans.factory.annotation.Autowired; 8 | import org.springframework.boot.test.context.SpringBootTest; 9 | import org.springframework.mock.web.MockServletContext; 10 | import org.springframework.test.context.junit4.SpringRunner; 11 | import org.springframework.test.web.servlet.MockMvc; 12 | import org.springframework.test.web.servlet.MvcResult; 13 | import org.springframework.test.web.servlet.setup.MockMvcBuilders; 14 | import org.springframework.web.context.WebApplicationContext; 15 | 16 | import javax.servlet.ServletContext; 17 | 18 | import static org.springframework.test.web.servlet.request.MockMvcRequestBuilders.get; 19 | import static org.springframework.test.web.servlet.result.MockMvcResultHandlers.print; 20 | import static org.springframework.test.web.servlet.result.MockMvcResultMatchers.jsonPath; 21 | import static org.springframework.test.web.servlet.result.MockMvcResultMatchers.status; 22 | /* Spring Boot test around InfoController. Only the context check is active; the response-body test at the end of the class is commented out. */ 23 | @RunWith(SpringRunner.class) 24 | @SpringBootTest 25 | public class InfoControllerTest 26 | { 27 | @Autowired 28 | private InfoController infoController; 29 | 30 | @Autowired 31 | private WebApplicationContext wac; 32 | 33 | private MockMvc mockMvc; 34 | /* Rebuilds mockMvc from the injected WebApplicationContext before every test. */ 35 | @Before 36 | public void setup() 37 | { 38 | this.mockMvc = MockMvcBuilders.webAppContextSetup(this.wac).build(); 39 | } 40 | /* Asserts that the context exposes a MockServletContext and that a bean can be fetched under the name "InfoController". NOTE(review): Spring's default name for an annotated class InfoController would be "infoController" - presumably the controller declares an explicit bean name; confirm. */ 41 | @Test 42 | public void verifyContext() { 43 | ServletContext servletContext = wac.getServletContext(); 44 | 45 | org.junit.Assert.assertNotNull(servletContext); 46 | Assert.assertTrue(servletContext instanceof MockServletContext); 47 | 
Assert.assertNotNull(wac.getBean("InfoController")); 48 | } 49 | 50 | // @Test 51 | // public void testResponseBody() throws Exception { 52 | // MvcResult mvcResult = this.mockMvc.perform(get("/api/v1/test")).andDo(print()) 53 | // .andExpect(status().isOk()) 54 | // .andExpect(jsonPath("$.ret_code").value(0)) 55 | // .andExpect(jsonPath("$.ret_msg").value("ok")) 56 | // .andReturn(); 57 | // Assert.assertEquals("application/json;charset=UTF-8", mvcResult.getResponse().getContentType()); 58 | // } 59 | } 60 | -------------------------------------------------------------------------------- /src/test/java/com/taogen/hotcrawler/commons/config/SitePropertiesTest.java: -------------------------------------------------------------------------------- 1 | package com.taogen.hotcrawler.commons.config; 2 | 3 | import org.junit.Test; 4 | import org.junit.runner.RunWith; 5 | import org.slf4j.Logger; 6 | import org.slf4j.LoggerFactory; 7 | import org.springframework.beans.factory.annotation.Autowired; 8 | import org.springframework.boot.test.context.SpringBootTest; 9 | import org.springframework.context.ApplicationContext; 10 | import org.springframework.test.context.junit4.SpringRunner; 11 | 12 | import static org.junit.Assert.assertNotNull; 13 | /* Checks that every processorClassPath configured in SiteProperties resolves to a bean in the application context. */ 14 | @RunWith(SpringRunner.class) 15 | @SpringBootTest 16 | public class SitePropertiesTest { 17 | 18 | Logger log = LoggerFactory.getLogger(getClass()); 19 | @Autowired 20 | private ApplicationContext applicationContext; 21 | 22 | @Autowired 23 | private SiteProperties siteProperties; 24 | /* Walks siteProperties.cates and each category's sites, resolves every site's processorClassPath through getObjectByClassPath and asserts the result is not null, logging the site name first when it is. */ 25 | @Test 26 | public void argumentsCheck(){ 27 | log.debug("begin argument check in property file..."); 28 | // Is url right? Does processName exist? 
29 | if (siteProperties.cates != null){ 30 | for (SiteProperties.SiteCate siteCate : siteProperties.cates){ 31 | if (siteCate.getSites() != null){ 32 | for (SiteProperties.SiteInfo siteInfo : siteCate.getSites()){ 33 | Object object = getObjectByClassPath(siteInfo.getProcessorClassPath()); 34 | if(object == null) { 35 | StringBuilder errorMessage = new StringBuilder(); 36 | errorMessage.append(siteInfo.getName()); 37 | errorMessage.append("'s processorClassPath configuration is error in sites.properties!"); 38 | log.error(errorMessage.toString()); 39 | } 40 | assertNotNull(object); 41 | } 42 | } 43 | } 44 | } 45 | } 46 | 47 | private Object getObjectByClassPath(String processorClassPath) { 48 | Object object = null; 49 | try { 50 | object = applicationContext.getBean(Class.forName(processorClassPath)); 51 | } catch (ClassNotFoundException e) { 52 | log.error(e.getMessage(), e); 53 | } 54 | return object; 55 | } 56 | } -------------------------------------------------------------------------------- /src/test/java/com/taogen/hotcrawler/commons/crawler/HotProcessorTest.java: -------------------------------------------------------------------------------- 1 | package com.taogen.hotcrawler.commons.crawler; 2 | 3 | import com.taogen.hotcrawler.commons.entity.Info; 4 | import org.junit.Ignore; 5 | import org.junit.runner.RunWith; 6 | import org.slf4j.Logger; 7 | import org.springframework.boot.test.context.SpringBootTest; 8 | import org.springframework.test.context.junit4.SpringRunner; 9 | 10 | import java.util.List; 11 | 12 | import static org.junit.Assert.*; 13 | 14 | @RunWith(SpringRunner.class) 15 | @SpringBootTest 16 | @Ignore 17 | public class HotProcessorTest 18 | { 19 | protected Logger log; 20 | 21 | public void checkHotInfoList(List hotList) 22 | { 23 | if (hotList != null && ! 
hotList.isEmpty()) 24 | { 25 | for (int i = 0; i < hotList.size(); i++) 26 | { 27 | testInfo(hotList.get(i), i); 28 | } 29 | 30 | } 31 | } 32 | 33 | private void testInfo(Info info, int i) { 34 | testId(String.valueOf(i+1), info.getId()); 35 | testTitle(info.getTitle()); 36 | testUrl(info.getUrl()); 37 | } 38 | 39 | private void testId(String expectId, String actualId){ 40 | assertEquals(expectId, actualId); 41 | } 42 | 43 | private void testTitle(String title) { 44 | assertNotNull(title); 45 | } 46 | 47 | private void testUrl(String url) { 48 | assertNotNull(url); 49 | assertTrue(url.startsWith("http")); 50 | hasOnlySingleUrl(url); 51 | 52 | } 53 | 54 | private void hasOnlySingleUrl(String url){ 55 | int index1 = url.indexOf("://"); 56 | int index2 = url.indexOf("://", index1 + 1); 57 | if (index2 != -1) 58 | { 59 | log.error("error url is {}", url); 60 | } 61 | assertTrue(index1 != -1); 62 | assertTrue(index2 == -1); 63 | } 64 | } 65 | -------------------------------------------------------------------------------- /src/test/java/com/taogen/hotcrawler/commons/crawler/impl/abroad/BBCNewsHotProcessorTest.java: -------------------------------------------------------------------------------- 1 | package com.taogen.hotcrawler.commons.crawler.impl.abroad; 2 | 3 | import com.taogen.hotcrawler.commons.crawler.HotProcessorTest; 4 | import org.junit.Before; 5 | import org.junit.Test; 6 | import org.slf4j.LoggerFactory; 7 | import org.springframework.beans.factory.annotation.Autowired; 8 | 9 | public class BBCNewsHotProcessorTest extends HotProcessorTest { 10 | 11 | @Autowired 12 | private BBCNewsHotProcessor bbcNewsHotProcessor; 13 | 14 | public BBCNewsHotProcessorTest(){ 15 | this.log = LoggerFactory.getLogger(getClass()); 16 | } 17 | 18 | @Test 19 | public void crawlHotList() { 20 | checkHotInfoList(bbcNewsHotProcessor.crawlHotList()); 21 | } 22 | } -------------------------------------------------------------------------------- 
/src/test/java/com/taogen/hotcrawler/commons/crawler/impl/abroad/EconomistHotProcessorTest.java: -------------------------------------------------------------------------------- 1 | package com.taogen.hotcrawler.commons.crawler.impl.abroad; 2 | 3 | import com.taogen.hotcrawler.commons.crawler.HotProcessorTest; 4 | import org.junit.Test; 5 | import org.slf4j.LoggerFactory; 6 | import org.springframework.beans.factory.annotation.Autowired; 7 | /* Passes the list returned by the autowired EconomistHotProcessor.crawlHotList() to the inherited checkHotInfoList; the constructor assigns the inherited log field a logger for this class. */ 8 | public class EconomistHotProcessorTest extends HotProcessorTest { 9 | 10 | @Autowired 11 | private EconomistHotProcessor economistHotProcessor; 12 | 13 | public EconomistHotProcessorTest(){ 14 | this.log = LoggerFactory.getLogger(getClass()); 15 | } 16 | 17 | @Test 18 | public void crawlHotList() { 19 | checkHotInfoList(economistHotProcessor.crawlHotList()); 20 | } 21 | } -------------------------------------------------------------------------------- /src/test/java/com/taogen/hotcrawler/commons/crawler/impl/abroad/HackernewsHotProcessorTest.java: -------------------------------------------------------------------------------- 1 | package com.taogen.hotcrawler.commons.crawler.impl.abroad; 2 | 3 | import com.taogen.hotcrawler.commons.crawler.HotProcessorTest; 4 | import org.junit.Test; 5 | import org.slf4j.LoggerFactory; 6 | import org.springframework.beans.factory.annotation.Autowired; 7 | 8 | import static org.junit.Assert.*; 9 | /* Passes the list returned by the autowired HackernewsHotProcessor.crawlHotList() to the inherited checkHotInfoList. The static Assert import above is not used by this class. */ 10 | public class HackernewsHotProcessorTest extends HotProcessorTest { 11 | 12 | @Autowired 13 | private HackernewsHotProcessor hackernewsHotProcessor; 14 | 15 | public HackernewsHotProcessorTest(){ 16 | this.log = LoggerFactory.getLogger(getClass()); 17 | } 18 | 19 | @Test 20 | public void crawlHotList() { 21 | checkHotInfoList(hackernewsHotProcessor.crawlHotList()); 22 | } 23 | } -------------------------------------------------------------------------------- /src/test/java/com/taogen/hotcrawler/commons/crawler/impl/abroad/LobstersHotProcessorTest.java: 
-------------------------------------------------------------------------------- 1 | package com.taogen.hotcrawler.commons.crawler.impl.abroad; 2 | 3 | import com.taogen.hotcrawler.commons.crawler.HotProcessorTest; 4 | import org.apache.http.util.Asserts; 5 | import org.junit.Assert; 6 | import org.junit.Before; 7 | import org.junit.Test; 8 | import org.slf4j.LoggerFactory; 9 | import org.springframework.beans.factory.annotation.Autowired; 10 | /* Passes the list returned by the autowired LobstersHotProcessor.crawlHotList() to the inherited checkHotInfoList. The Asserts, Assert and Before imports above are not used by this class. */ 11 | public class LobstersHotProcessorTest extends HotProcessorTest { 12 | 13 | @Autowired 14 | private LobstersHotProcessor lobstersHotProcessor; 15 | 16 | public LobstersHotProcessorTest(){ 17 | this.log = LoggerFactory.getLogger(getClass()); 18 | } 19 | 20 | @Test 21 | public void crawlHotList() { 22 | checkHotInfoList(lobstersHotProcessor.crawlHotList()); 23 | } 24 | 25 | } -------------------------------------------------------------------------------- /src/test/java/com/taogen/hotcrawler/commons/crawler/impl/abroad/MediumHotProcessorTest.java: -------------------------------------------------------------------------------- 1 | package com.taogen.hotcrawler.commons.crawler.impl.abroad; 2 | 3 | import com.taogen.hotcrawler.commons.crawler.HotProcessorTest; 4 | import org.junit.Test; 5 | import org.slf4j.LoggerFactory; 6 | import org.springframework.beans.factory.annotation.Autowired; 7 | /* Passes the list returned by the autowired MediumHotProcessor.crawlHotList() to the inherited checkHotInfoList. */ 8 | public class MediumHotProcessorTest extends HotProcessorTest { 9 | @Autowired 10 | private MediumHotProcessor mediumHotProcessor; 11 | 12 | public MediumHotProcessorTest(){ 13 | this.log = LoggerFactory.getLogger(getClass()); 14 | } 15 | 16 | @Test 17 | public void crawlHotList() { 18 | checkHotInfoList(mediumHotProcessor.crawlHotList()); 19 | } 20 | } -------------------------------------------------------------------------------- /src/test/java/com/taogen/hotcrawler/commons/crawler/impl/abroad/TheNewYorkTimesHotProcessorTest.java: -------------------------------------------------------------------------------- 1 | //package 
com.taogen.hotcrawler.commons.crawler.impl.abroad; 2 | // 3 | //import com.taogen.hotcrawler.commons.crawler.HotProcessorTest; 4 | //import org.junit.Test; 5 | //import org.slf4j.LoggerFactory; 6 | //import org.springframework.beans.factory.annotation.Autowired; 7 | // 8 | //public class TheNewYorkTimesHotProcessorTest extends HotProcessorTest { 9 | // @Autowired 10 | // private TheNewYorkTimesHotProcessor theNewYorkTimesHotProcessor; 11 | // 12 | // public TheNewYorkTimesHotProcessorTest(){ 13 | // this.log = LoggerFactory.getLogger(getClass()); 14 | // } 15 | // 16 | // @Test 17 | // public void crawlHotList() { 18 | // checkHotInfoList(theNewYorkTimesHotProcessor.crawlHotList()); 19 | // } 20 | //} -------------------------------------------------------------------------------- /src/test/java/com/taogen/hotcrawler/commons/crawler/impl/news/GeekParkHotProcessorTest.java: -------------------------------------------------------------------------------- 1 | package com.taogen.hotcrawler.commons.crawler.impl.news; 2 | 3 | import com.taogen.hotcrawler.commons.crawler.HotProcessorTest; 4 | import org.junit.Test; 5 | import org.slf4j.LoggerFactory; 6 | import org.springframework.beans.factory.annotation.Autowired; 7 | /* Passes the list returned by the autowired GeekParkHotProcessor.crawlHotList() to the inherited checkHotInfoList; the constructor assigns the inherited log field a logger for this class. */ 8 | public class GeekParkHotProcessorTest extends HotProcessorTest { 9 | @Autowired 10 | private GeekParkHotProcessor geekParkHotProcessor; 11 | 12 | public GeekParkHotProcessorTest(){ 13 | this.log = LoggerFactory.getLogger(getClass()); 14 | } 15 | 16 | @Test 17 | public void crawlHotList() { 18 | checkHotInfoList(geekParkHotProcessor.crawlHotList()); 19 | } 20 | 21 | } -------------------------------------------------------------------------------- /src/test/java/com/taogen/hotcrawler/commons/crawler/impl/news/HuxiuHotProcessorTest.java: -------------------------------------------------------------------------------- 1 | package com.taogen.hotcrawler.commons.crawler.impl.news; 2 | 3 | import com.taogen.hotcrawler.commons.crawler.HotProcessorTest; 4 | 
import org.junit.Test; 5 | import org.slf4j.LoggerFactory; 6 | import org.springframework.beans.factory.annotation.Autowired; 7 | /* Passes the list returned by the autowired HuxiuHotProcessor.crawlHotList() to the inherited checkHotInfoList. */ 8 | public class HuxiuHotProcessorTest extends HotProcessorTest { 9 | 10 | @Autowired 11 | private HuxiuHotProcessor huxiuHotProcessor; 12 | public HuxiuHotProcessorTest(){ 13 | this.log = LoggerFactory.getLogger(getClass()); 14 | } 15 | 16 | @Test 17 | public void crawlHotList() { 18 | checkHotInfoList(huxiuHotProcessor.crawlHotList()); 19 | } 20 | } -------------------------------------------------------------------------------- /src/test/java/com/taogen/hotcrawler/commons/crawler/impl/news/IfanrHotProcessorTest.java: -------------------------------------------------------------------------------- 1 | package com.taogen.hotcrawler.commons.crawler.impl.news; 2 | 3 | import com.taogen.hotcrawler.commons.crawler.HotProcessorTest; 4 | import org.junit.Test; 5 | import org.slf4j.LoggerFactory; 6 | import org.springframework.beans.factory.annotation.Autowired; 7 | /* Passes the list returned by the autowired IfanrHotProcessor.crawlHotList() to the inherited checkHotInfoList. */ 8 | public class IfanrHotProcessorTest extends HotProcessorTest { 9 | 10 | @Autowired 11 | private IfanrHotProcessor ifanrHotProcessor; 12 | 13 | public IfanrHotProcessorTest(){ 14 | this.log = LoggerFactory.getLogger(getClass()); 15 | } 16 | 17 | @Test 18 | public void crawlHotList() { 19 | checkHotInfoList(ifanrHotProcessor.crawlHotList()); 20 | } 21 | } -------------------------------------------------------------------------------- /src/test/java/com/taogen/hotcrawler/commons/crawler/impl/news/NatureHotProcessorTest.java: -------------------------------------------------------------------------------- 1 | package com.taogen.hotcrawler.commons.crawler.impl.news; 2 | 3 | import com.taogen.hotcrawler.commons.crawler.HotProcessorTest; 4 | import org.junit.Test; 5 | import org.slf4j.LoggerFactory; 6 | import org.springframework.beans.factory.annotation.Autowired; 7 | 8 | import static org.junit.Assert.*; 9 | /* Crawler test for NatureHotProcessor, built on the shared HotProcessorTest base. */ 10 | public class NatureHotProcessorTest extends HotProcessorTest { 11 | 12 
| @Autowired 13 | private NatureHotProcessor natureHotProcessor; 14 | public NatureHotProcessorTest(){ 15 | this.log = LoggerFactory.getLogger(getClass()); 16 | } 17 | /* Passes the list returned by natureHotProcessor.crawlHotList() to checkHotInfoList. */ 18 | @Test 19 | public void crawlHotList() { 20 | checkHotInfoList(natureHotProcessor.crawlHotList()); 21 | } 22 | } -------------------------------------------------------------------------------- /src/test/java/com/taogen/hotcrawler/commons/crawler/impl/news/ReadhubHotProcessorTest.java: -------------------------------------------------------------------------------- 1 | package com.taogen.hotcrawler.commons.crawler.impl.news; 2 | 3 | import com.taogen.hotcrawler.commons.crawler.HotProcessorTest; 4 | import org.junit.Test; 5 | import org.slf4j.LoggerFactory; 6 | import org.springframework.beans.factory.annotation.Autowired; 7 | /* Passes the list returned by the autowired ReadhubHotProcessor.crawlHotList() to the inherited checkHotInfoList; the constructor assigns the inherited log field a logger for this class. */ 8 | public class ReadhubHotProcessorTest extends HotProcessorTest { 9 | 10 | @Autowired 11 | private ReadhubHotProcessor readhubHotProcessor; 12 | public ReadhubHotProcessorTest(){ 13 | this.log = LoggerFactory.getLogger(getClass()); 14 | } 15 | 16 | @Test 17 | public void crawlHotList() { 18 | checkHotInfoList(readhubHotProcessor.crawlHotList()); 19 | } 20 | } -------------------------------------------------------------------------------- /src/test/java/com/taogen/hotcrawler/commons/crawler/impl/news/SolidotHotProcessorTest.java: -------------------------------------------------------------------------------- 1 | package com.taogen.hotcrawler.commons.crawler.impl.news; 2 | 3 | import com.taogen.hotcrawler.commons.crawler.HotProcessorTest; 4 | import org.junit.Test; 5 | import org.slf4j.LoggerFactory; 6 | import org.springframework.beans.factory.annotation.Autowired; 7 | 8 | import static org.junit.Assert.*; 9 | /* Crawler test for SolidotHotProcessor, built on the shared HotProcessorTest base. */ 10 | public class SolidotHotProcessorTest extends HotProcessorTest { 11 | @Autowired 12 | private SolidotHotProcessor solidotHotProcessor; 13 | 14 | public SolidotHotProcessorTest(){ 15 | this.log = LoggerFactory.getLogger(getClass()); 16 | } 17 | 18 | @Test 19 | 
public void crawlHotList() { 20 | checkHotInfoList(solidotHotProcessor.crawlHotList()); 21 | } 22 | } -------------------------------------------------------------------------------- /src/test/java/com/taogen/hotcrawler/commons/crawler/impl/news/TechmemeHotProcessorTest.java: -------------------------------------------------------------------------------- 1 | package com.taogen.hotcrawler.commons.crawler.impl.news; 2 | 3 | import com.taogen.hotcrawler.commons.crawler.HotProcessorTest; 4 | import com.taogen.hotcrawler.commons.crawler.impl.abroad.TechmemeHotProcessor; 5 | import org.junit.Test; 6 | import org.slf4j.LoggerFactory; 7 | import org.springframework.beans.factory.annotation.Autowired; 8 | 9 | public class TechmemeHotProcessorTest extends HotProcessorTest { 10 | 11 | @Autowired 12 | private TechmemeHotProcessor techmemeHotProcessor; 13 | 14 | public TechmemeHotProcessorTest(){ 15 | this.log = LoggerFactory.getLogger(getClass()); 16 | } 17 | 18 | @Test 19 | public void crawlHotList() { 20 | checkHotInfoList(techmemeHotProcessor.crawlHotList()); 21 | } 22 | } -------------------------------------------------------------------------------- /src/test/java/com/taogen/hotcrawler/commons/crawler/impl/slack/DoubanHotProcessorTest.java: -------------------------------------------------------------------------------- 1 | package com.taogen.hotcrawler.commons.crawler.impl.slack; 2 | 3 | import com.taogen.hotcrawler.commons.crawler.HotProcessorTest; 4 | import org.junit.Test; 5 | import org.slf4j.LoggerFactory; 6 | import org.springframework.beans.factory.annotation.Autowired; 7 | 8 | public class DoubanHotProcessorTest extends HotProcessorTest { 9 | 10 | @Autowired 11 | private DoubanHotProcessor doubanHotProcessor; 12 | 13 | public DoubanHotProcessorTest(){ 14 | this.log = LoggerFactory.getLogger(getClass()); 15 | } 16 | 17 | @Test 18 | public void crawlHotList() { 19 | checkHotInfoList(doubanHotProcessor.crawlHotList()); 20 | } 21 | } 
--------------------------------------------------------------------------------
/src/test/java/com/taogen/hotcrawler/commons/crawler/impl/slack/DoubanTopicsHotProcessorTest.java:
--------------------------------------------------------------------------------
package com.taogen.hotcrawler.commons.crawler.impl.slack;

import com.taogen.hotcrawler.commons.crawler.HotProcessorTest;
import org.junit.Test;
import org.slf4j.LoggerFactory;
import org.springframework.beans.factory.annotation.Autowired;

/** Test for {@link DoubanTopicsHotProcessor}: crawls its hot list and hands the result to the inherited check. */
public class DoubanTopicsHotProcessorTest extends HotProcessorTest {

    /** Crawler under test, injected by Spring. */
    @Autowired
    private DoubanTopicsHotProcessor doubanTopicsHotProcessor;

    /** Points the inherited {@code log} field at a logger named after this class. */
    public DoubanTopicsHotProcessorTest() {
        log = LoggerFactory.getLogger(this.getClass());
    }

    /** Crawls once and passes the result to the inherited {@code checkHotInfoList}. */
    @Test
    public void crawlHotList() {
        checkHotInfoList(this.doubanTopicsHotProcessor.crawlHotList());
    }
}
--------------------------------------------------------------------------------
/src/test/java/com/taogen/hotcrawler/commons/crawler/impl/slack/HupuHotProcessorTest.java:
--------------------------------------------------------------------------------
package com.taogen.hotcrawler.commons.crawler.impl.slack;

import com.taogen.hotcrawler.commons.crawler.HotProcessorTest;
import org.junit.Test;
import org.slf4j.LoggerFactory;
import org.springframework.beans.factory.annotation.Autowired;

/** Test for {@link HupuHotProcessor}: crawls its hot list and hands the result to the inherited check. */
public class HupuHotProcessorTest extends HotProcessorTest {

    /** Crawler under test, injected by Spring. */
    @Autowired
    private HupuHotProcessor hupuHotProcessor;

    /** Points the inherited {@code log} field at a logger named after this class. */
    public HupuHotProcessorTest() {
        log = LoggerFactory.getLogger(this.getClass());
    }

    /** Crawls once and passes the result to the inherited {@code checkHotInfoList}. */
    @Test
    public void crawlHotList() {
        checkHotInfoList(this.hupuHotProcessor.crawlHotList());
    }
}
--------------------------------------------------------------------------------
/src/test/java/com/taogen/hotcrawler/commons/crawler/impl/slack/JiandanHotProcessorTest.java:
--------------------------------------------------------------------------------
package com.taogen.hotcrawler.commons.crawler.impl.slack;

import com.taogen.hotcrawler.commons.crawler.HotProcessorTest;
import org.junit.Test;
import org.slf4j.LoggerFactory;
import org.springframework.beans.factory.annotation.Autowired;

/** Test for {@link JiandanHotProcessor}: crawls its hot list and hands the result to the inherited check. */
public class JiandanHotProcessorTest extends HotProcessorTest {

    /** Crawler under test, injected by Spring. */
    @Autowired
    private JiandanHotProcessor jiandanHotProcessor;

    /** Points the inherited {@code log} field at a logger named after this class. */
    public JiandanHotProcessorTest() {
        log = LoggerFactory.getLogger(this.getClass());
    }

    /** Crawls once and passes the result to the inherited {@code checkHotInfoList}. */
    @Test
    public void crawlHotList() {
        checkHotInfoList(this.jiandanHotProcessor.crawlHotList());
    }
}
--------------------------------------------------------------------------------
/src/test/java/com/taogen/hotcrawler/commons/crawler/impl/slack/SspaiHotProcessorTest.java:
--------------------------------------------------------------------------------
package com.taogen.hotcrawler.commons.crawler.impl.slack;

import com.taogen.hotcrawler.commons.crawler.HotProcessorTest;
import com.taogen.hotcrawler.commons.entity.Info;
import org.junit.Test;
import org.slf4j.LoggerFactory;
import org.springframework.beans.factory.annotation.Autowired;

import java.util.List;

/**
 * Test for {@link SspaiHotProcessor}: crawls its hot list, logs it at debug level and hands the
 * result to the inherited check.
 */
public class SspaiHotProcessorTest extends HotProcessorTest {

    /** Crawler under test, injected by Spring. */
    @Autowired
    private SspaiHotProcessor sspaiHotProcessor;

    /** Points the inherited {@code log} field at a logger named after this class. */
    public SspaiHotProcessorTest() {
        this.log = LoggerFactory.getLogger(getClass());
    }

    /** Crawls once, logs the crawled items and passes them to the inherited {@code checkHotInfoList}. */
    @Test
    public void crawlHotList() {
        // Typed list instead of the raw List; presumably crawlHotList() yields Info items, as the
        // file's Info import suggests - verify against the processor's signature.
        List<Info> infoList = sspaiHotProcessor.crawlHotList();
        // Parameterized logging: no eager toString(), so a null result is passed on to
        // checkHotInfoList instead of throwing a NullPointerException on this line.
        log.debug("{}", infoList);
        checkHotInfoList(infoList);
    }
}
--------------------------------------------------------------------------------
/src/test/java/com/taogen/hotcrawler/commons/crawler/impl/slack/TianyaHotProcessorTest.java:
--------------------------------------------------------------------------------
package com.taogen.hotcrawler.commons.crawler.impl.slack;

import com.taogen.hotcrawler.commons.crawler.HotProcessorTest;
import org.junit.Test;
import org.slf4j.LoggerFactory;
import org.springframework.beans.factory.annotation.Autowired;

/** Test for {@link TianyaHotProcessor}: crawls its hot list and hands the result to the inherited check. */
public class TianyaHotProcessorTest extends HotProcessorTest {

    /** Crawler under test, injected by Spring. */
    @Autowired
    private TianyaHotProcessor tianyaHotProcessor;

    /** Points the inherited {@code log} field at a logger named after this class. */
    public TianyaHotProcessorTest() {
        log = LoggerFactory.getLogger(this.getClass());
    }

    /** Crawls once and passes the result to the inherited {@code checkHotInfoList}. */
    @Test
    public void crawlHotList() {
        checkHotInfoList(this.tianyaHotProcessor.crawlHotList());
    }
}
--------------------------------------------------------------------------------
/src/test/java/com/taogen/hotcrawler/commons/crawler/impl/slack/V2exHotProcessorTest.java:
--------------------------------------------------------------------------------
package com.taogen.hotcrawler.commons.crawler.impl.slack;

import com.taogen.hotcrawler.commons.crawler.HotProcessorTest;
import org.junit.Test;
import org.slf4j.LoggerFactory;
import org.springframework.beans.factory.annotation.Autowired;

import static org.junit.Assert.*;

/** Test for {@link V2exHotProcessor}: crawls its hot list and hands the result to the inherited check. */
public class V2exHotProcessorTest extends HotProcessorTest {

    /** Crawler under test, injected by Spring. */
    @Autowired
    private V2exHotProcessor v2exHotProcessor;

    /** Points the inherited {@code log} field at a logger named after this class. */
    public V2exHotProcessorTest() {
        log = LoggerFactory.getLogger(this.getClass());
    }

    /** Crawls once and passes the result to the inherited {@code checkHotInfoList}. */
    @Test
    public void crawlHotList() {
        checkHotInfoList(this.v2exHotProcessor.crawlHotList());
    }
}
--------------------------------------------------------------------------------
/src/test/java/com/taogen/hotcrawler/commons/crawler/impl/slack/WeiboHotProcessorTest.java:
--------------------------------------------------------------------------------
package com.taogen.hotcrawler.commons.crawler.impl.slack;

import com.taogen.hotcrawler.commons.crawler.HotProcessorTest;
import org.junit.Test;
import org.slf4j.LoggerFactory;
import org.springframework.beans.factory.annotation.Autowired;

/** Test for {@link WeiboHotProcessor}: crawls its hot list and hands the result to the inherited check. */
public class WeiboHotProcessorTest extends HotProcessorTest {

    /** Crawler under test, injected by Spring. */
    @Autowired
    private WeiboHotProcessor weiboHotProcessor;

    /** Points the inherited {@code log} field at a logger named after this class. */
    public WeiboHotProcessorTest() {
        log = LoggerFactory.getLogger(this.getClass());
    }

    /** Crawls once and passes the result to the inherited {@code checkHotInfoList}. */
    @Test
    public void crawlHotList() {
        checkHotInfoList(this.weiboHotProcessor.crawlHotList());
    }
}
--------------------------------------------------------------------------------
/src/test/java/com/taogen/hotcrawler/commons/crawler/impl/slack/ZhihuHotProcessorTest.java:
--------------------------------------------------------------------------------
package com.taogen.hotcrawler.commons.crawler.impl.slack;

import com.taogen.hotcrawler.commons.crawler.HotProcessorTest;
import org.junit.Test;
import org.slf4j.LoggerFactory;
import org.springframework.beans.factory.annotation.Autowired;

/** Test for {@link ZhihuHotProcessor}: crawls its hot list and hands the result to the inherited check. */
public class ZhihuHotProcessorTest extends HotProcessorTest {

    /** Crawler under test, injected by Spring. */
    @Autowired
    private ZhihuHotProcessor zhihuHotProcessor;

    /** Points the inherited {@code log} field at a logger named after this class. */
    public ZhihuHotProcessorTest() {
        log = LoggerFactory.getLogger(this.getClass());
    }

    /** Crawls once and passes the result to the inherited {@code checkHotInfoList}. */
    @Test
    public void crawlHotList() {
        checkHotInfoList(this.zhihuHotProcessor.crawlHotList());
    }
}
--------------------------------------------------------------------------------
/src/test/java/com/taogen/hotcrawler/commons/crawler/impl/stream/CloudmusicHotProcessorTest.java:
--------------------------------------------------------------------------------
package com.taogen.hotcrawler.commons.crawler.impl.stream;

import com.taogen.hotcrawler.commons.crawler.HotProcessorTest;
import org.junit.Test;
import org.slf4j.LoggerFactory;
import org.springframework.beans.factory.annotation.Autowired;

import static org.junit.Assert.*;

/** Test for {@link CloudmusicHotProcessor}: crawls its hot list and hands the result to the inherited check. */
public class CloudmusicHotProcessorTest extends HotProcessorTest {

    /** Crawler under test, injected by Spring. */
    @Autowired
    private CloudmusicHotProcessor cloudmusicHotProcessor;

    /** Points the inherited {@code log} field at a logger named after this class. */
    public CloudmusicHotProcessorTest() {
        log = LoggerFactory.getLogger(this.getClass());
    }

    /** Crawls once and passes the result to the inherited {@code checkHotInfoList}. */
    @Test
    public void crawlHotList() {
        checkHotInfoList(this.cloudmusicHotProcessor.crawlHotList());
    }
}
--------------------------------------------------------------------------------
/src/test/java/com/taogen/hotcrawler/commons/crawler/impl/technique/DeveloperHotProcessorTest.java:
--------------------------------------------------------------------------------
package com.taogen.hotcrawler.commons.crawler.impl.technique;

import com.taogen.hotcrawler.commons.crawler.HotProcessorTest;
import org.junit.Test;
import org.slf4j.LoggerFactory;
import org.springframework.beans.factory.annotation.Autowired;

/** Test for {@link DeveloperHotProcessor}: crawls its hot list and hands the result to the inherited check. */
public class DeveloperHotProcessorTest extends HotProcessorTest {

    /** Crawler under test, injected by Spring. */
    @Autowired
    private DeveloperHotProcessor developerHotProcessor;

    /** Points the inherited {@code log} field at a logger named after this class. */
    public DeveloperHotProcessorTest() {
        log = LoggerFactory.getLogger(this.getClass());
    }

    /** Crawls once and passes the result to the inherited {@code checkHotInfoList}. */
    @Test
    public void crawlHotList() {
        checkHotInfoList(this.developerHotProcessor.crawlHotList());
    }
}
--------------------------------------------------------------------------------
/src/test/java/com/taogen/hotcrawler/commons/crawler/impl/technique/DzoneHotProcessorTest.java:
--------------------------------------------------------------------------------
package com.taogen.hotcrawler.commons.crawler.impl.technique;

import com.taogen.hotcrawler.commons.crawler.HotProcessorTest;
import org.junit.Test;
import org.slf4j.LoggerFactory;
import org.springframework.beans.factory.annotation.Autowired;

/** Test for {@link DzoneHotProcessor}: crawls its hot list and hands the result to the inherited check. */
public class DzoneHotProcessorTest extends HotProcessorTest {

    /** Crawler under test, injected by Spring. */
    @Autowired
    private DzoneHotProcessor dzoneHotProcessor;

    /** Points the inherited {@code log} field at a logger named after this class. */
    public DzoneHotProcessorTest() {
        log = LoggerFactory.getLogger(this.getClass());
    }

    /** Crawls once and passes the result to the inherited {@code checkHotInfoList}. */
    @Test
    public void crawlHotList() {
        checkHotInfoList(this.dzoneHotProcessor.crawlHotList());
    }
}
--------------------------------------------------------------------------------
/src/test/java/com/taogen/hotcrawler/commons/crawler/impl/technique/GithubHotProcessorTest.java:
--------------------------------------------------------------------------------
package com.taogen.hotcrawler.commons.crawler.impl.technique;

import com.taogen.hotcrawler.commons.crawler.HotProcessorTest;
import org.junit.Test;
import org.slf4j.LoggerFactory;
import org.springframework.beans.factory.annotation.Autowired;

import static org.junit.Assert.*;

/** Test for {@link GithubHotProcessor}: crawls its hot list and hands the result to the inherited check. */
public class GithubHotProcessorTest extends HotProcessorTest {

    /** Crawler under test, injected by Spring. */
    @Autowired
    private GithubHotProcessor githubHotProcessor;

    /** Points the inherited {@code log} field at a logger named after this class. */
    public GithubHotProcessorTest() {
        log = LoggerFactory.getLogger(this.getClass());
    }

    /** Crawls once and passes the result to the inherited {@code checkHotInfoList}. */
    @Test
    public void crawlHotList() {
        checkHotInfoList(this.githubHotProcessor.crawlHotList());
    }
}
--------------------------------------------------------------------------------
/src/test/java/com/taogen/hotcrawler/commons/crawler/impl/technique/InfoqHotProcessorTest.java:
--------------------------------------------------------------------------------
package com.taogen.hotcrawler.commons.crawler.impl.technique;

import com.taogen.hotcrawler.commons.crawler.HotProcessorTest;
import org.junit.Test;
import org.slf4j.LoggerFactory;
import org.springframework.beans.factory.annotation.Autowired;

import static org.junit.Assert.*;

/** Test for {@link InfoqHotProcessor}: crawls its hot list and hands the result to the inherited check. */
public class InfoqHotProcessorTest extends HotProcessorTest {

    /** Crawler under test, injected by Spring. */
    @Autowired
    private InfoqHotProcessor infoqHotProcessor;

    /** Points the inherited {@code log} field at a logger named after this class. */
    public InfoqHotProcessorTest() {
        log = LoggerFactory.getLogger(this.getClass());
    }

    /** Crawls once and passes the result to the inherited {@code checkHotInfoList}. */
    @Test
    public void crawlHotList() {
        checkHotInfoList(this.infoqHotProcessor.crawlHotList());
    }
}
--------------------------------------------------------------------------------
/src/test/java/com/taogen/hotcrawler/commons/crawler/impl/technique/InfoqcomHotProcessorTest.java:
--------------------------------------------------------------------------------
package com.taogen.hotcrawler.commons.crawler.impl.technique;

import com.taogen.hotcrawler.commons.crawler.HotProcessorTest;
import org.junit.Test;
import org.slf4j.LoggerFactory;
import org.springframework.beans.factory.annotation.Autowired;

/** Test for {@link InfoqcomHotProcessor}: crawls its hot list and hands the result to the inherited check. */
public class InfoqcomHotProcessorTest extends HotProcessorTest {

    /** Crawler under test, injected by Spring. */
    @Autowired
    private InfoqcomHotProcessor infoqcomHotProcessor;

    /** Points the inherited {@code log} field at a logger named after this class. */
    public InfoqcomHotProcessorTest() {
        log = LoggerFactory.getLogger(this.getClass());
    }

    /** Crawls once and passes the result to the inherited {@code checkHotInfoList}. */
    @Test
    public void crawlHotList() {
        checkHotInfoList(this.infoqcomHotProcessor.crawlHotList());
    }
}
--------------------------------------------------------------------------------
/src/test/java/com/taogen/hotcrawler/commons/crawler/impl/technique/JAXenterHotProcessorTest.java:
--------------------------------------------------------------------------------
package com.taogen.hotcrawler.commons.crawler.impl.technique;

import com.taogen.hotcrawler.commons.crawler.HotProcessorTest;
import org.junit.Test;
import org.slf4j.LoggerFactory;
import org.springframework.beans.factory.annotation.Autowired;

/** Test for {@link JAXenterHotProcessor}: crawls its hot list and hands the result to the inherited check. */
public class JAXenterHotProcessorTest extends HotProcessorTest {

    /** Crawler under test, injected by Spring. */
    @Autowired
    private JAXenterHotProcessor jaXenterHotProcessor;

    /** Points the inherited {@code log} field at a logger named after this class. */
    public JAXenterHotProcessorTest() {
        log = LoggerFactory.getLogger(this.getClass());
    }

    /** Crawls once and passes the result to the inherited {@code checkHotInfoList}. */
    @Test
    public void crawlHotList() {
        checkHotInfoList(this.jaXenterHotProcessor.crawlHotList());
    }
}
--------------------------------------------------------------------------------
/src/test/java/com/taogen/hotcrawler/commons/crawler/impl/technique/JavaWorldHotProcessorTest.java:
--------------------------------------------------------------------------------
package com.taogen.hotcrawler.commons.crawler.impl.technique;

import com.taogen.hotcrawler.commons.crawler.HotProcessorTest;
import org.junit.Test;
import org.slf4j.LoggerFactory;
import org.springframework.beans.factory.annotation.Autowired;

/** Test for {@link JavaWorldHotProcessor}: crawls its hot list and hands the result to the inherited check. */
public class JavaWorldHotProcessorTest extends HotProcessorTest {

    /** Crawler under test, injected by Spring. */
    @Autowired
    private JavaWorldHotProcessor javaWorldHotProcessor;

    /** Points the inherited {@code log} field at a logger named after this class. */
    public JavaWorldHotProcessorTest() {
        log = LoggerFactory.getLogger(this.getClass());
    }

    /** Crawls once and passes the result to the inherited {@code checkHotInfoList}. */
    @Test
    public void crawlHotList() {
        checkHotInfoList(this.javaWorldHotProcessor.crawlHotList());
    }
}
--------------------------------------------------------------------------------
/src/test/java/com/taogen/hotcrawler/commons/crawler/impl/technique/JuejinHotProcessorTest.java:
--------------------------------------------------------------------------------
package com.taogen.hotcrawler.commons.crawler.impl.technique;

import com.taogen.hotcrawler.commons.crawler.HotProcessorTest;
import org.junit.Test;
import org.slf4j.LoggerFactory;
import org.springframework.beans.factory.annotation.Autowired;

import static org.junit.Assert.*;

/** Test for {@link JuejinHotProcessor}: crawls its hot list and hands the result to the inherited check. */
public class JuejinHotProcessorTest extends HotProcessorTest {

    /** Crawler under test, injected by Spring. */
    @Autowired
    private JuejinHotProcessor juejinHotProcessor;

    /** Points the inherited {@code log} field at a logger named after this class. */
    public JuejinHotProcessorTest() {
        log = LoggerFactory.getLogger(this.getClass());
    }

    /** Crawls once and passes the result to the inherited {@code checkHotInfoList}. */
    @Test
    public void crawlHotList() {
        checkHotInfoList(this.juejinHotProcessor.crawlHotList());
    }
}
--------------------------------------------------------------------------------
/src/test/java/com/taogen/hotcrawler/commons/crawler/impl/technique/SegmentFaultHotProcessorTest.java:
--------------------------------------------------------------------------------
package com.taogen.hotcrawler.commons.crawler.impl.technique;

import com.taogen.hotcrawler.commons.crawler.HotProcessorTest;
import org.junit.Test;
import org.slf4j.LoggerFactory;
import org.springframework.beans.factory.annotation.Autowired;

import static org.junit.Assert.*;

/** Test for {@link SegmentFaultHotProcessor}: crawls its hot list and hands the result to the inherited check. */
public class SegmentFaultHotProcessorTest extends HotProcessorTest {

    /** Crawler under test, injected by Spring. */
    @Autowired
    private SegmentFaultHotProcessor segmentFaultHotProcessor;

    /** Points the inherited {@code log} field at a logger named after this class. */
    public SegmentFaultHotProcessorTest() {
        log = LoggerFactory.getLogger(this.getClass());
    }

    /** Crawls once and passes the result to the inherited {@code checkHotInfoList}. */
    @Test
    public void crawlHotList() {
        checkHotInfoList(this.segmentFaultHotProcessor.crawlHotList());
    }
}
--------------------------------------------------------------------------------
/src/test/java/com/taogen/hotcrawler/commons/repository/InfoRepositoryTest.java:
--------------------------------------------------------------------------------
package com.taogen.hotcrawler.commons.repository;

import com.taogen.hotcrawler.commons.entity.Info;
import org.junit.Assert;
import org.junit.Test;
import org.junit.runner.RunWith;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.boot.test.context.SpringBootTest;
import org.springframework.test.context.junit4.SpringRunner;

import java.util.Arrays;
import java.util.List;

/**
 * Spring Boot test for {@link InfoRepository}, run against whatever backing store the application
 * context configures.
 *
 * <p>Each test seeds the entries it reads, so no test depends on data left behind by another one
 * or on the order in which JUnit runs them.
 */
@RunWith(SpringRunner.class)
@SpringBootTest
public class InfoRepositoryTest
{
    /** Type id under which every test in this class stores and looks up its entries. */
    private static final String TYPE_ID = "1";

    private static final Logger log = LoggerFactory.getLogger(InfoRepositoryTest.class);

    /** Repository under test, injected by Spring. */
    @Autowired
    private InfoRepository infoRepository;

    /** Saves two entries one by one and expects both to be retrievable by their info id. */
    @Test
    public void saveTest()
    {
        infoRepository.save(new Info("1", "haha", "http://bvaidu"), TYPE_ID);
        infoRepository.save(new Info("2", "haha2", "http://bvaidu22"), TYPE_ID);
        Assert.assertNotNull(infoRepository.findByInfoId(TYPE_ID, "1"));
        Assert.assertNotNull(infoRepository.findByInfoId(TYPE_ID, "2"));
    }

    /** Clears the type, bulk-saves two entries and expects both to be retrievable by their info id. */
    @Test
    public void saveAllTest()
    {
        List<Info> infoList = Arrays.asList(
                new Info("11", "11", "http://bvaidu"),
                new Info("22", "22", "http://bvaidu"));
        infoRepository.removeByTypeId(TYPE_ID);
        infoRepository.saveAll(infoList, TYPE_ID);
        Assert.assertNotNull(infoRepository.findByInfoId(TYPE_ID, "11"));
        Assert.assertNotNull(infoRepository.findByInfoId(TYPE_ID, "22"));
    }

    /** Logs the entry count for the type and checks that it is not negative. */
    @Test
    public void countByTypeIdTest()
    {
        long count = infoRepository.countByTypeId(TYPE_ID);
        log.info("count: {}", count);
        Assert.assertTrue("count must not be negative", count >= 0);
    }

    /** Saves one entry, then expects the lookup by type id to return a result. */
    @Test
    public void findByTypeIdTest()
    {
        infoRepository.save(new Info("1", "haha", "http://bvaidu"), TYPE_ID);
        // Held as Object because the declared return type of findByTypeId is not visible from this file.
        Object found = infoRepository.findByTypeId(TYPE_ID);
        log.info("list: {}", found);
        Assert.assertNotNull("findByTypeId returned null right after a save", found);
    }

    /** Saves one entry, then expects the lookup by type id and info id to return it. */
    @Test
    public void findByInfoIdTest()
    {
        infoRepository.save(new Info("1", "haha", "http://bvaidu"), TYPE_ID);
        Object found = infoRepository.findByInfoId(TYPE_ID, "1");
        log.info("item: {}", found);
        Assert.assertNotNull("findByInfoId returned null right after a save", found);
    }

    /**
     * Saves one entry, removes everything stored under the type and expects the count to drop to zero.
     *
     * <p>The misspelled method name is kept so existing test filters and reports keep matching.
     */
    @Test
    public void remveByTypeId()
    {
        infoRepository.save(new Info("1", "haha", "http://bvaidu"), TYPE_ID);
        log.info("before remove: {}", infoRepository.findByTypeId(TYPE_ID));
        infoRepository.removeByTypeId(TYPE_ID);
        log.info("after remove: {}", infoRepository.findByTypeId(TYPE_ID));
        // Read into a primitive first so assertEquals(long, long) is chosen unambiguously.
        long remaining = infoRepository.countByTypeId(TYPE_ID);
        Assert.assertEquals("entries left after removeByTypeId", 0L, remaining);
    }
}

--------------------------------------------------------------------------------