├── .gitattributes
├── README.md
├── data-analysis
├── data-analysis.iml
├── pom.xml
├── src
│ └── main
│ │ ├── java
│ │ └── com
│ │ │ └── crow
│ │ │ ├── domain
│ │ │ ├── AuxiliaryModels
│ │ │ │ └── NameValue.java
│ │ │ ├── Comment.java
│ │ │ ├── CommentMapper.java
│ │ │ ├── CommentWord.java
│ │ │ ├── CommentWordMapper.java
│ │ │ ├── Post.java
│ │ │ ├── PostMapper.java
│ │ │ ├── TitleWord.java
│ │ │ ├── TitleWordMapper.java
│ │ │ ├── User.java
│ │ │ └── UserMapper.java
│ │ │ ├── service
│ │ │ ├── CommentService.java
│ │ │ ├── CommentWordService.java
│ │ │ ├── PostService.java
│ │ │ ├── TitleWordService.java
│ │ │ └── UserService.java
│ │ │ └── web
│ │ │ └── EchartsController.java
│ │ ├── resources
│ │ ├── com
│ │ │ └── crow
│ │ │ │ └── domain
│ │ │ │ ├── CommentWordMapper.xml
│ │ │ │ ├── TitleWordMapper.xml
│ │ │ │ └── UserMapper.xml
│ │ ├── db.properties
│ │ ├── log4j.properties
│ │ ├── mybatis
│ │ │ └── sqlMapConfig.xml
│ │ └── spring
│ │ │ ├── applicationContext-dao.xml
│ │ │ ├── applicationContext-service.xml
│ │ │ ├── applicationContext-transaction.xml
│ │ │ └── springmvc.xml
│ │ └── webapp
│ │ ├── WEB-INF
│ │ ├── jsp
│ │ │ ├── CommentWords.jsp
│ │ │ ├── Gender.jsp
│ │ │ ├── ProvinceAddress.jsp
│ │ │ ├── TitleWords.jsp
│ │ │ ├── Titles.jsp
│ │ │ └── Views.jsp
│ │ └── web.xml
│ │ ├── index.jsp
│ │ └── js
│ │ ├── echarts-wordcloud.js
│ │ ├── echarts-wordcloud.min.js
│ │ ├── echarts.common.min.js
│ │ ├── jquery-3.2.1.min.js
│ │ └── theme
│ │ ├── dark.js
│ │ ├── infographic.js
│ │ ├── macarons.js
│ │ ├── roma.js
│ │ └── vintage.js
└── target
│ ├── classes
│ ├── com
│ │ └── crow
│ │ │ ├── domain
│ │ │ ├── AuxiliaryModels
│ │ │ │ └── NameValue.class
│ │ │ ├── Comment.class
│ │ │ ├── CommentMapper.class
│ │ │ ├── CommentWord.class
│ │ │ ├── CommentWordMapper.class
│ │ │ ├── CommentWordMapper.xml
│ │ │ ├── Post.class
│ │ │ ├── PostMapper.class
│ │ │ ├── TitleWord.class
│ │ │ ├── TitleWordMapper.class
│ │ │ ├── TitleWordMapper.xml
│ │ │ ├── User.class
│ │ │ ├── UserMapper.class
│ │ │ └── UserMapper.xml
│ │ │ ├── service
│ │ │ ├── CommentService.class
│ │ │ ├── CommentWordService.class
│ │ │ ├── PostService.class
│ │ │ ├── TitleWordService.class
│ │ │ └── UserService.class
│ │ │ └── web
│ │ │ └── EchartsController.class
│ ├── db.properties
│ ├── log4j.properties
│ ├── mybatis
│ │ └── sqlMapConfig.xml
│ └── spring
│ │ ├── applicationContext-dao.xml
│ │ ├── applicationContext-service.xml
│ │ ├── applicationContext-transaction.xml
│ │ └── springmvc.xml
│ └── data-analysis
│ ├── META-INF
│ └── MANIFEST.MF
│ ├── WEB-INF
│ ├── classes
│ │ ├── com
│ │ │ └── crow
│ │ │ │ ├── domain
│ │ │ │ ├── AuxiliaryModels
│ │ │ │ │ └── NameValue.class
│ │ │ │ ├── Comment.class
│ │ │ │ ├── CommentMapper.class
│ │ │ │ ├── CommentWord.class
│ │ │ │ ├── CommentWordMapper.class
│ │ │ │ ├── CommentWordMapper.xml
│ │ │ │ ├── Post.class
│ │ │ │ ├── PostMapper.class
│ │ │ │ ├── TitleWord.class
│ │ │ │ ├── TitleWordMapper.class
│ │ │ │ ├── TitleWordMapper.xml
│ │ │ │ ├── User.class
│ │ │ │ ├── UserMapper.class
│ │ │ │ └── UserMapper.xml
│ │ │ │ ├── service
│ │ │ │ ├── CommentService.class
│ │ │ │ ├── CommentWordService.class
│ │ │ │ ├── PostService.class
│ │ │ │ ├── TitleWordService.class
│ │ │ │ └── UserService.class
│ │ │ │ └── web
│ │ │ │ └── EchartsController.class
│ │ ├── db.properties
│ │ ├── log4j.properties
│ │ ├── mybatis
│ │ │ └── sqlMapConfig.xml
│ │ └── spring
│ │ │ ├── applicationContext-dao.xml
│ │ │ ├── applicationContext-service.xml
│ │ │ ├── applicationContext-transaction.xml
│ │ │ └── springmvc.xml
│ ├── jsp
│ │ ├── CommentWords.jsp
│ │ ├── Gender.jsp
│ │ ├── ProvinceAddress.jsp
│ │ ├── TitleWords.jsp
│ │ ├── Titles.jsp
│ │ └── Views.jsp
│ ├── lib
│ │ ├── ansj_seg-5.1.1.jar
│ │ ├── aopalliance-1.0.jar
│ │ ├── aspectjweaver-1.8.7.jar
│ │ ├── classmate-1.1.0.jar
│ │ ├── commons-dbcp-1.4.jar
│ │ ├── commons-fileupload-1.3.1.jar
│ │ ├── commons-io-2.2.jar
│ │ ├── commons-logging-1.2.jar
│ │ ├── commons-pool-1.5.4.jar
│ │ ├── hibernate-validator-5.2.4.Final.jar
│ │ ├── jackson-annotations-2.4.0.jar
│ │ ├── jackson-core-2.4.3.jar
│ │ ├── jackson-databind-2.4.3.jar
│ │ ├── javax.servlet-api-3.1.0.jar
│ │ ├── jboss-logging-3.2.1.Final.jar
│ │ ├── jsp-api-2.2.jar
│ │ ├── jstl-1.2.jar
│ │ ├── log4j-1.2.17.jar
│ │ ├── mybatis-3.3.1.jar
│ │ ├── mybatis-spring-1.2.4.jar
│ │ ├── mysql-connector-java-5.1.38.jar
│ │ ├── nlp-lang-1.7.2.jar
│ │ ├── slf4j-api-1.7.18.jar
│ │ ├── spring-aop-4.2.4.RELEASE.jar
│ │ ├── spring-aspects-4.2.4.RELEASE.jar
│ │ ├── spring-beans-4.2.4.RELEASE.jar
│ │ ├── spring-context-4.2.4.RELEASE.jar
│ │ ├── spring-core-4.2.4.RELEASE.jar
│ │ ├── spring-expression-4.2.4.RELEASE.jar
│ │ ├── spring-jdbc-4.2.4.RELEASE.jar
│ │ ├── spring-orm-4.2.4.RELEASE.jar
│ │ ├── spring-test-4.2.4.RELEASE.jar
│ │ ├── spring-tx-4.2.4.RELEASE.jar
│ │ ├── spring-web-4.2.4.RELEASE.jar
│ │ ├── spring-webmvc-4.2.4.RELEASE.jar
│ │ ├── standard-1.1.2.jar
│ │ └── validation-api-1.1.0.Final.jar
│ └── web.xml
│ ├── index.jsp
│ └── js
│ ├── echarts-wordcloud.js
│ ├── echarts-wordcloud.min.js
│ ├── echarts.common.min.js
│ ├── jquery-3.2.1.min.js
│ └── theme
│ ├── dark.js
│ ├── infographic.js
│ ├── macarons.js
│ ├── roma.js
│ └── vintage.js
├── hupu-spider
├── .gitignore
├── pom.xml
└── src
│ ├── main
│ ├── java
│ │ └── com
│ │ │ └── crow
│ │ │ ├── HupuSpiderApplication.java
│ │ │ ├── domain
│ │ │ ├── Comment.java
│ │ │ ├── CommentList.java
│ │ │ ├── CommentMapper.java
│ │ │ ├── HupuBxjPostInfo.java
│ │ │ ├── Post.java
│ │ │ ├── PostInfoMapper.java
│ │ │ ├── PostMapper.java
│ │ │ ├── ProxyIp.java
│ │ │ ├── ProxyIpMapper.java
│ │ │ ├── TitleWord.java
│ │ │ ├── TitleWordMapper.java
│ │ │ ├── User.java
│ │ │ └── UserMapper.java
│ │ │ ├── utils
│ │ │ ├── IPCheckUtil.java
│ │ │ ├── ProxyGeneratedUtil.java
│ │ │ ├── URLGeneratedUtil.java
│ │ │ └── UserAgentUtil.java
│ │ │ ├── web
│ │ │ └── StartUpController.java
│ │ │ └── webmagic
│ │ │ ├── downloader
│ │ │ └── CrowProxyProvider.java
│ │ │ ├── pageprocessor
│ │ │ └── HupuBxjPageProcessor.java
│ │ │ └── pipeline
│ │ │ └── HupuSpiderPipeline.java
│ └── resources
│ │ ├── application.yml
│ │ └── db.sql
│ └── test
│ └── java
│ └── com
│ └── crow
│ └── HupuspiderApplicationTests.java
└── ip-spider
├── .gitignore
├── pom.xml
└── src
├── main
├── java
│ └── com
│ │ └── crow
│ │ ├── IpspiderApplication.java
│ │ ├── domain
│ │ ├── ProxyIp.java
│ │ └── ProxyIpMapper.java
│ │ ├── utils
│ │ └── UserAgentUtil.java
│ │ ├── web
│ │ └── StartUpController.java
│ │ └── webmagic
│ │ ├── pageprocessor
│ │ ├── ProxyPoolProcessor1.java
│ │ └── ProxyPoolProcessor2.java
│ │ └── pipeline
│ │ └── IPSpiderPipeline.java
└── resources
│ └── application.yml
└── test
└── java
└── com
└── crow
├── DataprocessingApplicationTests.java
├── IpspiderApplicationTests.java
└── MagictoeApplicationTests.java
/.gitattributes:
--------------------------------------------------------------------------------
1 | *.js linguist-language=Java
2 | *.html linguist-language=Java
3 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # MagicToe
2 | MagicToe是一个基于Java爬虫框架[WebMagic](https://github.com/code4craft/webmagic)的Java爬虫实战案例,MagicToe提供了从获取数据到数据持久化、可视化分析以及构建简单的代理池等一系列完整流程,旨在为初涉Java爬虫的程序员提供一个参考教程和一整套完整的解决方案。
3 |
4 | ## 仓库目录
5 | + [hupu-spider](https://github.com/CrowHawk/MagicToe/tree/master/hupu-spider):爬虫功能实现模块,使用**WebMagic + SpringBoot + MyBatis**基础架构,NLP工具包是[Ansj中文分词](https://github.com/NLPchina/ansj_seg),定制抽取逻辑,将爬取的数据持久化到**MySQL**数据库中,本仓库中的代码示例爬取的是虎扑步行街。
6 | + [data-analysis](https://github.com/CrowHawk/MagicToe/tree/master/data-analysis):数据分析及可视化模块,使用**Spring + SpringMVC + MyBatis**的基础架构,数据可视化采用的前端技术是 **jsp +** [Echarts](http://www.echartsjs.com/)。
7 | + [ip-spider](https://github.com/CrowHawk/MagicToe/tree/master/ip-spider)(可选):爬取代理网站模块,技术选型同hupu-spider,将代理网站上的免费代理地址爬取到本地数据库中,实现一个简单的IP池,以供hupu-spider作为代理使用。
8 |
9 | ## QuickStart
10 |
11 | **爬虫模块环境准备:**
12 | + JDK 1.8+
13 | + maven 4.0.0+
14 | + webmagic 0.7.3+
15 | + ansj_seg 5.1.1+
16 | + springboot 1.5.7+
17 | + mybatis 1.3.1+
18 | + mysql 5.1.21+
19 |
20 | **运行爬虫:**
21 | 以爬取虎扑步行街的帖子、用户和评论为例。
22 | 1. 初始化数据库
23 | 在本地MySQL中创建自己的schema,执行初始化数据库的脚本 [`hupu-spider/src/main/resources/db.sql`](https://github.com/CrowHawk/MagicToe/blob/master/hupu-spider/src/main/resources/db.sql) ,并根据自己的数据库信息修改配置文件 [`hupu-spider/src/main/resources/application.yml`](https://github.com/CrowHawk/MagicToe/blob/master/hupu-spider/src/main/resources/application.yml) 中的数据源信息。
24 | 2. 启动爬虫
25 | hupuspider通过URL请求的方式运行,在浏览器中键入 **localhost:8080/**(默认端口为8080,如果遇到端口冲突,可以在配置文件 [`hupu-spider/src/main/resources/application.yml`](https://github.com/CrowHawk/MagicToe/blob/master/hupu-spider/src/main/resources/application.yml) 中修改端口),爬虫即可开始运行了。
26 | 3. 运行数据可视化模块
27 | 将数据爬取到数据库中后,直接在Tomcat中运行[data-analysis](https://github.com/CrowHawk/MagicToe/tree/master/data-analysis)模块即可,通过在浏览器中输入不同的URL可以得到不同的图表,具体请查看 [`data-analysis/src/main/java/com/crow/web/EchartsController.java`](https://github.com/CrowHawk/MagicToe/blob/master/data-analysis/src/main/java/com/crow/web/EchartsController.java) 。
28 |
29 | ## 效果展示
30 | 以虎扑用户的地域分布为例:
31 |
32 |
33 |
34 | 更多详细的分析请参考我的博客[《数据不说谎:用网络爬虫探秘虎扑步行街》](https://crowhawk.github.io/2017/10/25/hupuspider/)。
35 |
36 | ## TODO
37 | * [ ] 使用Redis分布式队列实现分布式爬取。
38 | * [ ] 使用Quartz实现定时更新数据。
39 |
40 | ## 联系作者
41 | + Personal Website:[Crow Home](https://crowhawk.github.io/)
42 | + 知乎:[Martin Crow](https://www.zhihu.com/people/martin-crow/activities)
43 |
44 |
--------------------------------------------------------------------------------
/data-analysis/data-analysis.iml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 |
15 |
16 |
17 |
18 |
19 |
20 |
21 |
22 |
23 |
24 |
25 |
26 |
27 |
28 |
29 |
30 |
31 |
32 |
33 |
34 |
35 |
36 |
37 |
38 |
39 |
40 |
41 |
42 |
43 |
44 |
45 |
46 |
47 |
48 |
49 |
50 |
51 |
52 |
53 |
54 |
55 |
56 |
57 |
58 |
59 |
60 |
61 |
62 |
63 |
--------------------------------------------------------------------------------
/data-analysis/pom.xml:
--------------------------------------------------------------------------------
1 |
3 | 4.0.0
4 | com.crow
5 | data-analysis
6 | war
7 | 1.0-SNAPSHOT
8 | data-analysis Maven Webapp
9 | http://maven.apache.org
10 |
11 | UTF-8
12 |
13 | 4.2.4.RELEASE
14 | 1.8
15 | 1.8
16 |
17 |
18 |
19 |
20 |
21 |
22 |
23 | javax.servlet
24 | javax.servlet-api
25 | 3.1.0
26 |
27 |
28 | javax.servlet.jsp
29 | jsp-api
30 | 2.2
31 |
32 |
33 | javax.servlet
34 | jstl
35 | 1.2
36 |
37 |
38 |
39 | org.springframework
40 | spring-webmvc
41 | ${spring.version}
42 |
43 |
44 |
45 | org.springframework
46 | spring-core
47 | ${spring.version}
48 |
49 |
50 |
51 | org.springframework
52 | spring-orm
53 | ${spring.version}
54 |
55 |
56 |
57 | org.springframework
58 | spring-aspects
59 | ${spring.version}
60 |
61 |
62 |
63 | org.springframework
64 | spring-test
65 | ${spring.version}
66 |
67 |
68 |
69 | org.springframework
70 | spring-jdbc
71 | ${spring.version}
72 |
73 |
74 |
75 |
76 | org.mybatis
77 | mybatis
78 | 3.3.1
79 |
80 |
81 | org.mybatis
82 | mybatis-spring
83 | 1.2.4
84 |
85 |
86 |
87 | mysql
88 | mysql-connector-java
89 | 5.1.38
90 |
91 |
92 |
93 | commons-dbcp
94 | commons-dbcp
95 | 1.4
96 |
97 |
98 |
99 | log4j
100 | log4j
101 | 1.2.17
102 |
103 |
104 |
105 | org.slf4j
106 | slf4j-api
107 | 1.7.18
108 |
109 |
110 |
111 |
112 | javax.servlet
113 | jstl
114 | 1.2
115 |
116 |
117 | taglibs
118 | standard
119 | 1.1.2
120 |
121 |
122 |
123 |
124 | org.hibernate
125 | hibernate-validator
126 | 5.2.4.Final
127 |
128 |
129 |
130 |
131 | commons-fileupload
132 | commons-fileupload
133 | 1.3.1
134 |
135 |
136 |
137 |
138 | com.fasterxml.jackson.core
139 | jackson-core
140 | 2.4.3
141 |
142 |
143 | com.fasterxml.jackson.core
144 | jackson-databind
145 | 2.4.3
146 |
147 |
148 |
149 |
150 | org.ansj
151 | ansj_seg
152 | 5.1.1
153 |
154 |
155 |
156 |
157 | data-analysis
158 |
159 |
160 |
--------------------------------------------------------------------------------
/data-analysis/src/main/java/com/crow/domain/AuxiliaryModels/NameValue.java:
--------------------------------------------------------------------------------
1 | package com.crow.domain.AuxiliaryModels;
2 |
3 | /**
4 | * Created by CrowHawk on 17/10/23.
5 | */
6 |
7 | /**
8 | * Name/value pair used to assemble the JSON data returned to the front end.
9 | */
10 | public class NameValue {
11 | private Integer value;
12 | private String name;
13 |
14 | public NameValue(Integer value, String name) {
15 | this.value = value;
16 | this.name = name;
17 | }
18 |
19 | public Integer getValue() {
20 | return value;
21 | }
22 |
23 | public void setValue(Integer value) {
24 | this.value = value;
25 | }
26 |
27 | public String getName() {
28 | return name;
29 | }
30 |
31 | public void setName(String name) {
32 | this.name = name;
33 | }
34 | }
35 |
--------------------------------------------------------------------------------
/data-analysis/src/main/java/com/crow/domain/Comment.java:
--------------------------------------------------------------------------------
1 | package com.crow.domain;
2 |
3 | /**
4 | * Created by CrowHawk on 17/10/11.
5 | */
6 | public class Comment { // plain bean holding one comment: id, lit count, author, content and post title
7 |
8 | private int id;
9 | private int litNum;// number of times the comment was "lit" (original note: comment light-up count)
10 | private String author;
11 | private String content;
12 | private String title;
13 |
14 | public int getId() {
15 | return id;
16 | }
17 |
18 | public void setId(int id) {
19 | this.id = id;
20 | }
21 |
22 | public int getLitNum() {
23 | return litNum;
24 | }
25 |
26 | public void setLitNum(int litNum) {
27 | this.litNum = litNum;
28 | }
29 |
30 | public String getAuthor() {
31 | return author;
32 | }
33 |
34 | public void setAuthor(String author) {
35 | this.author = author;
36 | }
37 |
38 | public String getContent() {
39 | return content;
40 | }
41 |
42 | public void setContent(String content) {
43 | this.content = content;
44 | }
45 |
46 | public String getTitle() {
47 | return title;
48 | }
49 |
50 | public void setTitle(String title) {
51 | this.title = title;
52 | }
53 | }
54 |
--------------------------------------------------------------------------------
/data-analysis/src/main/java/com/crow/domain/CommentMapper.java:
--------------------------------------------------------------------------------
1 | package com.crow.domain;
2 |
3 | import org.apache.ibatis.annotations.Insert;
4 | import org.apache.ibatis.annotations.Result;
5 | import org.apache.ibatis.annotations.Results;
6 | import org.apache.ibatis.annotations.Select;
7 |
8 | import java.util.List;
9 |
10 | /**
11 | * Created by CrowHawk on 17/10/12.
12 | */
13 |
14 | public interface CommentMapper { // MyBatis annotation-based mapper for the comment table
15 | @Insert("insert into comment (`content`,`author`,`lit_num`,`title`) values(#{content},#{author},#{litNum},#{title})")
16 | void insert(Comment comment); // inserts one row built from the bean's content/author/litNum/title
17 |
18 | @Results(id = "commentResult", value = { // maps the lit_num column to the litNum property; other columns map by name
19 | @Result(property = "id", column = "id", id = true),
20 | @Result(property = "litNum", column = "lit_num"),
21 | @Result(property = "author", column = "author"),
22 | @Result(property = "content", column = "content"),
23 | @Result(property = "title", column = "title")
24 | })
25 | @Select("select * from comment order by lit_num desc limit #{selectLimitNum}")
26 | List selectAllCommentsSorted(Integer selectLimitNum); // at most selectLimitNum comments, highest lit_num first
27 | }
28 |
--------------------------------------------------------------------------------
/data-analysis/src/main/java/com/crow/domain/CommentWord.java:
--------------------------------------------------------------------------------
1 | package com.crow.domain;
2 |
3 | /**
4 | * Created by CrowHawk on 17/10/24.
5 | */
6 | public class CommentWord { // plain bean for one segmented word taken from comments
7 | private Integer id;
8 | private String word;// text of the segmented word
9 | private Integer wordCount;// number of times the word occurs
10 |
11 | /*
12 | public CommentWord(String word) {
13 | this.word = word;
14 | }
15 | */
16 |
17 | public Integer getId() {
18 | return id;
19 | }
20 |
21 | public void setId(Integer id) {
22 | this.id = id;
23 | }
24 |
25 | public String getWord() {
26 | return word;
27 | }
28 |
29 | public void setWord(String word) {
30 | this.word = word;
31 | }
32 |
33 | public Integer getWordCount() {
34 | return wordCount;
35 | }
36 |
37 | public void setWordCount(Integer wordCount) {
38 | this.wordCount = wordCount;
39 | }
40 |
41 | }
42 |
--------------------------------------------------------------------------------
/data-analysis/src/main/java/com/crow/domain/CommentWordMapper.java:
--------------------------------------------------------------------------------
1 | package com.crow.domain;
2 |
3 | import java.util.List;
4 |
5 | /**
6 | * Created by CrowHawk on 17/10/24.
7 | */
8 | public interface CommentWordMapper {
9 | /*
10 | @Insert("insert into comment_word (`word`) values (#{word})")
11 | void insert(CommentWord commentWord);
12 | */
13 | //Get the segmented words of the comments, sorted by frequency of occurrence (the statement itself is presumably defined in CommentWordMapper.xml - confirm)
14 | List selectWordsSorted(Integer selectLimitNum);
15 | }
16 |
--------------------------------------------------------------------------------
/data-analysis/src/main/java/com/crow/domain/Post.java:
--------------------------------------------------------------------------------
1 | package com.crow.domain;
2 |
3 | /**
4 | * Created by CrowHawk on 17/10/11.
5 | */
6 | public class Post { // plain bean holding one post: id, title, author and reply count
7 |
8 | private int id;
9 | private String title;
10 | private String author;
11 | private int replyNum;
12 |
13 | public int getId() {
14 | return id;
15 | }
16 |
17 | public void setId(int id) {
18 | this.id = id;
19 | }
20 |
21 | public String getTitle() {
22 | return title;
23 | }
24 |
25 | public void setTitle(String title) {
26 | this.title = title;
27 | }
28 |
29 | public String getAuthor() {
30 | return author;
31 | }
32 |
33 | public void setAuthor(String author) {
34 | this.author = author;
35 | }
36 |
37 | public int getReplyNum() {
38 | return replyNum;
39 | }
40 |
41 | public void setReplyNum(int replyNum) {
42 | this.replyNum = replyNum;
43 | }
44 | }
45 |
--------------------------------------------------------------------------------
/data-analysis/src/main/java/com/crow/domain/PostMapper.java:
--------------------------------------------------------------------------------
1 | package com.crow.domain;
2 |
3 | import org.apache.ibatis.annotations.Insert;
4 | import org.apache.ibatis.annotations.Result;
5 | import org.apache.ibatis.annotations.Results;
6 | import org.apache.ibatis.annotations.Select;
7 |
8 | import java.util.List;
9 |
10 | /**
11 | * Created by CrowHawk on 17/10/12.
12 | */
13 |
14 | public interface PostMapper { // MyBatis annotation-based mapper for the post table
15 | @Insert("insert ignore into post (`title`,`author`,`reply_num`) values (#{title},#{author},#{replyNum})")
16 | void insert(Post post); // "insert ignore": the statement asks the database to skip rows it would otherwise reject
17 |
18 | @Results(id = "postResult", value = { // maps the reply_num column to the replyNum property; other columns map by name
19 | @Result(property = "id", column = "id", id = true),
20 | @Result(property = "title", column = "title"),
21 | @Result(property = "author", column = "author"),
22 | @Result(property = "replyNum", column = "reply_num")
23 | })
24 | @Select("select * from post order by reply_num desc limit #{selectLimitNum}")
25 | List selectAllPostsSorted(Integer selectLimitNum); // at most selectLimitNum posts, highest reply_num first
26 | }
27 |
28 |
--------------------------------------------------------------------------------
/data-analysis/src/main/java/com/crow/domain/TitleWord.java:
--------------------------------------------------------------------------------
1 | package com.crow.domain;
2 |
3 | /**
4 | * Created by CrowHawk on 17/10/11.
5 | */
6 |
7 | /**
8 | * A word produced by segmenting post titles.
9 | */
10 | public class TitleWord {
11 | private Integer id;
12 | private String word;// text of the segmented word
13 |
14 | private Integer wordCount;// number of times the word occurs
15 |
16 | public Integer getId() {
17 | return id;
18 | }
19 |
20 | public void setId(Integer id) {
21 | this.id = id;
22 | }
23 |
24 | public String getWord() {
25 | return word;
26 | }
27 |
28 | public void setWord(String word) {
29 | this.word = word;
30 | }
31 |
32 | public Integer getWordCount() {
33 | return wordCount;
34 | }
35 |
36 | public void setWordCount(Integer wordCount) {
37 | this.wordCount = wordCount;
38 | }
39 | }
40 |
--------------------------------------------------------------------------------
/data-analysis/src/main/java/com/crow/domain/TitleWordMapper.java:
--------------------------------------------------------------------------------
1 | package com.crow.domain;
2 |
3 | import org.apache.ibatis.annotations.Insert;
4 |
5 | import java.util.List;
6 |
7 | /**
8 | * Created by CrowHawk on 17/10/12.
9 | */
10 |
11 | public interface TitleWordMapper {
12 | @Insert("insert into title_word (`word`) values (#{word})")
13 | void insert(TitleWord titleWord);
14 | //Get the segmented words of all titles, sorted by frequency of occurrence
15 | List selectWordsSorted(Integer selectLimitNum);
16 | }
17 |
--------------------------------------------------------------------------------
/data-analysis/src/main/java/com/crow/domain/User.java:
--------------------------------------------------------------------------------
1 | package com.crow.domain;
2 |
3 | /**
4 | * Created by CrowHawk on 17/10/11.
5 | */
6 | public class User { // plain bean holding user columns plus count fields such as peopleNum and genderNum
7 |
8 | private int id;
9 | private String name;
10 | private String gender;
11 | private String homeTeam;// the user's home (favorite) team
12 | private String address;// the user's location
13 | private Integer views;// number of visits to the user's home page
14 |
15 | private String provinceAddress;// province name
16 |
17 | private Integer peopleNum;// number of users in each province
18 |
19 | private Integer maleNum;// number of male users
20 |
21 | private Integer femaleNum;// number of female users
22 |
23 | private Integer unknownNum;// number of users who did not fill in a gender
24 |
25 | private Integer genderNum;
26 |
27 | public int getId() {
28 | return id;
29 | }
30 |
31 | public void setId(int id) {
32 | this.id = id;
33 | }
34 |
35 | public String getName() {
36 | return name;
37 | }
38 |
39 | public void setName(String name) {
40 | this.name = name;
41 | }
42 |
43 | public String getGender() {
44 | return gender;
45 | }
46 |
47 | public void setGender(String gender) {
48 | this.gender = gender;
49 | }
50 |
51 | public String getHomeTeam() {
52 | return homeTeam;
53 | }
54 |
55 | public void setHomeTeam(String homeTeam) {
56 | this.homeTeam = homeTeam;
57 | }
58 |
59 | public String getAddress() {
60 | return address;
61 | }
62 |
63 | public void setAddress(String address) {
64 | this.address = address;
65 | }
66 |
67 | public Integer getViews() {
68 | return views;
69 | }
70 |
71 | public void setViews(Integer views) {
72 | this.views = views;
73 | }
74 |
75 | public String getProvinceAddress() {
76 | return provinceAddress;
77 | }
78 |
79 | public void setProvinceAddress(String provinceAddress) {
80 | this.provinceAddress = provinceAddress;
81 | }
82 |
83 | public Integer getPeopleNum() {
84 | return peopleNum;
85 | }
86 |
87 | public void setPeopleNum(Integer peopleNum) {
88 | this.peopleNum = peopleNum;
89 | }
90 |
91 | public Integer getGenderNum() {
92 | return genderNum;
93 | }
94 |
95 | public void setGenderNum(Integer genderNum) {
96 | this.genderNum = genderNum;
97 | }
98 |
99 | public Integer getMaleNum() {
100 | return maleNum;
101 | }
102 |
103 | public void setMaleNum(Integer maleNum) {
104 | this.maleNum = maleNum;
105 | }
106 |
107 | public Integer getFemaleNum() {
108 | return femaleNum;
109 | }
110 |
111 | public void setFemaleNum(Integer femaleNum) {
112 | this.femaleNum = femaleNum;
113 | }
114 |
115 | public Integer getUnknownNum() {
116 | return unknownNum;
117 | }
118 |
119 | public void setUnknownNum(Integer unknownNum) {
120 | this.unknownNum = unknownNum;
121 | }
122 | }
123 |
--------------------------------------------------------------------------------
/data-analysis/src/main/java/com/crow/domain/UserMapper.java:
--------------------------------------------------------------------------------
1 | package com.crow.domain;
2 |
3 | import org.apache.ibatis.annotations.Insert;
4 |
5 | import java.util.List;
6 |
7 | /**
8 | * Created by CrowHawk on 17/10/12.
9 | */
10 |
11 | public interface UserMapper {
12 |
13 | @Insert("insert ignore into user (`name`,`gender`,`home_team`,`address`) values (#{name},#{gender},#{homeTeam},#{address})")
14 | void insert(User user);
15 | //Get the geographic distribution of users
16 | List selectAllAddressesSort(Integer selectLimitNum);
17 | //Get the gender distribution of users
18 | List selectAllGender();
19 | //Get all users, sorted by page views
20 | List selectAllUsersSortedByViews(Integer selectLimitNum);
21 | }
22 |
--------------------------------------------------------------------------------
/data-analysis/src/main/java/com/crow/service/CommentService.java:
--------------------------------------------------------------------------------
1 | package com.crow.service;
2 |
3 | import com.crow.domain.Comment;
4 | import com.crow.domain.CommentMapper;
5 | import org.springframework.beans.factory.annotation.Autowired;
6 | import org.springframework.stereotype.Service;
7 |
8 | import java.util.List;
9 |
10 | /**
11 | * Created by CrowHawk on 17/10/21.
12 | */
13 | @Service
14 | public class CommentService { // thin service that delegates to CommentMapper
15 | @Autowired
16 | CommentMapper commentMapper;
17 |
18 | public List getAllComments(Integer selectLimitNum) { // returns the mapper result unchanged
19 | return commentMapper.selectAllCommentsSorted(selectLimitNum);
20 | }
21 | }
22 |
--------------------------------------------------------------------------------
/data-analysis/src/main/java/com/crow/service/CommentWordService.java:
--------------------------------------------------------------------------------
1 | package com.crow.service;
2 |
3 | import com.crow.domain.CommentMapper;
4 | import com.crow.domain.CommentWord;
5 | import com.crow.domain.CommentWordMapper;
6 | import org.springframework.beans.factory.annotation.Autowired;
7 | import org.springframework.stereotype.Service;
8 |
9 | import java.util.ArrayList;
10 | import java.util.List;
11 |
12 | /**
13 | * Created by CrowHawk on 17/10/24.
14 | */
15 | @Service
16 | public class CommentWordService {
17 | @Autowired
18 | CommentWordMapper commentWordMapper;
19 |
20 | @Autowired
21 | CommentMapper commentMapper;
22 |
23 | /**
24 | * Generates the comment words by segmenting
25 | * the comment contents stored in the database (the implementation is commented out).
26 | */
27 | /*
28 | public void insertAllWords() {
29 | List commentList = commentMapper.selectAllCommentsSorted(100000);
30 | for(Comment comment: commentList) {
31 | String content = comment.getContent().replaceAll( "[\\p{P}+~$`^=|<>~`$^+=|<>¥×]" , "");
32 | String[] strings = ToAnalysis.parse(content).toString().split(",");//the segmentation result is separated by ","
33 | for(String word: strings) {
34 | commentWordMapper.insert(new CommentWord(word));
35 | }
36 | }
37 | }
38 | */
39 |
40 | public List getAllWords(Integer selectLimitNum) { // note: filtering happens after the mapper call, so fewer than selectLimitNum words may be returned
41 | List commentWords = commentWordMapper.selectWordsSorted(selectLimitNum);
42 | List commentWordList = new ArrayList<>();
43 | for(CommentWord commentWord: commentWords) {
44 | if(commentWord.getWord().matches("[\\u4e00-\\u9fa5]+/(n|a|vn|ad|b|t)")) {//keep only all-Chinese words tagged n/a/vn/ad/b/t, dropping particles, modal words, etc.
45 | commentWordList.add(commentWord);
46 | }
47 | }
48 | return commentWordList;
49 | }
50 | }
51 |
--------------------------------------------------------------------------------
/data-analysis/src/main/java/com/crow/service/PostService.java:
--------------------------------------------------------------------------------
1 | package com.crow.service;
2 |
3 | import com.crow.domain.Post;
4 | import com.crow.domain.PostMapper;
5 | import org.springframework.beans.factory.annotation.Autowired;
6 | import org.springframework.stereotype.Service;
7 |
8 | import java.util.List;
9 |
10 | /**
11 | * Created by CrowHawk on 17/10/21.
12 | */
13 | @Service
14 | public class PostService { // thin service that delegates to PostMapper
15 | @Autowired
16 | PostMapper postMapper;
17 |
18 | public List getAllPosts(Integer selectLimitNum) { // returns the mapper result unchanged
19 | return postMapper.selectAllPostsSorted(selectLimitNum);
20 | }
21 | }
22 |
--------------------------------------------------------------------------------
/data-analysis/src/main/java/com/crow/service/TitleWordService.java:
--------------------------------------------------------------------------------
1 | package com.crow.service;
2 |
3 | import com.crow.domain.TitleWord;
4 | import com.crow.domain.TitleWordMapper;
5 | import org.springframework.beans.factory.annotation.Autowired;
6 | import org.springframework.stereotype.Service;
7 |
8 | import java.util.List;
9 |
10 | /**
11 | * Created by CrowHawk on 17/10/21.
12 | */
13 | @Service
14 | public class TitleWordService { // thin service that delegates to TitleWordMapper
15 | @Autowired
16 | TitleWordMapper titleWordMapper;
17 |
18 | public List getAllWords(Integer selectLimitNum) { // returns the mapper result unchanged (no filtering here)
19 | return titleWordMapper.selectWordsSorted(selectLimitNum);
20 | }
21 | }
22 |
--------------------------------------------------------------------------------
/data-analysis/src/main/java/com/crow/service/UserService.java:
--------------------------------------------------------------------------------
1 | package com.crow.service;
2 |
3 | import com.crow.domain.User;
4 | import com.crow.domain.UserMapper;
5 | import org.springframework.beans.factory.annotation.Autowired;
6 | import org.springframework.stereotype.Service;
7 |
8 | import java.util.List;
9 |
10 | /**
11 | * Created by CrowHawk on 17/10/21.
12 | */
13 | @Service
14 | public class UserService { // thin service that delegates to UserMapper
15 | @Autowired
16 | UserMapper userMapper;
17 |
18 | public List getAddresses(Integer selectLimitNum) { // forwards to selectAllAddressesSort
19 | return userMapper.selectAllAddressesSort(selectLimitNum);
20 | }
21 |
22 | public List getGender() { // forwards to selectAllGender
23 | return userMapper.selectAllGender();
24 | }
25 |
26 | public List getAllUsers(Integer selectLimitNum) { // forwards to selectAllUsersSortedByViews
27 | return userMapper.selectAllUsersSortedByViews(selectLimitNum);
28 | }
29 | }
30 |
--------------------------------------------------------------------------------
/data-analysis/src/main/java/com/crow/web/EchartsController.java:
--------------------------------------------------------------------------------
1 | package com.crow.web;
2 |
3 | import com.crow.domain.AuxiliaryModels.NameValue;
4 | import com.crow.domain.CommentWord;
5 | import com.crow.domain.Post;
6 | import com.crow.domain.TitleWord;
7 | import com.crow.domain.User;
8 | import com.crow.service.*;
9 | import org.springframework.beans.factory.annotation.Autowired;
10 | import org.springframework.stereotype.Controller;
11 | import org.springframework.web.bind.annotation.RequestMapping;
12 | import org.springframework.web.bind.annotation.ResponseBody;
13 |
14 | import java.util.ArrayList;
15 | import java.util.List;
16 |
17 | /**
18 | * Created by CrowHawk on 17/10/21.
19 | */
20 | @Controller
21 | public class EchartsController { // the /getXxx handlers return data via @ResponseBody; the lower-case handlers return view names
22 | @Autowired
23 | CommentService commentService;
24 | @Autowired
25 | PostService postService;
26 | @Autowired
27 | TitleWordService titleWordService;
28 | @Autowired
29 | UserService userService;
30 | @Autowired
31 | CommentWordService commentWordService;
32 |
33 | @RequestMapping(value = "/getAddress")
34 | @ResponseBody
35 | public List getAddress() {
36 | List users = userService.getAddresses(18); // limit passed to the query: 18
37 | return users;
38 | }
39 |
40 | @RequestMapping(value = "/getGender")
41 | @ResponseBody
42 | public List getGender() {
43 | List users = userService.getGender();
44 | List result = new ArrayList<>();
45 | for(User user: users) {
46 | result.add(new NameValue(user.getGenderNum(), user.getGender())); // value = genderNum, name = gender
47 | }
48 | return result;
49 | }
50 |
51 | @RequestMapping(value = "/getTitleWord")
52 | @ResponseBody
53 | public List getTitleWord() {
54 | List titleWords = titleWordService.getAllWords(30); // limit passed to the query: 30
55 | List result = new ArrayList<>();
56 | for(TitleWord titleWord: titleWords) {
57 | result.add(new NameValue(titleWord.getWordCount(), titleWord.getWord())); // value = wordCount, name = word
58 | }
59 | return result;
60 | }
61 |
62 | @RequestMapping(value = "/getTitle")
63 | @ResponseBody
64 | public List getTitle() {
65 | List posts = postService.getAllPosts(20); // limit passed to the query: 20
66 | List result = new ArrayList<>();
67 | for(Post post: posts) {
68 | result.add(new NameValue(post.getReplyNum(), post.getTitle())); // value = replyNum, name = title
69 | }
70 | return result;
71 | }
72 |
73 | @RequestMapping(value = "/getViews")
74 | @ResponseBody
75 | public List getViews() {
76 | List users = userService.getAllUsers(10); // limit passed to the query: 10
77 | return users;
78 | }
79 |
80 | @RequestMapping(value = "/getCommentWord")
81 | @ResponseBody
82 | public List getCommentWord() {
83 | List commentWords = commentWordService.getAllWords(300); // limit passed to the query: 300
84 | List result = new ArrayList<>();
85 | for(CommentWord commentWord: commentWords) {
86 | String word = commentWord.getWord();
87 | result.add(new NameValue(commentWord.getWordCount(), word.substring(0, word.indexOf("/"))));//strip the part-of-speech tag (everything from the first "/" on) from the word
88 | }
89 | return result;
90 | }
91 |
92 | /*
93 | @RequestMapping(value = "/genCommentWord")
94 | public void genCommentWord() {
95 | commentWordService.insertAllWords();
96 | }
97 | */
98 |
99 | @RequestMapping(value = "/province")
100 | public String getAddressEcharts() {
101 | return "ProvinceAddress"; // view name; presumably resolved to the JSP of the same name - confirm in springmvc.xml
102 | }
103 |
104 | @RequestMapping(value = "/gender")
105 | public String getGenderEcharts() {
106 | return "Gender";
107 | }
108 |
109 | @RequestMapping(value = "/views")
110 | public String getViewsEcharts() {
111 | return "Views";
112 | }
113 |
114 | @RequestMapping(value = "/titleword")
115 | public String getTitleWordEcharts() {
116 | return "TitleWords";
117 | }
118 |
119 | @RequestMapping(value = "/title")
120 | public String getTitleEcharts() {
121 | return "Titles";
122 | }
123 |
124 | @RequestMapping(value = "/commentword")
125 | public String getCommentWordEcharts() {
126 | return "CommentWords";
127 | }
128 | }
129 |
--------------------------------------------------------------------------------
/data-analysis/src/main/resources/com/crow/domain/CommentWordMapper.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 | comment_word
5 |
9 |
10 |
11 |
12 |
23 |
--------------------------------------------------------------------------------
/data-analysis/src/main/resources/com/crow/domain/TitleWordMapper.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 | title_word
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
23 |
--------------------------------------------------------------------------------
/data-analysis/src/main/resources/com/crow/domain/UserMapper.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 | USER
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 |
15 |
16 |
17 |
18 |
19 |
20 |
21 |
22 |
23 |
37 |
38 |
47 |
48 |
56 |
57 |
--------------------------------------------------------------------------------
/data-analysis/src/main/resources/db.properties:
--------------------------------------------------------------------------------
1 | jdbc.driver=com.mysql.jdbc.Driver
2 | jdbc.url=jdbc:mysql://localhost:3306/HupuSpider?characterEncoding=utf-8
3 | jdbc.username=root
4 | jdbc.password=wyj
5 |
--------------------------------------------------------------------------------
/data-analysis/src/main/resources/log4j.properties:
--------------------------------------------------------------------------------
1 | # Global logging configuration
2 | # Use DEBUG in development; switch to INFO or ERROR only in production
3 | log4j.rootLogger=DEBUG, stdout
4 | # MyBatis logging configuration...
5 | log4j.logger.org.mybatis.example.BlogMapper=TRACE
6 | # Console output...
7 | log4j.appender.stdout=org.apache.log4j.ConsoleAppender
8 | log4j.appender.stdout.layout=org.apache.log4j.PatternLayout
9 | log4j.appender.stdout.layout.ConversionPattern=%5p [%t] - %m%n
10 |
--------------------------------------------------------------------------------
/data-analysis/src/main/resources/mybatis/sqlMapConfig.xml:
--------------------------------------------------------------------------------
1 |
2 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 |
15 |
19 |
20 |
23 |
--------------------------------------------------------------------------------
/data-analysis/src/main/resources/spring/applicationContext-dao.xml:
--------------------------------------------------------------------------------
1 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 |
15 |
16 |
17 |
18 |
19 |
20 |
21 |
22 |
23 |
24 |
25 |
26 |
27 |
28 |
29 |
30 |
31 |
32 |
33 |
34 |
35 |
36 |
40 |
41 |
42 |
43 |
--------------------------------------------------------------------------------
/data-analysis/src/main/resources/spring/applicationContext-service.xml:
--------------------------------------------------------------------------------
1 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
--------------------------------------------------------------------------------
/data-analysis/src/main/resources/spring/applicationContext-transaction.xml:
--------------------------------------------------------------------------------
1 |
2 |
9 |
10 |
13 |
14 |
17 |
18 |
19 |
20 |
21 |
22 |
23 |
24 |
25 |
26 |
27 |
28 |
29 |
30 |
31 |
32 |
33 |
34 |
35 |
36 |
37 |
--------------------------------------------------------------------------------
/data-analysis/src/main/resources/spring/springmvc.xml:
--------------------------------------------------------------------------------
1 |
15 |
16 |
20 |
21 |
22 |
25 |
26 |
27 |
28 |
29 |
32 |
33 |
36 |
37 |
38 |
43 |
46 |
47 |
48 |
49 |
50 |
51 |
52 |
53 |
54 |
57 |
59 |
60 |
61 |
62 |
63 |
64 |
65 |
--------------------------------------------------------------------------------
/data-analysis/src/main/webapp/WEB-INF/jsp/CommentWords.jsp:
--------------------------------------------------------------------------------
1 | <%--
2 | Created by IntelliJ IDEA.
3 | User: CrowHawk
4 | Date: 17/10/24
5 | Time: 上午11:12
6 | To change this template use File | Settings | File Templates.
7 | --%>
8 | <%@ page contentType="text/html;charset=UTF-8" language="java" %>
9 |
10 |
11 |
12 |
13 |
14 |
15 |
16 |
17 |
18 |
25 |
26 |
111 |
112 |
113 |
--------------------------------------------------------------------------------
/data-analysis/src/main/webapp/WEB-INF/jsp/Gender.jsp:
--------------------------------------------------------------------------------
1 | <%--
2 | Created by IntelliJ IDEA.
3 | User: CrowHawk
4 | Date: 17/10/23
5 | Time: 下午6:19
6 | To change this template use File | Settings | File Templates.
7 | --%>
8 | <%@ page contentType="text/html;charset=UTF-8" language="java" %>
9 |
10 |
11 |
12 |
13 |
14 |
15 |
16 |
17 |
18 |
130 |
131 |
--------------------------------------------------------------------------------
/data-analysis/src/main/webapp/WEB-INF/jsp/ProvinceAddress.jsp:
--------------------------------------------------------------------------------
1 | <%--
2 | Created by IntelliJ IDEA.
3 | User: CrowHawk
4 | Date: 17/10/22
5 | Time: 下午10:21
6 | To change this template use File | Settings | File Templates.
7 | --%>
8 |
9 | <%@ page contentType="text/html;charset=UTF-8" language="java" %>
10 |
11 |
12 | ECharts
13 |
14 |
15 |
16 |
17 |
18 |
19 |
20 |
21 |
22 |
23 |
24 |
25 |
124 |
125 |