├── .gitignore
├── LICENSE
├── README
├── pom.xml
├── result-sources
│   ├── EdmundDXu
│   │   └── files
│   │       ├── city_url.sql
│   │       ├── clean_result_test.txt
│   │       ├── emp.txt
│   │       ├── index.jsp
│   │       ├── lagou.doc
│   │       ├── lagou.jsp
│   │       ├── lagou.sql
│   │       ├── lagou.txt
│   │       └── ready_url.sql
│   ├── radish
│   │   └── BossUrl.txt
│   ├── random
│   │   ├── emp.txt
│   │   └── job_message.sql
│   └── wcy
│       └── emp.txt
├── result
│   ├── 概要设计
│   │   ├── BossUrl.txt
│   │   ├── cities.txt
│   │   ├── emp.txt
│   │   └── 概要设计.doc
│   ├── 模块一_各大省会招聘概况
│   │   ├── 数据可视化-2018-4-26
│   │   │   └── view
│   │   │       ├── js
│   │   │       │   ├── echarts.min.js
│   │   │       │   ├── jquery-1.11.2.js
│   │   │       │   └── js.js
│   │   │       └── view1.html
│   │   └── 数据清洗阶段-2018-4-26
│   │       ├── 清洗前_lagou.sql+job_data.sql
│   │       │   ├── job_data.sql
│   │       │   ├── lagou.sql
│   │       │   └── url_list.sql
│   │       ├── 清洗后_job_data_result.sql
│   │       │   └── job_data_result.sql
│   │       └── 清洗过程.doc
│   ├── 模块二_各大编程语言的工作能力成熟度分析
│   │   ├── file
│   │   ├── 数据可视化-2018-4-27
│   │   │   └── view
│   │   │       ├── dynamic
│   │   │       │   └── EChartsDemo.war
│   │   │       └── static
│   │   │           ├── android.html
│   │   │           ├── auto_scroll.html
│   │   │           ├── c#.html
│   │   │           ├── c++.html
│   │   │           ├── index.html
│   │   │           ├── java.html
│   │   │           ├── js
│   │   │           │   ├── echarts-wordcloud.js
│   │   │           │   └── echarts.js
│   │   │           ├── linux.html
│   │   │           ├── python.html
│   │   │           └── web.html
│   │   └── 数据清洗阶段-2018-4-27
│   │       ├── 清洗前_lagou.sql+job_message.sql
│   │       │   ├── job_message.sql
│   │       │   └── lagou.sql
│   │       ├── 清洗后_key_map.sql
│   │       │   ├── key_map.sql
│   │       │   └── lagou_export
│   │       │       ├── android.txt
│   │       │       ├── c#.txt
│   │       │       ├── c++.txt
│   │       │       ├── java.txt
│   │       │       ├── linux.txt
│   │       │       ├── python.txt
│   │       │       └── web.txt
│   │       └── 清洗过程.doc
│   ├── 源文件
│   │   ├── CrawlerApp-0.0.1-SNAPSHOT-javadoc.jar
│   │   ├── CrawlerApp-0.0.1-SNAPSHOT-sources.jar
│   │   └── CrawlerApp-0.0.1-SNAPSHOT.jar
│   └── 项目演讲.ppt
├── sources
│   ├── hadoop.dll
│   └── winutils.exe
├── src
│   └── main
│       └── java
│           ├── com
│           │   └── edmund
│           │       ├── crawler
│           │       │   ├── JobCrawler.java
│           │       │   ├── KeyMapMerger.java
│           │       │   ├── LGJobCleaner.java
│           │       │   ├── LGJobCrawler.java
│           │       │   ├── LGJobCrawlerThread.java
│           │       │   └── LGJobUrlGenerator.java
│           │       ├── properties
│           │       ├── test
│           │       │   └── Test.java
│           │       ├── utils
│           │       │   ├── DBUtils.java
│           │       │   ├── DataBaseConnection.java
│           │       │   ├── LGCleanUtils.java
│           │       │   └── LGDBUtils.java
│           │       └── vo
│           │           ├── Job.java
│           │           ├── KeyMap.java
│           │           └── LGJob.java
│           ├── radish
│           │   ├── HDFSUtil
│           │   │   └── HDFSTest.java
│           │   ├── analysis
│           │   │   └── DataAnalysiser.java
│           │   ├── crawler
│           │   │   ├── BOSSCrawlerManager.java
│           │   │   ├── BOSSProvinceCrawler.java
│           │   │   ├── Test.java
│           │   │   └── distributed
│           │   │       ├── DistributedCrawler.java
│           │   │       └── Test.java
│           │   ├── dataclean
│           │   │   └── DataCleaner.java
│           │   ├── util
│           │   │   ├── MyUtil.java
│           │   │   └── UrlListIniter.java
│           │   └── vo
│           │       ├── BOSSUrlVO.java
│           │       └── JobDataVO.java
│           ├── random
│           │   ├── crawler
│           │   │   ├── BOSSRequestMessageCrawler.java
│           │   │   ├── TaskManager.java
│           │   │   └── Test.java
│           │   ├── properties
│           │   └── test
│           │       └── Demo.java
│           ├── wcy
│           │   └── test
│           │       └── Test.java
│           └── log4j.properties
└── 概要设计.doc
/.gitignore:
--------------------------------------------------------------------------------
1 | /.settings
2 | /.classpath
3 | /.project
4 | /target
5 | /bin/
6 | /~$oject_Info.doc
7 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2018 radishT
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/README:
--------------------------------------------------------------------------------
1 | Everyone, please exercise discretion when using this.
2 |
3 |
--------------------------------------------------------------------------------
/pom.xml:
--------------------------------------------------------------------------------
1 | <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
3 |   <modelVersion>4.0.0</modelVersion>
4 |
5 |   <groupId>com.radish.Crawler</groupId>
6 |   <artifactId>CrawlerApp</artifactId>
7 |   <version>0.0.1-SNAPSHOT</version>
8 |   <packaging>jar</packaging>
9 |   <name>CrawlerApp</name>
10 |   <url>http://maven.apache.org</url>
11 |   <properties>
12 |     <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
13 |   </properties>
14 |
15 |   <dependencies>
16 |     <dependency>
17 |       <groupId>jdk.tools</groupId>
18 |       <artifactId>jdk.tools</artifactId>
19 |       <version>1.8</version>
20 |       <scope>system</scope>
21 |       <systemPath>${JAVA_HOME}/lib/tools.jar</systemPath>
22 |     </dependency>
23 |     <dependency>
24 |       <groupId>junit</groupId>
25 |       <artifactId>junit</artifactId>
26 |       <version>3.8.1</version>
27 |       <scope>test</scope>
28 |     </dependency>
29 |
30 |     <dependency>
31 |       <groupId>org.jsoup</groupId>
32 |       <artifactId>jsoup</artifactId>
33 |       <version>1.10.3</version>
34 |     </dependency>
35 |
36 |     <dependency>
37 |       <groupId>mysql</groupId>
38 |       <artifactId>mysql-connector-java</artifactId>
39 |       <version>5.1.29</version>
40 |     </dependency>
41 |
42 |     <dependency>
43 |       <groupId>org.apache.hadoop</groupId>
44 |       <artifactId>hadoop-hdfs</artifactId>
45 |       <version>2.7.2</version>
46 |     </dependency>
47 |
48 |     <dependency>
49 |       <groupId>org.apache.hadoop</groupId>
50 |       <artifactId>hadoop-common</artifactId>
51 |       <version>2.7.2</version>
52 |       <scope>provided</scope>
53 |     </dependency>
54 |
55 |     <dependency>
56 |       <groupId>log4j</groupId>
57 |       <artifactId>log4j</artifactId>
58 |       <version>1.2.17</version>
59 |     </dependency>
60 |
61 |     <dependency>
62 |       <groupId>org.seleniumhq.selenium</groupId>
63 |       <artifactId>selenium-chrome-driver</artifactId>
64 |       <version>3.6.0</version>
65 |     </dependency>
66 |
67 |     <dependency>
68 |       <groupId>org.seleniumhq.selenium</groupId>
69 |       <artifactId>selenium-java</artifactId>
70 |       <version>3.6.0</version>
71 |     </dependency>
72 |
73 |     <dependency>
74 |       <groupId>com.google.guava</groupId>
75 |       <artifactId>guava</artifactId>
76 |       <version>23.0</version>
77 |     </dependency>
78 |
79 |     <dependency>
80 |       <groupId>com.alibaba</groupId>
81 |       <artifactId>fastjson</artifactId>
82 |       <version>1.2.37</version>
83 |     </dependency>
84 |
85 |     <dependency>
86 |       <groupId>org.apache.lucene</groupId>
87 |       <artifactId>lucene-core</artifactId>
88 |       <version>2.0.0</version>
89 |     </dependency>
90 |
95 |     <dependency>
96 |       <groupId>je</groupId>
97 |       <artifactId>analysis</artifactId>
98 |       <version>1.5.3</version>
99 |     </dependency>
100 |   </dependencies>
101 |   <build>
102 |     <plugins>
103 |       <plugin>
104 |         <groupId>org.apache.maven.plugins</groupId>
105 |         <artifactId>maven-compiler-plugin</artifactId>
106 |         <version>3.5.1</version>
107 |         <configuration>
108 |           <source>1.8</source>
109 |           <target>1.8</target>
110 |         </configuration>
111 |       </plugin>
112 |
113 |
114 |       <plugin>
115 |         <groupId>org.apache.maven.plugins</groupId>
116 |         <artifactId>maven-javadoc-plugin</artifactId>
117 |         <version>2.7</version>
118 |         <executions>
119 |           <execution>
120 |             <id>attach-javadocs</id>
121 |             <goals>
122 |               <goal>jar</goal>
123 |             </goals>
124 |             <configuration>
125 |               <additionalparam>-Xdoclint:none</additionalparam>
126 |             </configuration>
127 |           </execution>
128 |         </executions>
129 |       </plugin>
130 |
131 |       <plugin>
132 |         <groupId>org.apache.maven.plugins</groupId>
133 |         <artifactId>maven-source-plugin</artifactId>
134 |         <version>3.0.1</version>
135 |         <executions>
136 |           <execution>
137 |             <id>attach-sources</id>
138 |             <goals>
139 |               <goal>jar</goal>
140 |             </goals>
141 |           </execution>
142 |         </executions>
143 |       </plugin>
144 |     </plugins>
145 |   </build>
146 | </project>
147 |
--------------------------------------------------------------------------------
/result-sources/EdmundDXu/files/clean_result_test.txt:
--------------------------------------------------------------------------------
1 | # Run after data cleaning to check whether any records slipped through
2 | SELECT
3 | *
4 | FROM
5 | job_data_result
6 | WHERE
7 | min_experience < 0
8 | OR min_education < 0
9 | OR min_salary = 0
10 | OR max_salary = 0
11 | OR avg_salary = 0;
--------------------------------------------------------------------------------
/result-sources/EdmundDXu/files/index.jsp:
--------------------------------------------------------------------------------
1 | <%@ page language="java" contentType="text/html; charset=UTF-8"
2 | pageEncoding="UTF-8"%>
3 | <%
4 | String path = request.getContextPath();
5 | String basePath = request.getScheme() + "://"
6 | + request.getServerName() + ":" + request.getServerPort()
7 | + path + "/";
8 | %>
9 |
10 |
11 |
12 |
13 |
14 | Echarts Demo
15 |
16 |
17 |
100 |
101 |
102 |
103 |
104 |
--------------------------------------------------------------------------------
/result-sources/EdmundDXu/files/lagou.doc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/radishT/Job_Analysis/24a595ac551756279487c8a6d9f5e8bd03ce4465/result-sources/EdmundDXu/files/lagou.doc
--------------------------------------------------------------------------------
/result-sources/EdmundDXu/files/lagou.jsp:
--------------------------------------------------------------------------------
1 | <%@ page language="java" contentType="text/html; charset=UTF-8"
2 | pageEncoding="UTF-8"%>
3 | <%
4 | String path = request.getContextPath();
5 | String basePath = request.getScheme() + "://"
6 | + request.getServerName() + ":" + request.getServerPort()
7 | + path + "/";
8 | %>
9 |
10 |
11 |
12 |
13 |
14 | Echarts Demo
15 |
16 |
17 |
265 |
266 |
267 |
268 |
269 |
270 |
271 |
272 |
--------------------------------------------------------------------------------
/result-sources/EdmundDXu/files/lagou.txt:
--------------------------------------------------------------------------------
1 | 北京
2 | 上海
3 | 深圳
4 | 广州
5 | 杭州
6 | 成都
7 | 南京
8 | 武汉
9 | 西安
10 | 厦门
11 | 长沙
12 | 苏州
13 | 天津
14 | 安庆
15 | 鞍山
16 | 澳门特别行政区
17 | 安阳
18 | 阿克苏
19 | 北京
20 | 保定
21 | 包头
22 | 北海
23 | 蚌埠
24 | 滨州
25 | 宝鸡
26 | 百色
27 | 巴中
28 | 亳州
29 | 本溪
30 | 保山
31 | 白山
32 | 巴彦淖尔
33 | 白银
34 | 成都
35 | 长沙
36 | 重庆
37 | 长春
38 | 常州
39 | 沧州
40 | 常德
41 | 潮州
42 | 赤峰
43 | 承德
44 | 滁州
45 | 郴州
46 | 楚雄
47 | 池州
48 | 昌吉
49 | 朝阳
50 | 东莞
51 | 德阳
52 | 东营
53 | 达州
54 | 德州
55 | 大庆
56 | 大同
57 | 丹东
58 | 德宏
59 | 定西
60 | 迪庆
61 | 恩施
62 | 鄂尔多斯
63 | 鄂州
64 | 佛山
65 | 福州
66 | 阜阳
67 | 抚顺
68 | 抚州
69 | 防城港
70 | 阜新
71 | 广州
72 | 贵阳
73 | 赣州
74 | 广元
75 | 贵港
76 | 甘孜藏族自治州
77 | 广安
78 | 高雄
79 | 杭州
80 | 合肥
81 | 哈尔滨
82 | 惠州
83 | 海口
84 | 呼和浩特
85 | 湖州
86 | 邯郸
87 | 淮安
88 | 菏泽
89 | 黄冈
90 | 衡水
91 | 河源
92 | 衡阳
93 | 黄石
94 | 汉中
95 | 河池
96 | 淮北
97 | 红河
98 | 怀化
99 | 淮南
100 | 黄山
101 | 贺州
102 | 鹤壁
103 | 黑河
104 | 济南
105 | 金华
106 | 嘉兴
107 | 江门
108 | 济宁
109 | 揭阳
110 | 荆州
111 | 晋中
112 | 九江
113 | 景德镇
114 | 晋城
115 | 酒泉
116 | 焦作
117 | 吉安
118 | 鸡西
119 | 锦州
120 | 佳木斯
121 | 昆明
122 | 开封
123 | 克拉玛依
124 | 廊坊
125 | 兰州
126 | 洛阳
127 | 拉萨
128 | 泸州
129 | 龙岩
130 | 漯河
131 | 乐山
132 | 莱芜
133 | 娄底
134 | 来宾
135 | 绵阳
136 | 梅州
137 | 眉山
138 | 马鞍山
139 | 茂名
140 | 牡丹江
141 | 南京
142 | 宁波
143 | 南昌
144 | 南宁
145 | 南通
146 | 南充
147 | 南阳
148 | 宁德
149 | 内江
150 | 南平
151 | 莆田
152 | 濮阳
153 | 攀枝花
154 | 盘锦
155 | 平顶山
156 | 萍乡
157 | 青岛
158 | 泉州
159 | 秦皇岛
160 | 清远
161 | 衢州
162 | 黔西南
163 | 曲靖
164 | 齐齐哈尔
165 | 庆阳
166 | 黔东南
167 | 黔南
168 | 钦州
169 | 日照
170 | 上海
171 | 深圳
172 | 苏州
173 | 沈阳
174 | 石家庄
175 | 汕头
176 | 绍兴
177 | 三亚
178 | 韶关
179 | 商丘
180 | 十堰
181 | 宿迁
182 | 汕尾
183 | 上饶
184 | 遂宁
185 | 邵阳
186 | 三明
187 | 三沙
188 | 宿州
189 | 随州
190 | 三门峡
191 | 松原
192 | 石嘴山
193 | 商洛
194 | 双鸭山
195 | 朔州
196 | 天津
197 | 太原
198 | 唐山
199 | 台州
200 | 泰安
201 | 泰州
202 | 台北
203 | 天水
204 | 铜仁
205 | 通化
206 | 武汉
207 | 无锡
208 | 温州
209 | 潍坊
210 | 威海
211 | 芜湖
212 | 渭南
213 | 梧州
214 | 吴忠
215 | 武威
216 | 文山
217 | 西安
218 | 厦门
219 | 徐州
220 | 西宁
221 | 咸阳
222 | 香港特别行政区
223 | 新乡
224 | 邢台
225 | 襄阳
226 | 湘潭
227 | 许昌
228 | 咸宁
229 | 信阳
230 | 新余
231 | 宣城
232 | 孝感
233 | 新北
234 | 忻州
235 | 湘西土家族苗族自治州
236 | 烟台
237 | 扬州
238 | 银川
239 | 盐城
240 | 宜昌
241 | 宜春
242 | 宜宾
243 | 岳阳
244 | 永州
245 | 阳江
246 | 运城
247 | 益阳
248 | 阳泉
249 | 雅安
250 | 云浮
251 | 延安
252 | 鹰潭
253 | 玉溪
254 | 延边
255 | 营口
256 | 郑州
257 | 珠海
258 | 中山
259 | 肇庆
260 | 淄博
261 | 镇江
262 | 湛江
263 | 株洲
264 | 漳州
265 | 遵义
266 | 驻马店
267 | 资阳
268 | 舟山
269 | 张家口
270 | 长治
271 | 枣庄
272 | 中卫
273 | 周口
274 | 张家界
275 | 张掖
276 | 昭通
--------------------------------------------------------------------------------
/result-sources/random/job_message.sql:
--------------------------------------------------------------------------------
1 | /*
2 | Navicat MySQL Data Transfer
3 |
4 | Source Server : localDB
5 | Source Server Version : 50554
6 | Source Host : localhost:3306
7 | Source Database : crawler_db
8 |
9 | Target Server Type : MYSQL
10 | Target Server Version : 50554
11 | File Encoding : 65001
12 |
13 | Date: 2018-04-27 20:06:34
14 | */
15 |
16 | SET FOREIGN_KEY_CHECKS=0;
17 |
18 | -- ----------------------------
19 | -- Table structure for job_message
20 | -- ----------------------------
21 | DROP TABLE IF EXISTS `job_message`;
22 | CREATE TABLE `job_message` (
23 | `url_id` int(11) NOT NULL AUTO_INCREMENT,
24 | `key_word` varchar(255) DEFAULT NULL,
25 | `url` varchar(500) DEFAULT NULL,
26 | `message` varchar(2000) DEFAULT NULL,
27 | `status` int(10) DEFAULT NULL,
28 | `message_map` blob,
29 | PRIMARY KEY (`url_id`)
30 | ) ENGINE=InnoDB AUTO_INCREMENT=46354 DEFAULT CHARSET=utf8;
31 |
--------------------------------------------------------------------------------
/result/概要设计/概要设计.doc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/radishT/Job_Analysis/24a595ac551756279487c8a6d9f5e8bd03ce4465/result/概要设计/概要设计.doc
--------------------------------------------------------------------------------
/result/模块一_各大省会招聘概况/数据可视化-2018-4-26/view/js/js.js:
--------------------------------------------------------------------------------
1 | var dataMap = {};
2 | function dataFormatter(obj) { // reshape raw per-province arrays into ECharts {name, value} data points
3 | var pList = ['北京','天津','河北','山西','内蒙古','辽宁','吉林','黑龙江','上海','江苏','浙江','安徽','福建','江西','山东','河南','湖北','湖南','广东','广西','海南','重庆','四川','贵州','云南','西藏','陕西','甘肃','青海','宁夏','新疆']; // 31 provinces, in the same order as the data arrays
4 | var temp;
5 | for (var year = 2018; year <= 2018; year++) { // only the 2018 series is present
6 | var max = 0;
7 | var sum = 0;
8 | temp = obj[year];
9 | for (var i = 0, l = temp.length; i < l; i++) {
10 | max = Math.max(max, temp[i]);
11 | sum += temp[i];
12 | obj[year][i] = { // replace the raw number with a named data point
13 | name : pList[i],
14 | value : temp[i]
15 | };
16 | }
17 | obj[year + 'max'] = Math.floor(max / 100) * 100; // axis maximum, rounded down to the nearest hundred
18 | obj[year + 'sum'] = sum; // total across all provinces
19 | }
20 | return obj;
21 | }
22 |
23 | dataMap.javadata1 = dataFormatter({
24 | //max : 60000,
25 | 2018:[994,419,596,336,155,624,314,301,750,2191,999,367,1099,388,888,592,599,1096,1669,246,113,30,366,295,295,7,590,106,28,55,135]
26 | });
27 | dataMap.javadata2 = dataFormatter({
28 | //max : 4000,
29 | 2018:[82.44,84.21,956.84,197.8,374.69,590.2,446.17,474.2,79.68,1110.44,685.2,783.66,664.78,535.98,1390,1288.36,707,847.25,1015.08,601.99,222.89,317.87,1047.95,281.1,463.44,39.75,282.21,215.51,47.31,52.95,305]
30 | });
31 | dataMap.javadata1=dataFormatter({2018:[994,419,596,336,155,624,314,301,750,2191,999,367,1099,388,888,592,599,1096,1669,246,113,30,366,295,295,7,590,106,28,55,135]});
32 | dataMap.javadata2=dataFormatter({2018:[21764.59,10559.67,8471.48,7636.9,7425.81,8369.39,7033.44,7458.47,17978.67,11696.71,11914.41,9843.32,11484.53,7956.19,8363.74,8919.76,11954.09,10407.85,14360.1,8768.29,9088.5,9700.0,12265.03,8766.1,8584.75,8642.86,11192.37,7830.19,7928.57,7345.45,8633.33]});
33 | dataMap.javadata3=dataFormatter({2018:[27463.78,13226.73,10619.13,9473.21,9406.45,10479.17,8789.81,9172.76,22457.33,14712.46,15050.05,12247.96,14495.91,10046.39,10451.58,11150.34,14989.98,12933.39,18004.19,10995.93,11345.13,12166.67,15330.6,10867.8,10654.24,10000.0,14125.42,9820.75,10142.86,9145.45,10666.67]});
34 | dataMap.javadata4=dataFormatter({2018:[16065.39,7892.6,6323.83,5800.6,5445.16,6259.62,5277.07,5744.19,13500.0,8680.97,8778.78,7438.69,8473.16,5865.98,6275.9,6689.19,8918.2,7882.3,10716.0,6540.65,6831.86,7233.33,9199.45,6664.41,6515.25,7285.71,8259.32,5839.62,5714.29,5545.45,6600.0]});
35 | dataMap.javadata5=dataFormatter({2018:[0.88,0.68,0.46,0.58,0.66,0.55,0.57,0.53,0.75,0.63,0.58,0.61,0.55,0.51,0.53,0.51,0.68,0.52,0.6,0.5,0.65,0.57,0.62,0.66,0.53,0.57,0.73,0.55,0.64,0.6,0.54]});
36 | dataMap.javadata6=dataFormatter({2018:[0.17,0.38,0.45,0.4,0.52,0.41,0.53,0.49,0.21,0.35,0.47,0.44,0.4,0.51,0.47,0.36,0.29,0.25,0.31,0.41,0.47,0.37,0.32,0.48,0.53,0.29,0.27,0.58,0.5,0.45,0.55]});
37 | dataMap.cdata1=dataFormatter({2018:[436,263,172,6,11,425,66,2,404,854,635,245,606,8,557,319,238,141,971,64,23,171,384,26,64,2,267,2,1,4,13]});
38 | dataMap.cdata2=dataFormatter({2018:[14083.72,8471.48,7229.65,6833.33,7181.82,8261.18,7242.42,6500.0,13647.28,10152.22,10826.77,9383.67,9854.79,7125.0,8529.62,8716.3,9405.46,9127.66,10754.38,7046.88,7456.52,8418.13,10277.34,7269.23,8242.19,9000.0,9174.16,6500.0,3500.0,7625.0,8038.46]});
39 | dataMap.cdata3=dataFormatter({2018:[17729.36,10608.37,9069.77,8666.67,8818.18,10294.12,9151.52,8500.0,17178.22,12761.12,13634.65,11730.61,12438.94,8750.0,10673.25,10827.59,11798.32,11326.24,13568.49,8765.63,9260.87,10608.19,12861.98,9038.46,10312.5,10000.0,11531.84,8500.0,5000.0,9250.0,10153.85]});
40 | dataMap.cdata4=dataFormatter({2018:[10438.07,6334.6,5389.53,5000.0,5545.45,6228.24,5333.33,4500.0,10116.34,7543.33,8018.9,7036.73,7270.63,5500.0,6386.0,6605.02,7012.61,6929.08,7940.27,5328.13,5652.17,6228.07,7692.71,5500.0,6171.88,8000.0,6816.48,4500.0,2000.0,6000.0,5923.08]});
41 | dataMap.cdata5=dataFormatter({2018:[0.72,0.59,0.44,0.5,0.27,0.45,0.59,0.5,0.62,0.53,0.53,0.61,0.38,0.38,0.46,0.36,0.49,0.35,0.43,0.36,0.3,0.44,0.52,0.42,0.47,0.0,0.64,0.5,1.0,0.5,0.31]});
42 | dataMap.cdata6=dataFormatter({2018:[0.4,0.48,0.41,0.5,0.36,0.4,0.45,0.0,0.44,0.52,0.52,0.51,0.5,0.63,0.5,0.49,0.5,0.52,0.53,0.58,0.61,0.53,0.5,0.73,0.61,0.0,0.4,1.0,1.0,0.5,0.69]});
43 | dataMap.linuxdata1=dataFormatter({2018:[697,307,460,17,104,518,185,6,179,957,688,331,682,107,784,361,350,330,1650,207,80,300,352,209,152,22,339,59,19,44,93]});
44 | dataMap.linuxdata2=dataFormatter({2018:[20027.98,10506.51,8125.0,6941.18,6865.38,9862.93,7364.86,8583.33,17500.0,11242.95,12292.88,9965.26,11005.87,7523.36,9188.14,8860.11,11292.86,10207.58,12750.3,8978.26,8643.75,10453.33,12170.45,8607.66,8434.21,7090.91,11467.55,7322.03,7052.63,7488.64,7629.03]});
45 | dataMap.linuxdata3=dataFormatter({2018:[25428.98,13172.64,10136.96,8705.88,8557.69,12158.3,9205.41,10833.33,22162.01,14208.99,15475.29,12522.66,13806.45,9308.41,11562.5,11077.56,14220.0,12742.42,16087.88,11309.18,10637.5,13140.0,15292.61,10741.63,10388.16,8545.45,14486.73,9016.95,8947.37,9590.91,9376.34]});
46 | dataMap.linuxdata4=dataFormatter({2018:[14626.97,7840.39,6113.04,5176.47,5173.08,7567.57,5524.32,6333.33,12837.99,8276.91,9110.47,7407.85,8205.28,5738.32,6813.78,6642.66,8365.71,7672.73,9412.73,6647.34,6650.0,7766.67,9048.3,6473.68,6480.26,5636.36,8448.38,5627.12,5157.89,5386.36,5881.72]});
47 | dataMap.linuxdata5=dataFormatter({2018:[0.81,0.66,0.45,0.41,0.49,0.66,0.56,0.5,0.71,0.58,0.59,0.58,0.51,0.43,0.54,0.37,0.62,0.5,0.56,0.45,0.51,0.54,0.65,0.64,0.49,0.41,0.77,0.59,0.47,0.48,0.55]});
48 | dataMap.linuxdata6=dataFormatter({2018:[0.27,0.49,0.43,0.59,0.55,0.38,0.54,0.67,0.25,0.45,0.47,0.52,0.5,0.59,0.46,0.44,0.41,0.4,0.47,0.46,0.51,0.42,0.4,0.56,0.57,0.59,0.35,0.54,0.58,0.64,0.55]});
49 | dataMap.pythondata1=dataFormatter({2018:[300,263,124,55,20,378,38,78,392,835,528,317,302,99,355,89,349,306,901,48,29,220,308,63,50,2,335,7,1,4,26]});
50 | dataMap.pythondata2=dataFormatter({2018:[19906.67,11363.12,8475.81,7727.27,8225.0,9935.19,8315.79,9185.9,17931.12,12986.83,15310.61,11348.58,11470.2,8333.33,10349.3,9949.44,12750.72,11196.08,14235.85,11791.67,10017.24,11611.36,12951.3,9896.83,10950.0,10000.0,12594.03,6714.29,7500.0,8125.0,10019.23]});
51 | dataMap.pythondata3=dataFormatter({2018:[25460.0,14288.97,10532.26,9672.73,10350.0,12423.28,10315.79,11217.95,22609.69,16513.77,19464.02,14441.64,14417.22,10393.94,13064.79,12483.15,16240.69,14068.63,18011.1,15125.0,12448.28,14722.73,16334.42,12285.71,13320.0,12000.0,16074.63,7857.14,10000.0,10750.0,12500.0]});
52 | dataMap.pythondata4=dataFormatter({2018:[14353.33,8437.26,6419.35,5781.82,6100.0,7447.09,6315.79,7153.85,13252.55,9459.88,11157.2,8255.52,8523.18,6272.73,7633.8,7415.73,9260.74,8323.53,10460.6,8458.33,7586.21,8500.0,9568.18,7507.94,8580.0,8000.0,9113.43,5571.43,5000.0,5500.0,7538.46]});
53 | dataMap.pythondata5=dataFormatter({2018:[0.83,0.82,0.45,0.78,0.7,0.73,0.61,0.67,0.82,0.67,0.71,0.76,0.56,0.65,0.63,0.6,0.76,0.59,0.66,0.58,0.55,0.63,0.73,0.75,0.52,0.0,0.8,0.43,0.0,1.0,0.62]});
54 | dataMap.pythondata6=dataFormatter({2018:[0.36,0.47,0.43,0.42,0.75,0.4,0.42,0.44,0.39,0.39,0.51,0.44,0.46,0.6,0.43,0.44,0.4,0.39,0.47,0.44,0.59,0.4,0.38,0.51,0.48,1.0,0.32,0.57,0.0,1.0,0.5]});
55 | dataMap.webdata1=dataFormatter({2018:[745,341,489,307,103,636,257,39,120,1370,1401,365,713,364,998,404,541,342,2663,207,112,30,707,238,267,4,427,66,9,32,59]});
56 | dataMap.webdata2=dataFormatter({2018:[18572.48,8282.99,6674.85,6778.5,5990.29,7473.27,6250.97,6512.82,13766.67,9156.57,11143.83,8538.36,8714.59,7328.3,7329.16,6997.52,9791.13,8149.12,11408.56,7557.97,8093.75,8183.33,10073.55,8048.32,7569.29,7250.0,9241.22,7507.58,8944.44,6734.38,7177.97]});
57 | dataMap.webdata3=dataFormatter({2018:[23524.83,10348.97,8370.14,8384.36,7495.15,9319.18,7821.01,8051.28,17175.0,11546.72,14066.38,10753.42,10955.12,9162.09,9174.35,8727.72,12316.08,10146.2,14328.58,9381.64,10053.57,10233.33,12736.92,10079.83,9423.22,8000.0,11665.11,9151.52,11444.44,8593.75,8796.61]});
58 | dataMap.webdata4=dataFormatter({2018:[13620.13,6217.01,4979.55,5172.64,4485.44,5627.36,4680.93,4974.36,10358.33,6766.42,8221.27,6323.29,6474.05,5494.51,5483.97,5267.33,7266.17,6152.05,8488.55,5734.3,6133.93,6133.33,7410.18,6016.81,5715.36,6500.0,6817.33,5863.64,6444.44,4875.0,5559.32]});
59 | dataMap.webdata5=dataFormatter({2018:[0.84,0.56,0.36,0.57,0.42,0.55,0.49,0.44,0.72,0.5,0.53,0.58,0.42,0.39,0.45,0.38,0.55,0.37,0.48,0.37,0.54,0.57,0.55,0.57,0.44,0.5,0.65,0.5,0.78,0.44,0.54]});
60 | dataMap.webdata6=dataFormatter({2018:[0.28,0.54,0.55,0.51,0.66,0.49,0.59,0.56,0.45,0.53,0.53,0.56,0.58,0.6,0.58,0.57,0.42,0.51,0.47,0.55,0.49,0.63,0.43,0.55,0.56,0.5,0.4,0.61,0.22,0.56,0.66]});
61 | dataMap.cppdata1=dataFormatter({2018:[209,8,4,2,0,0,0,3,15,44,274,25,53,3,14,1,124,42,454,1,1,21,181,2,1,0,45,0,0,0,0]});
62 | dataMap.cppdata2=dataFormatter({2018:[22866.03,10687.5,8000.0,4750.0,0.0,0.0,0.0,9333.33,19133.33,13500.0,16945.26,11400.0,13141.51,9833.33,9071.43,11500.0,12516.13,11535.71,18411.89,7000.0,1500.0,11928.57,12447.51,9000.0,8000.0,0.0,12088.89,0.0,0.0,0.0,0.0]});
63 | dataMap.cppdata3=dataFormatter({2018:[29071.77,13875.0,10250.0,5500.0,0.0,0.0,0.0,11333.33,24200.0,17431.82,21653.28,14760.0,16735.85,11666.67,11571.43,15000.0,15903.23,14809.52,23484.58,8000.0,2000.0,15190.48,15585.64,11500.0,10000.0,0.0,15644.44,0.0,0.0,0.0,0.0]});
64 | dataMap.cppdata4=dataFormatter({2018:[16660.29,7500.0,5750.0,4000.0,0.0,0.0,0.0,7333.33,14066.67,9568.18,12237.23,8040.0,9547.17,8000.0,6571.43,8000.0,9129.03,8261.9,13339.21,6000.0,1000.0,8666.67,9309.39,6500.0,6000.0,0.0,8533.33,0.0,0.0,0.0,0.0]});
65 | dataMap.cppdata5=dataFormatter({2018:[0.91,0.75,0.5,0.5,0.0,0.0,0.0,0.67,0.87,0.59,0.76,0.8,0.64,1.0,0.64,0.0,0.67,0.64,0.74,0.0,0.0,0.48,0.63,1.0,0.0,0.0,0.78,0.0,0.0,0.0,0.0]});
66 | dataMap.cppdata6=dataFormatter({2018:[0.27,0.63,0.75,0.5,0.0,0.0,0.0,0.0,0.73,0.41,0.36,0.24,0.42,0.33,0.64,0.0,0.34,0.26,0.31,0.0,0.0,0.48,0.35,0.5,1.0,0.0,0.33,0.0,0.0,0.0,0.0]});
67 | dataMap.androiddata1=dataFormatter({2018:[450,0,7,3,1,0,1,0,434,198,461,62,2,13,34,2,178,56,915,0,4,0,4,7,12,0,74,1,0,0,0]});
68 | dataMap.androiddata2=dataFormatter({2018:[23723.33,0.0,7500.0,6166.67,8000.0,0.0,9500.0,0.0,18748.85,13020.2,15767.9,9467.74,7000.0,8730.77,8455.88,5750.0,10828.65,10535.71,14014.75,0.0,8250.0,0.0,7750.0,8142.86,9791.67,0.0,10250.0,9000.0,0.0,0.0,0.0]});
69 | dataMap.androiddata3=dataFormatter({2018:[30215.56,0.0,9857.14,7666.67,9000.0,0.0,12000.0,0.0,23658.99,16585.86,20060.74,12080.65,8500.0,10923.08,10852.94,7000.0,13775.28,13285.71,17634.97,0.0,10500.0,0.0,9250.0,10000.0,12500.0,0.0,13040.54,12000.0,0.0,0.0,0.0]});
70 | dataMap.androiddata4=dataFormatter({2018:[17231.11,0.0,5142.86,4666.67,7000.0,0.0,7000.0,0.0,13838.71,9454.55,11475.05,6854.84,5500.0,6538.46,6058.82,4500.0,7882.02,7785.71,10394.54,0.0,6000.0,0.0,6250.0,6285.71,7083.33,0.0,7459.46,6000.0,0.0,0.0,0.0]});
71 | dataMap.androiddata5=dataFormatter({2018:[0.94,0.0,0.14,0.33,0.0,0.0,1.0,0.0,0.79,0.6,0.72,0.58,0.5,0.38,0.29,0.0,0.61,0.55,0.63,0.0,0.5,0.0,0.5,0.57,0.17,0.0,0.65,0.0,0.0,0.0,0.0]});
72 | dataMap.androiddata6=dataFormatter({2018:[0.14,0.0,0.29,1.0,0.0,0.0,0.0,0.0,0.27,0.46,0.39,0.52,1.0,0.54,0.68,0.5,0.38,0.36,0.42,0.0,0.75,0.0,0.25,0.43,0.5,0.0,0.35,1.0,0.0,0.0,0.0]});
73 |
74 | option = {
75 | baseOption: {
76 | timeline: {
77 | // y: 0,
78 | axisType: 'category',
79 | // realtime: false,
80 | // loop: false,
81 | autoPlay: false,
82 | // currentIndex: 2,
83 | playInterval: 1000,
84 | // controlStyle: {
85 | // position: 'left'
86 | // },
87 | data: [
88 | 'java','C#','linux','python','web','C++','android'
89 | ],
90 | label: {
91 | formatter : function(s) {
92 | return s;
93 | }
94 | }
95 | },
96 | title: {
97 | subtext: '数据来自第八组萝卜中队'
98 | },
99 | tooltip: {
100 |
101 | },
102 | legend: {
103 | x: 'right',
104 | data: ['岗位需求量', '平均薪资', '最高薪资', '最低薪资', '本科及以上员工比例', '工作经验不限的比例'],
105 | //show:false
106 | },
107 | calculable : true,
108 | grid: {
109 | top: 80,
110 | bottom: 100,
111 | tooltip: {
112 | trigger: 'axis',
113 | axisPointer: {
114 | type: 'shadow',
115 | label: {
116 | show: true,
117 | formatter: function (params) {
118 | return params.value.replace('\n', '');
119 | }
120 | }
121 | }
122 | }
123 | },
124 | xAxis: [
125 | {
126 | 'type':'category',
127 | 'axisLabel':{'interval':0},
128 | 'data':[
129 | '北京','\n天津','河北','\n山西','内蒙古','\n辽宁','吉林','\n黑龙江',
130 | '上海','\n江苏','浙江','\n安徽','福建','\n江西','山东','\n河南',
131 | '湖北','\n湖南','广东','\n广西','海南','\n重庆','四川','\n贵州',
132 | '云南','\n西藏','陕西','\n甘肃','青海','\n宁夏','新疆'
133 | ],
134 | splitLine: {show: true}
135 | }
136 | ],
137 | yAxis: [
138 | {
139 | type: 'value',
140 | name: ''
141 | }
142 | ],
143 | series: [
144 | {name: '岗位需求量', type: 'bar'},
145 | {name: '平均薪资', type: 'bar'},
146 | {name: '最高薪资', type: 'bar'},
147 | {name: '最低薪资', type: 'bar'},
148 | {name: '本科及以上员工比例', type: 'bar'},
149 | {name: '工作经验不限的比例', type: 'bar'},
150 | ]
151 | },
152 | options: [
153 | {
154 | title: {text: '2018-4-java'},
155 | series: [
156 | {data: dataMap.javadata1['2018']},// menu4
157 | {data: dataMap.javadata2['2018']},// menu5
158 | {data: dataMap.javadata3['2018']},//menu6
159 | {data: dataMap.javadata4['2018']},// menu1
160 | {data: dataMap.javadata5['2018']},// menu2
161 | {data: dataMap.javadata6['2018']},// menu3
162 | ]
163 | },
164 | {
165 | title: {text: '2018-4-C#'},
166 | series: [
167 | {data: dataMap.cdata1['2018']},// menu4
168 | {data: dataMap.cdata2['2018']},// menu5
169 | {data: dataMap.cdata3['2018']},//menu6
170 | {data: dataMap.cdata4['2018']},// menu1
171 | {data: dataMap.cdata5['2018']},// menu2
172 | {data: dataMap.cdata6['2018']},// menu3
173 | ]
174 | },
175 | {
176 | title: {text: '2018-4-linux'},
177 | series: [
178 | {data: dataMap.linuxdata1['2018']},// menu4
179 | {data: dataMap.linuxdata2['2018']},// menu5
180 | {data: dataMap.linuxdata3['2018']},//menu6
181 | {data: dataMap.linuxdata4['2018']},// menu1
182 | {data: dataMap.linuxdata5['2018']},// menu2
183 | {data: dataMap.linuxdata6['2018']},// menu3
184 | ]
185 | },
186 | {
187 | title: {text: '2018-4-python'},
188 | series: [
189 | {data: dataMap.pythondata1['2018']},// menu4
190 | {data: dataMap.pythondata2['2018']},// menu5
191 | {data: dataMap.pythondata3['2018']},//menu6
192 | {data: dataMap.pythondata4['2018']},// menu1
193 | {data: dataMap.pythondata5['2018']},// menu2
194 | {data: dataMap.pythondata6['2018']},// menu3
195 | ]
196 | },
197 | {
198 | title: {text: '2018-4-web'},
199 | series: [
200 | {data: dataMap.webdata1['2018']},// menu4
201 | {data: dataMap.webdata2['2018']},// menu5
202 | {data: dataMap.webdata3['2018']},//menu6
203 | {data: dataMap.webdata4['2018']},// menu1
204 | {data: dataMap.webdata5['2018']},// menu2
205 | {data: dataMap.webdata6['2018']},// menu3
206 | ]
207 | },
208 | {
209 | title: {text: '2018-4-C++'},
210 | series: [
211 | {data: dataMap.cppdata1['2018']},// menu4
212 | {data: dataMap.cppdata2['2018']},// menu5
213 | {data: dataMap.cppdata3['2018']},//menu6
214 | {data: dataMap.cppdata4['2018']},// menu1
215 | {data: dataMap.cppdata5['2018']},// menu2
216 | {data: dataMap.cppdata6['2018']},// menu3
217 | ]
218 | },
219 | {
220 | title: {text: '2018-4-Android'},
221 | series: [
222 | {data: dataMap.androiddata1['2018']},// menu4
223 | {data: dataMap.androiddata2['2018']},// menu5
224 | {data: dataMap.androiddata3['2018']},//menu6
225 | {data: dataMap.androiddata4['2018']},// menu1
226 | {data: dataMap.androiddata5['2018']},// menu2
227 | {data: dataMap.androiddata6['2018']},// menu3
228 | ]
229 | }
230 | ]
231 | };
232 | $(function () {
233 | chartOutChar = echarts.init(document.getElementById('showChart'));
234 | chartOutChar.setOption(option);
235 |
236 | });
237 |
238 | // dispatchAction({
239 | // type: 'legendSelect',
240 | // // legend name
241 | // name: string
242 | // })
243 |
--------------------------------------------------------------------------------
/result/模块一_各大省会招聘概况/数据可视化-2018-4-26/view/view1.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 | 各大省会招聘概况
6 |
7 |
9 |
11 |
30 |
31 |
32 |
33 | 各大省会招聘概况
34 |
35 |
36 |
37 |
38 |
--------------------------------------------------------------------------------
/result/模块一_各大省会招聘概况/数据清洗阶段-2018-4-26/清洗过程.doc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/radishT/Job_Analysis/24a595ac551756279487c8a6d9f5e8bd03ce4465/result/模块一_各大省会招聘概况/数据清洗阶段-2018-4-26/清洗过程.doc
--------------------------------------------------------------------------------
/result/模块二_各大编程语言的工作能力成熟度分析/file:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/radishT/Job_Analysis/24a595ac551756279487c8a6d9f5e8bd03ce4465/result/模块二_各大编程语言的工作能力成熟度分析/file
--------------------------------------------------------------------------------
/result/模块二_各大编程语言的工作能力成熟度分析/数据可视化-2018-4-27/view/dynamic/EChartsDemo.war:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/radishT/Job_Analysis/24a595ac551756279487c8a6d9f5e8bd03ce4465/result/模块二_各大编程语言的工作能力成熟度分析/数据可视化-2018-4-27/view/dynamic/EChartsDemo.war
--------------------------------------------------------------------------------
/result/模块二_各大编程语言的工作能力成熟度分析/数据可视化-2018-4-27/view/static/android.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
88 |
89 |
90 |
91 |
92 |
93 |
94 |
95 |
--------------------------------------------------------------------------------
/result/模块二_各大编程语言的工作能力成熟度分析/数据可视化-2018-4-27/view/static/auto_scroll.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 | 职业技能分析图
9 |
10 |
11 |
12 |
13 |
14 |
15 |
31 |
32 |
33 |
34 |
35 |
36 |
37 |
38 |
39 |
40 |
41 |
42 |
43 |
44 |
45 |
46 |
47 |
48 |
49 |
50 |
51 |
52 |
53 |
54 |
55 |
56 |
57 |
58 |
59 |
60 |
61 |
62 |
63 |
64 |
65 |
--------------------------------------------------------------------------------
/result/模块二_各大编程语言的工作能力成熟度分析/数据可视化-2018-4-27/view/static/c#.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
88 |
89 |
90 |
91 |
92 |
93 |
94 |
95 |
--------------------------------------------------------------------------------
/result/模块二_各大编程语言的工作能力成熟度分析/数据可视化-2018-4-27/view/static/c++.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
88 |
89 |
90 |
91 |
92 |
93 |
94 |
95 |
--------------------------------------------------------------------------------
/result/模块二_各大编程语言的工作能力成熟度分析/数据可视化-2018-4-27/view/static/index.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
15 | 职业技能分析图
16 |
17 |
18 |
34 |
35 |
--------------------------------------------------------------------------------
/result/模块二_各大编程语言的工作能力成熟度分析/数据可视化-2018-4-27/view/static/java.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
88 |
89 |
90 |
91 |
92 |
93 |
94 |
95 |
--------------------------------------------------------------------------------
/result/模块二_各大编程语言的工作能力成熟度分析/数据可视化-2018-4-27/view/static/linux.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
88 |
89 |
90 |
91 |
92 |
93 |
94 |
95 |
--------------------------------------------------------------------------------
/result/模块二_各大编程语言的工作能力成熟度分析/数据可视化-2018-4-27/view/static/python.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
88 |
89 |
90 |
91 |
92 |
93 |
94 |
95 |
--------------------------------------------------------------------------------
/result/模块二_各大编程语言的工作能力成熟度分析/数据可视化-2018-4-27/view/static/web.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
88 |
89 |
90 |
91 |
92 |
93 |
94 |
95 |
--------------------------------------------------------------------------------
/result/模块二_各大编程语言的工作能力成熟度分析/数据清洗阶段-2018-4-27/清洗后_key_map.sql/lagou_export/c#.txt:
--------------------------------------------------------------------------------
1 | year 2
2 | yeas 4
3 | interacting 1
4 | ios 12
5 | remote 1
6 | building 6
7 | pick 1
8 | xml 79
9 | visual 60
10 | understanding 2
11 | diagnostic 1
12 | audience 1
13 | solid 1
14 | zookeeper 2
15 | dealing 1
16 | devices 2
17 | excellent 1
18 | raw 1
19 | google 2
20 | optional 1
21 | analysis 4
22 | ktv 1
23 | cdn 1
24 | lighting 1
25 | standards 2
26 | innovations 2
27 | linq 29
28 | andorid 1
29 | science 2
30 | metrics 1
31 | tfs 6
32 | providing 1
33 | restful 11
34 | cef 5
35 | least 3
36 | manual 1
37 | ceo 2
38 | cet 1
39 | procedures 2
40 | aggressive 1
41 | mssql 34
42 | interbase 1
43 | b 104
44 | website 1
45 | c 1064
46 | learn 2
47 | ooxml 1
48 | accessories 2
49 | i 5
50 | k 6
51 | l 1
52 | vlc 1
53 | knockoutjs 1
54 | n 2
55 | abp 1
56 | o 6
57 | singalr 1
58 | p 1
59 | s 220
60 | t 1
61 | u 7
62 | modbus 2
63 | qualities 1
64 | y 1
65 | information 1
66 | routines 1
67 | standard 2
68 | reports 2
69 | msdn 1
70 | good 12
71 | specifications 2
72 | deploy 2
73 | webservcie 1
74 | spring 5
75 | pad 1
76 | post 1
77 | dapper 7
78 | implement 1
79 | finish 1
80 | nhibenate 2
81 | protocols 1
82 | others 1
83 | its 1
84 | gps 3
85 | arcore 1
86 | ado 1
87 | tests 1
88 | shell 3
89 | guideline 1
90 | http 56
91 | technical 3
92 | proactive 3
93 | nodejs 3
94 | prevent 1
95 | ehchache 1
96 | functionalities 1
97 | provides 3
98 | webforms 2
99 | responsible 3
100 | jetty 1
101 | winformsocket 2
102 | afc 1
103 | effectiveness 1
104 | practicing 1
105 | perform 1
106 | mobile 3
107 | multiple 2
108 | prewritten 1
109 | powerbi 1
110 | powerdesigner 3
111 | pda 2
112 | improvement 1
113 | ivy 4
114 | windowsforms 1
115 | activex 2
116 | environment 1
117 | myql 1
118 | vpn 1
119 | service 31
120 | handling 1
121 | requirejs 1
122 | pdt 1
123 | tomcat 2
124 | focus 1
125 | pkpm 2
126 | years 6
127 | qualifications 4
128 | write 5
129 | flow 2
130 | hololens 1
131 | cli 2
132 | testing 7
133 | clr 3
134 | understand 1
135 | script 7
136 | aeon 1
137 | silverlight 7
138 | angular 5
139 | system 1
140 | windowsce 1
141 | kafka 2
142 | analyze 1
143 | openxml 1
144 | mysq 1
145 | owin 2
146 | gui 4
147 | razor 1
148 | presenting 1
149 | fundamentals 1
150 | algorithms 1
151 | tob 1
152 | against 1
153 | functionality 1
154 | local 1
155 | rails 1
156 | echarts 1
157 | vss 9
158 | deployment 1
159 | erp 41
160 | product 7
161 | robust 1
162 | television 1
163 | spirit 1
164 | produce 2
165 | prototype 1
166 | javasctipt 1
167 | tpl 1
168 | framework 85
169 | php 6
170 | rose 2
171 | entity 21
172 | com 11
173 | environments 1
174 | vue 5
175 | screen 1
176 | employee 1
177 | windorm 1
178 | mode 1
179 | automation 8
180 | optimize 1
181 | sqlsever 1
182 | etc 4
183 | enhancements 1
184 | websocket 7
185 | visualize 1
186 | eth 1
187 | ffmpeg 3
188 | net 514
189 | etl 1
190 | verification 1
191 | new 11
192 | including 5
193 | read 3
194 | already 1
195 | snomed 1
196 | less 2
197 | unit 1
198 | improve 1
199 | basic 4
200 | unix 6
201 | outing 2
202 | financial 1
203 | jquery 129
204 | pki 1
205 | vsto 3
206 | unity 5
207 | weui 2
208 | and 1
209 | winsock 5
210 | design 11
211 | soapui 1
212 | working 6
213 | mongodb 19
214 | crm 10
215 | magento 1
216 | rpc 1
217 | efcore 1
218 | plc 4
219 | ant 4
220 | quartz 2
221 | contributing 1
222 | lamp 1
223 | requirement 2
224 | sqlit 1
225 | application 7
226 | spoken 4
227 | msmq 1
228 | xslt 6
229 | maintenance 3
230 | wall 1
231 | forms 1
232 | wininet 4
233 | expanding 1
234 | css 123
235 | aop 3
236 | winserver 1
237 | maintain 3
238 | establishing 1
239 | orcle 1
240 | nhibernate 12
241 | advertising 2
242 | ajax 105
243 | professional 2
244 | skills 15
245 | pmp 3
246 | java 55
247 | cordova 1
248 | pacs 1
249 | jave 1
250 | pms 1
251 | english 9
252 | optimizing 1
253 | api 57
254 | nhiberate 1
255 | fully 1
256 | cto 2
257 | app 20
258 | ext 1
259 | using 4
260 | cache 1
261 | xamarin 3
262 | investigating 1
263 | mircosoft 1
264 | javascript 149
265 | nosql 19
266 | geek 1
267 | ooad 1
268 | creating 2
269 | alternatives 1
270 | directx 1
271 | memcached 5
272 | document 1
273 | adonet 1
274 | hibernate 1
275 | cookies 3
276 | aspnet 1
277 | routing 1
278 | knowledgeable 1
279 | vendor 1
280 | does 1
281 | kinect 1
282 | netbpm 2
283 | popular 1
284 | arm 1
285 | perforce 5
286 | cvs 1
287 | winfom 1
288 | boostrap 1
289 | forecasting 1
290 | team 10
291 | services 3
292 | rtp 1
293 | automated 1
294 | mogondb 1
295 | classic 1
296 | ppt 3
297 | asp 13
298 | medical 1
299 | engagement 1
300 | documents 1
301 | developing 2
302 | run 1
303 | unittest 1
304 | incorporate 1
305 | microsoft 19
306 | research 2
307 | restfulapi 1
308 | lifecycle 1
309 | features 2
310 | view 2
311 | white 1
312 | lis 2
313 | atl 1
314 | atm 5
315 | results 3
316 | hooking 4
317 | tech 1
318 | sqlite 13
319 | enity 1
320 | verbal 2
321 | engineering 5
322 | develop 5
323 | sharing 1
324 | technologies 6
325 | colleagues 1
326 | designs 1
327 | compatibility 1
328 | dotnetty 1
329 | next 2
330 | android 14
331 | edition 1
332 | video 1
333 | beijing 1
334 | solidworks 1
335 | winform 191
336 | mono 2
337 | not 1
338 | sqlserver 104
339 | documenting 1
340 | debugging 5
341 | hadoop 1
342 | javascprit 1
343 | news 4
344 | center 2
345 | wap 2
346 | windows 138
347 | engineer 4
348 | anomalies 1
349 | remoting 3
350 | xhtml 4
351 | manage 1
352 | grey 1
353 | guidance 1
354 | risk 1
355 | window 1
356 | finance 1
357 | applications 1
358 | studio 56
359 | play 1
360 | leader 7
361 | developers 1
362 | palm 1
363 | quickly 1
364 | interactive 1
365 | extracting 1
366 | program 5
367 | when 1
368 | required 4
369 | redis 40
370 | proficiency 3
371 | multi 1
372 | jit 1
373 | wcf 77
374 | plan 1
375 | case 1
376 | hardware 2
377 | insightful 1
378 | leveldb 1
379 | philosophy 1
380 | xunit 1
381 | creative 1
382 | npoi 1
383 | socekt 1
384 | feasibility 1
385 | provide 1
386 | opencascade 1
387 | phone 3
388 | style 1
389 | centos 1
390 | boss 1
391 | dotnet 4
392 | angela 1
393 | log 1
394 | enterprise 1
395 | methods 1
396 | testable 2
397 | bugzilla 4
398 | smart 3
399 | controltemplate 1
400 | webserivce 2
401 | computer 3
402 | premium 2
403 | web 207
404 | entityframwork 1
405 | phases 1
406 | enables 1
407 | efficient 2
408 | cosmos 1
409 | opengl 12
410 | reactjs 1
411 | shopify 1
412 | organization 1
413 | webservice 61
414 | levels 1
415 | wfp 1
416 | autofac 3
417 | used 1
418 | experience 49
419 | cloud 1
420 | protocol 1
421 | responsibilities 5
422 | sliverlight 1
423 | group 1
424 | fixing 2
425 | vxworks 1
426 | servicefabric 1
427 | webvr 1
428 | eclipse 1
429 | daily 3
430 | job 6
431 | dhtml 3
432 | soap 2
433 | itil 1
434 | udp 14
435 | contribute 1
436 | webkit 9
437 | candidate 1
438 | database 2
439 | bacnet 1
440 | ltc 1
441 | win 6
442 | designed 2
443 | rest 2
444 | process 3
445 | requirements 4
446 | chromium 1
447 | debug 3
448 | ddd 4
449 | designer 1
450 | easyui 15
451 | third 1
452 | build 1
453 | visio 3
454 | jqeury 1
455 | lua 1
456 | vuejs 1
457 | jqueryui 1
458 | further 1
459 | user 1
460 | opencv 4
461 | methodology 1
462 | webservices 1
463 | mostly 1
464 | projects 2
465 | emgucv 1
466 | videos 1
467 | executing 2
468 | conducting 1
469 | evaluation 1
470 | dev 9
471 | finds 1
472 | fix 1
473 | bat 2
474 | complex 2
475 | manufacturing 1
476 | knowledge 26
477 | databases 1
478 | documentation 3
479 | ai 3
480 | opportunity 1
481 | engineers 1
482 | personal 1
483 | rtsp 1
484 | javascipt 1
485 | senior 3
486 | bbq 4
487 | ar 11
488 | profiling 1
489 | plans 2
490 | moxa 1
491 | webar 1
492 | looking 2
493 | sdk 11
494 | agile 3
495 | drive 1
496 | attitude 1
497 | strong 24
498 | supermap 2
499 | prototyping 1
500 | bi 2
501 | dubbo 1
502 | skyline 1
503 | coding 3
504 | bs 19
505 | responsibility 2
506 | oculus 1
507 | wms 3
508 | embedded 1
509 | cc 2
510 | innovation 1
511 | rdbms 3
512 | business 4
513 | operational 1
514 | familiar 6
515 | bing 2
516 | integrate 1
517 | cs 13
518 | ct 1
519 | winforms 6
520 | cv 1
521 | partner 1
522 | wpfwinformsilverlight 1
523 | monogodb 1
524 | printstudio 1
525 | db 4
526 | added 1
527 | terabytes 1
528 | arkit 1
529 | language 3
530 | sites 2
531 | interpersonal 3
532 | div 23
533 | aspnetcore 2
534 | reply 1
535 | workbench 1
536 | ea 1
537 | programming 2
538 | info 1
539 | ef 29
540 | test 21
541 | chrome 1
542 | wpf 196
543 | restapi 1
544 | es 1
545 | budgets 1
546 | nunit 2
547 | scrum 2
548 | helps 1
549 | party 6
550 | identifying 1
551 | analytical 1
552 | session 4
553 | golang 2
554 | capable 1
555 | mach 3
556 | yui 1
557 | ft 1
558 | desktop 1
559 | related 2
560 | uml 14
561 | skill 7
562 | await 2
563 | json 38
564 | client 1
565 | gc 3
566 | views 1
567 | reporting 1
568 | knockout 2
569 | billions 1
570 | dll 6
571 | custom 1
572 | asia 1
573 | existing 6
574 | form 10
575 | hub 1
576 | management 3
577 | soket 2
578 | myssql 2
579 | big 2
580 | expert 1
581 | advanced 1
582 | windowsserver 1
583 | improves 1
584 | bim 1
585 | hr 3
586 | invoke 1
587 | halcon 1
588 | hook 2
589 | elasticsearch 3
590 | wss 10
591 | sqllite 3
592 | ia 1
593 | model 1
594 | https 4
595 | prism 2
596 | python 17
597 | il 1
598 | large 2
599 | issue 3
600 | cookie 1
601 | surface 1
602 | im 1
603 | maturity 1
604 | io 3
605 | ip 39
606 | sense 1
607 | wtl 1
608 | slg 1
609 | certification 1
610 | sli 1
611 | traditional 1
612 | flatbuffers 1
613 | field 1
614 | contents 1
615 | wtt 1
616 | slo 1
617 | doc 1
618 | status 1
619 | server 193
620 | clients 1
621 | works 1
622 | dom 12
623 | js 40
624 | tailor 1
625 | thinking 1
626 | products 2
627 | ehcache 1
628 | world 1
629 | jenkins 3
630 | orcal 2
631 | ability 3
632 | together 1
633 | creator 1
634 | ftp 2
635 | may 2
636 | orcale 2
637 | fastreport 1
638 | weblogic 2
639 | willingness 1
640 | health 1
641 | trigger 1
642 | rabbitmq 6
643 | hessian 1
644 | mbp 1
645 | oral 1
646 | complete 1
647 | powerdesign 1
648 | soa 11
649 | webgl 3
650 | usb 3
651 | wwf 2
652 | main 1
653 | serve 3
654 | office 7
655 | supersocket 1
656 | soso 1
657 | swagger 1
658 | ihistorian 1
659 | high 2
660 | solution 1
661 | reviews 1
662 | oracl 1
663 | continuous 2
664 | communication 11
665 | different 2
666 | nsis 1
667 | winrt 1
668 | workflow 6
669 | mq 1
670 | nginx 1
671 | mr 2
672 | ms 27
673 | follows 1
674 | spe 1
675 | photoshop 1
676 | experiences 1
677 | plus 9
678 | task 2
679 | nb 1
680 | orleans 1
681 | position 2
682 | angularjs 10
683 | problems 1
684 | shader 2
685 | no 1
686 | bom 1
687 | code 7
688 | passionate 1
689 | blend 3
690 | box 3
691 | storage 1
692 | demo 3
693 | glsl 1
694 | sql 397
695 | mef 3
696 | qml 1
697 | oa 5
698 | lamada 1
699 | codefirst 1
700 | postgresql 4
701 | consumers 1
702 | mes 36
703 | ok 2
704 | met 1
705 | experienced 1
706 | highly 10
707 | oo 3
708 | delphi 2
709 | execution 2
710 | processes 2
711 | or 1
712 | initiative 1
713 | determine 1
714 | mfc 6
715 | thread 3
716 | structure 1
717 | ruby 2
718 | master 4
719 | windws 1
720 | accountabilities 2
721 | pc 17
722 | pd 2
723 | winfrom 6
724 | gdal 1
725 | online 4
726 | socket 74
727 | pl 5
728 | httprestfull 1
729 | curd 1
730 | sketchup 1
731 | mangodb 1
732 | discipline 1
733 | ssl 1
734 | uwp 5
735 | objective 1
736 | qa 1
737 | dynamics 5
738 | lambda 1
739 | mvvm 28
740 | devops 1
741 | makefile 4
742 | band 1
743 | goals 1
744 | qq 1
745 | based 2
746 | github 3
747 | qt 8
748 | closely 1
749 | brt 2
750 | stl 2
751 | fluent 2
752 | quality 6
753 | concepts 1
754 | rf 2
755 | cassandra 3
756 | processing 1
757 | device 2
758 | websphere 2
759 | components 2
760 | functional 1
761 | access 5
762 | fundamental 2
763 | industry 4
764 | global 2
765 | jmeter 1
766 | btc 1
767 | josn 1
768 | current 2
769 | xpath 6
770 | mis 4
771 | datatemplate 1
772 | tcpip 3
773 | operating 1
774 | so 2
775 | unreal 1
776 | sp 2
777 | holes 1
778 | key 2
779 | ss 1
780 | activemq 2
781 | expression 1
782 | rocketmq 1
783 | one 2
784 | svn 33
785 | designing 2
786 | releases 1
787 | xaml 16
788 | extensive 1
789 | aiax 2
790 | bug 37
791 | teamwork 5
792 | assist 3
793 | troubleshooting 1
794 | rfid 4
795 | ooa 5
796 | protobuf 1
797 | tv 1
798 | jsonp 1
799 | ood 9
800 | willing 3
801 | project 2
802 | express 2
803 | webapi 38
804 | icd 1
805 | nopcommerce 1
806 | qss 1
807 | oop 15
808 | ui 29
809 | sever 8
810 | monetize 1
811 | romting 2
812 | mysql 168
813 | written 6
814 | thinkpad 1
815 | passion 1
816 | navisworks 1
817 | oracle 190
818 | ensure 2
819 | opc 3
820 | solutions 1
821 | degree 2
822 | ide 5
823 | vb 4
824 | vc 8
825 | assurance 1
826 | vf 1
827 | demonstrate 1
828 | vr 8
829 | vs 15
830 | support 5
831 | mongo 2
832 | implements 1
833 | needed 1
834 | learning 1
835 | inventory 1
836 | scada 2
837 | autocad 2
838 | devexpress 18
839 | docker 5
840 | wf 2
841 | odata 1
842 | datagridview 2
843 | linux 36
844 | ionic 1
845 | ws 1
846 | hbase 2
847 | coverage 1
848 | cases 5
849 | reading 2
850 | orm 23
851 | technology 2
852 | informatics 1
853 | webform 19
854 | extjs 5
855 | windowsform 1
856 | bentley 1
857 | bash 1
858 | relational 1
859 | osg 1
860 | software 18
861 | frameworks 1
862 | gcc 1
863 | binding 1
864 | react 2
865 | desirable 4
866 | moq 1
867 | relation 1
868 | feedback 1
869 | problem 3
870 | review 10
871 | premises 1
872 | freewheel 3
873 | azure 7
874 | wifi 2
875 | teams 2
876 | work 7
877 | visualstudio 4
878 | focusing 1
879 | gdi 7
880 | ott 1
881 | samples 1
882 | innovative 1
883 | vba 4
884 | saas 2
885 | memcahced 1
886 | fabric 1
887 | struts 1
888 | following 1
889 | ifix 1
890 | word 1
891 | internal 1
892 | architect 1
893 | enjoy 1
894 | sharepoint 15
895 | requests 1
896 | bootstrap 28
897 | foundation 2
898 | codeplex 1
899 | tools 3
900 | across 1
901 | iis 11
902 | feature 1
903 | writing 2
904 | collective 1
905 | seajs 2
906 | power 2
907 | firmware 1
908 | include 1
909 | netcore 2
910 | netframework 1
911 | dicom 4
912 | nice 3
913 | dotnetcore 1
914 | token 1
915 | excel 2
916 | help 1
917 | oracel 1
918 | htmlcssjavascript 1
919 | revit 2
920 | threading 1
921 | minimum 2
922 | architectural 1
923 | first 1
924 | wince 2
925 | data 3
926 | entityframework 11
927 | vuew 1
928 | spec 2
929 | create 2
930 | html 109
931 | memcache 7
932 | matlab 1
933 | development 27
934 | ubuntu 2
935 | maven 2
936 | cmmi 3
937 | mining 1
938 | core 22
939 | qualification 1
940 | onpremises 1
941 | emgu 2
942 | dash 1
943 | solving 3
944 | arcgis 7
945 | configuration 1
946 | cae 1
947 | cad 3
948 | mybatis 2
949 | scale 1
950 | orscle 1
951 | platform 2
952 | gis 18
953 | git 21
954 | operations 1
955 | flex 2
956 | mvc 165
957 | player 1
958 | tcp 56
959 | memorycache 1
960 | will 3
961 | mvp 7
962 | sourcesafe 1
963 | implementation 2
964 | command 1
965 | async 2
966 | enforces 1
967 | enforcer 1
968 | cbs 1
969 | tdd 2
970 | devexpre 1
971 | compliance 1
972 | efw 1
973 | ioc 6
974 |
--------------------------------------------------------------------------------
/result/模块二_各大编程语言的工作能力成熟度分析/数据清洗阶段-2018-4-27/清洗后_key_map.sql/lagou_export/python.txt:
--------------------------------------------------------------------------------
1 | stack 2
2 | mentor 2
3 | greenlet 1
4 | theano 2
5 | ios 7
6 | icmp 1
7 | without 1
8 | offer 3
9 | fit 1
10 | bat 1
11 | xml 2
12 | rabbit 1
13 | understanding 1
14 | visual 1
15 | lxml 2
16 | near 1
17 | vim 11
18 | uwsgi 3
19 | nova 1
20 | knowledge 1
21 | databases 1
22 | zookeeper 9
23 | excellent 1
24 | ai 6
25 | lvs 5
26 | google 6
27 | infra 1
28 | gunicorn 5
29 | cdn 27
30 | am 1
31 | easy 1
32 | sqlalchemy 13
33 | grpc 1
34 | linq 1
35 | flexible 1
36 | turning 1
37 | science 2
38 | detail 1
39 | agile 1
40 | sdk 1
41 | sdn 3
42 | pony 1
43 | ba 2
44 | schema 1
45 | strong 1
46 | restful 30
47 | bf 1
48 | bi 1
49 | least 3
50 | dubbo 1
51 | bottle 3
52 | ceo 4
53 | coding 4
54 | adaptive 1
55 | search 1
56 | selenium 5
57 | spark 11
58 | systems 2
59 | responsibility 2
60 | beautifulsoup 3
61 | apscheduler 1
62 | systemtap 1
63 | seo 1
64 | querytype 1
65 | innovation 1
66 | cd 2
67 | b 5
68 | kvm 4
69 | c 117
70 | bdd 1
71 | salt 1
72 | ci 4
73 | familiar 1
74 | l 9
75 | practices 1
76 | logical 1
77 | resiful 1
78 | flv 4
79 | qian 2
80 | r 2
81 | rdd 1
82 | s 5
83 | fast 2
84 | u 16
85 | pylint 1
86 | imac 1
87 | cloudfoundry 2
88 | haproxy 1
89 | timeline 1
90 | db 5
91 | gvent 1
92 | rds 2
93 | cgi 1
94 | nltk 1
95 | language 1
96 | acm 2
97 | good 3
98 | spring 7
99 | acid 1
100 | div 2
101 | act 1
102 | kivy 1
103 | urllib 2
104 | multithreading 1
105 | radius 1
106 | smaug 1
107 | programming 2
108 | jvm 1
109 | ef 1
110 | zabbix 4
111 | cinder 1
112 | elk 4
113 | quorum 1
114 | restframework 1
115 | dynamodb 2
116 | druid 1
117 | adx 1
118 | shell 58
119 | freezer 1
120 | http 57
121 | scrum 5
122 | wps 1
123 | icpc 1
124 | deep 2
125 | technical 1
126 | golang 17
127 | nodejs 7
128 | getopenid 1
129 | related 3
130 | uml 2
131 | mongdb 2
132 | responsible 1
133 | vagrant 1
134 | pcl 1
135 | json 1
136 | company 2
137 | jetty 1
138 | oauth 3
139 | ge 1
140 | owner 2
141 | go 30
142 | mobile 2
143 | multiple 1
144 | grand 2
145 | home 2
146 | schedule 1
147 | environment 2
148 | micro 1
149 | service 1
150 | heritrix 1
151 | pyramid 1
152 | hc 1
153 | splash 1
154 | httpclient 2
155 | agg 1
156 | tomcat 10
157 | hr 2
158 | changing 1
159 | years 3
160 | elasticsearch 10
161 | slack 1
162 | model 1
163 | https 10
164 | stackstorm 2
165 | flow 3
166 | tasks 1
167 | travelflan 2
168 | reduce 1
169 | cookie 1
170 | io 4
171 | ip 21
172 | testing 2
173 | understand 1
174 | dns 5
175 | handle 1
176 | mongokit 2
177 | cgroup 1
178 | script 1
179 | angular 3
180 | grafana 2
181 | strongly 1
182 | kafka 8
183 | javascrip 1
184 | gui 2
185 | messagequeue 3
186 | snapshot 1
187 | presto 1
188 | server 4
189 | familiarity 1
190 | js 17
191 | deliver 2
192 | mac 5
193 | products 1
194 | convnet 2
195 | library 1
196 | navicat 1
197 | rails 1
198 | jenkins 7
199 | member 1
200 | ability 1
201 | libevent 1
202 | tengine 4
203 | map 1
204 | macos 2
205 | implementations 1
206 | proficient 2
207 | scipy 3
208 | erp 5
209 | creator 1
210 | product 1
211 | within 1
212 | rhel 1
213 | wxpython 1
214 | torch 2
215 | kv 1
216 | rabbitmq 20
217 | url 1
218 | pandas 13
219 | framework 5
220 | able 2
221 | cmake 1
222 | php 33
223 | italkier 2
224 | opentsdb 1
225 | beego 1
226 | etcd 2
227 | improvements 1
228 | numy 1
229 | coo 1
230 | strdepartment 1
231 | soa 3
232 | lr 1
233 | use 1
234 | hashing 1
235 | vue 6
236 | trafficserver 3
237 | high 1
238 | automation 1
239 | optimize 1
240 | scarpy 1
241 | macbook 1
242 | websocket 1
243 | fastdfs 5
244 | traveflan 1
245 | etl 3
246 | communication 1
247 | net 2
248 | new 5
249 | realtime 1
250 | linix 1
251 | nginx 34
252 | mq 1
253 | mr 1
254 | spa 2
255 | juggle 1
256 | webgui 1
257 | pressure 1
258 | basis 1
259 | isappinstalled 1
260 | plus 1
261 | amazon 1
262 | ioloop 1
263 | spm 4
264 | apis 1
265 | typing 1
266 | gitlab 3
267 | nfs 1
268 | angularjs 4
269 | lbs 1
270 | unix 35
271 | supervisor 2
272 | nfv 1
273 | dsp 1
274 | code 7
275 | pythonic 2
276 | passionate 1
277 | fasting 1
278 | demo 1
279 | jquery 14
280 | sql 51
281 | libvirt 2
282 | grunt 1
283 | effective 1
284 | oa 2
285 | postgresql 38
286 | and 1
287 | design 1
288 | ros 1
289 | pyqt 3
290 | working 1
291 | backend 1
292 | teamcity 1
293 | mongodb 97
294 | apachengnix 1
295 | crm 9
296 | oo 5
297 | qfusion 1
298 | or 1
299 | initiative 2
300 | nonsql 1
301 | rpc 3
302 | os 4
303 | bachelor 1
304 | ruby 8
305 | master 1
306 | pc 3
307 | conduct 1
308 | pg 2
309 | opensource 2
310 | socket 7
311 | scikit 1
312 | pl 1
313 | selenuim 1
314 | pyquery 1
315 | pm 1
316 | coming 1
317 | apsaradb 2
318 | css 51
319 | aop 1
320 | nigix 1
321 | csv 2
322 | ssh 2
323 | nhibernate 1
324 | py 1
325 | ajax 14
326 | constraints 1
327 | ssl 2
328 | htmlparser 1
329 | professional 1
330 | patch 3
331 | skills 1
332 | qa 2
333 | wireshark 2
334 | java 91
335 | speaking 1
336 | mvvm 4
337 | ctf 2
338 | devops 20
339 | okr 1
340 | english 2
341 | api 44
342 | state 1
343 | nix 2
344 | defined 1
345 | cto 1
346 | stackoverflow 5
347 | app 22
348 | based 1
349 | github 26
350 | cache 3
351 | qt 6
352 | openresty 1
353 | pgsql 1
354 | javascript 54
355 | nosql 52
356 | echoing 1
357 | quality 2
358 | node 1
359 | bsd 3
360 | groovy 2
361 | cassandra 5
362 | matplotlib 3
363 | rancher 1
364 | difference 1
365 | flask 118
366 | memcached 5
367 | ironic 1
368 | cmdb 6
369 | odoo 16
370 | document 1
371 | numpy 11
372 | hibernate 2
373 | two 1
374 | ansible 12
375 | matching 1
376 | xpath 10
377 | pregresql 1
378 | celery 18
379 | releasing 1
380 | desired 1
381 | djando 1
382 | thrift 1
383 | pytorch 1
384 | mit 3
385 | tcpip 1
386 | backbone 2
387 | webserver 5
388 | so 1
389 | dtrace 1
390 | postgres 3
391 | arp 1
392 | activemq 3
393 | gitflow 1
394 | storm 3
395 | necessary 1
396 | languages 1
397 | star 1
398 | one 1
399 | svn 7
400 | team 2
401 | services 2
402 | svm 2
403 | openerp 15
404 | opnfv 1
405 | tb 2
406 | rtp 2
407 | jdk 5
408 | pull 2
409 | nlp 2
410 | ppt 1
411 | bug 8
412 | sanic 1
413 | troubleshooting 2
414 | kubernetes 12
415 | bus 1
416 | ooc 1
417 | agent 1
418 | ood 1
419 | qemu 2
420 | express 1
421 | multiprocessing 1
422 | pairs 1
423 | restfulapi 1
424 | oop 3
425 | features 1
426 | apache 13
427 | ui 5
428 | sever 1
429 | things 1
430 | keras 2
431 | mysql 183
432 | keyword 1
433 | connextion 1
434 | yaml 1
435 | surprise 1
436 | oracle 22
437 | boosting 1
438 | sqlite 4
439 | marathon 1
440 | solutions 1
441 | query 1
442 | batch 1
443 | degree 2
444 | idc 1
445 | engineering 1
446 | sharing 1
447 | pro 1
448 | twistd 1
449 | technologies 3
450 | improving 1
451 | timelines 1
452 | angualrjs 1
453 | consul 1
454 | dbaas 1
455 | vs 1
456 | internet 1
457 | sublime 1
458 | full 1
459 | mongo 6
460 | android 2
461 | sklearn 1
462 | tcpdump 1
463 | learning 5
464 | autocad 2
465 | docker 40
466 | mesos 2
467 | sqlserver 4
468 | linux 260
469 | debugging 1
470 | hadoop 19
471 | boot 1
472 | samba 1
473 | icbu 2
474 | hbase 14
475 | netty 3
476 | delivery 1
477 | webpy 2
478 | green 1
479 | cases 1
480 | paced 1
481 | ansimble 1
482 | varnish 3
483 | ssdb 1
484 | reading 3
485 | orm 6
486 | technology 1
487 | windows 3
488 | extjs 1
489 | xhtml 1
490 | asyncio 3
491 | awk 1
492 | money 1
493 | learner 1
494 | nutch 1
495 | hdfs 2
496 | bash 3
497 | time 1
498 | aws 5
499 | base 1
500 | studio 1
501 | leader 1
502 | cqrs 1
503 | software 4
504 | osi 1
505 | frameworks 1
506 | tripleo 1
507 | react 6
508 | whole 1
509 | influxdb 1
510 | redis 107
511 | required 1
512 | proficiency 1
513 | problem 1
514 | review 9
515 | gevent 4
516 | pyside 1
517 | azure 2
518 | xen 2
519 | cloudstack 1
520 | openapi 1
521 | work 3
522 | players 1
523 | caffe 4
524 | emacs 6
525 | scrapy 8
526 | insight 1
527 | jquer 1
528 | zk 1
529 | saas 14
530 | comfortable 1
531 | fabric 2
532 | struts 1
533 | following 2
534 | openstack 15
535 | word 1
536 | centos 4
537 | hive 5
538 | internal 1
539 | study 1
540 | alembic 1
541 | boss 2
542 | widget 2
543 | falcon 2
544 | bootstrap 7
545 | requests 5
546 | rtmp 4
547 | tensorflow 11
548 | computer 3
549 | feature 1
550 | web 239
551 | writing 1
552 | event 1
553 | djangorestframework 1
554 | uioc 1
555 | saltstack 10
556 | architecture 1
557 | tornado 86
558 | twisted 3
559 | consistent 1
560 | jsoup 3
561 | reactjs 1
562 | nice 1
563 | excel 1
564 | vsphere 1
565 | gfs 5
566 | fellow 1
567 | organized 1
568 | webservice 2
569 | trello 1
570 | pythonweb 2
571 | quick 2
572 | data 1
573 | own 1
574 | used 1
575 | blog 5
576 | experience 3
577 | cloud 3
578 | protocol 1
579 | teammates 1
580 | overflow 1
581 | ldap 2
582 | distributing 1
583 | flume 1
584 | dau 1
585 | html 51
586 | memcache 3
587 | potential 1
588 | jira 1
589 | matlab 2
590 | daocloud 8
591 | laravel 1
592 | development 6
593 | fixing 1
594 | eda 1
595 | ubuntu 5
596 | maven 2
597 | runtime 1
598 | pytho 1
599 | qualification 2
600 | pycharm 5
601 | geeeeek 1
602 | daily 1
603 | zeromq 3
604 | squid 3
605 | pyspider 5
606 | udp 1
607 | request 3
608 | solving 1
609 | arcgis 2
610 | scala 1
611 | line 1
612 | mybatis 5
613 | italki 6
614 | pyspark 2
615 | platform 2
616 | database 3
617 | git 62
618 | cap 1
619 | servers 1
620 | asynchronous 1
621 | mvc 12
622 | tcp 26
623 | rest 6
624 | sap 1
625 | debug 2
626 | phantomjs 1
627 | implementation 1
628 | assistant 2
629 | chef 2
630 | soda 1
631 | command 1
632 | hls 4
633 | puppet 5
634 | mariadb 1
635 | django 166
636 | performance 2
637 | swarm 1
638 | tdd 3
639 | visio 1
640 | cbu 2
641 | namespace 1
642 | challenge 1
643 | attention 1
644 | lua 3
645 | cassendra 1
646 | ngnix 4
647 | slash 2
648 | growth 2
649 | travis 1
650 | opencv 2
651 |
--------------------------------------------------------------------------------
/result/模块二_各大编程语言的工作能力成熟度分析/数据清洗阶段-2018-4-27/清洗过程.doc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/radishT/Job_Analysis/24a595ac551756279487c8a6d9f5e8bd03ce4465/result/模块二_各大编程语言的工作能力成熟度分析/数据清洗阶段-2018-4-27/清洗过程.doc
--------------------------------------------------------------------------------
/result/源文件/CrawlerApp-0.0.1-SNAPSHOT-javadoc.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/radishT/Job_Analysis/24a595ac551756279487c8a6d9f5e8bd03ce4465/result/源文件/CrawlerApp-0.0.1-SNAPSHOT-javadoc.jar
--------------------------------------------------------------------------------
/result/源文件/CrawlerApp-0.0.1-SNAPSHOT-sources.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/radishT/Job_Analysis/24a595ac551756279487c8a6d9f5e8bd03ce4465/result/源文件/CrawlerApp-0.0.1-SNAPSHOT-sources.jar
--------------------------------------------------------------------------------
/result/源文件/CrawlerApp-0.0.1-SNAPSHOT.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/radishT/Job_Analysis/24a595ac551756279487c8a6d9f5e8bd03ce4465/result/源文件/CrawlerApp-0.0.1-SNAPSHOT.jar
--------------------------------------------------------------------------------
/result/项目演讲.ppt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/radishT/Job_Analysis/24a595ac551756279487c8a6d9f5e8bd03ce4465/result/项目演讲.ppt
--------------------------------------------------------------------------------
/sources/hadoop.dll:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/radishT/Job_Analysis/24a595ac551756279487c8a6d9f5e8bd03ce4465/sources/hadoop.dll
--------------------------------------------------------------------------------
/sources/winutils.exe:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/radishT/Job_Analysis/24a595ac551756279487c8a6d9f5e8bd03ce4465/sources/winutils.exe
--------------------------------------------------------------------------------
/src/main/java/com/edmund/crawler/JobCrawler.java:
--------------------------------------------------------------------------------
1 | package com.edmund.crawler;
2 |
3 | import java.io.FileInputStream;
4 | import java.io.IOException;
5 | import java.util.ArrayList;
6 | import java.util.List;
7 | import java.util.Map;
8 | import java.util.Properties;
9 |
10 | import org.openqa.selenium.By;
11 | import org.openqa.selenium.WebElement;
12 | import org.openqa.selenium.chrome.ChromeDriver;
13 | import org.openqa.selenium.support.ui.ExpectedConditions;
14 | import org.openqa.selenium.support.ui.WebDriverWait;
15 |
16 | import com.edmund.utils.DBUtils;
17 | import com.edmund.vo.Job;
18 |
19 | /**
20 | * Crawler class for scraping job listings
21 | *
22 | * @author Edmund
23 | *
24 | */
25 | public class JobCrawler {
26 | // private static String[] keys = { "java", "python", "c++", "android",
27 | // "php" };
28 | private static String[] keys = { "web" };
29 | private static Map<String, List<String>> infos = null;
30 |
31 | private static List<String> cities = null;
32 | private static List<String> roots = null;
33 | private static String localdriver = null; // local path to the browser driver
34 | private static String localexport = null; // local output path
35 |
36 | private static final int THREAD_NUMBER = 5;
37 |
38 | /**
39 | * Load the configuration file
40 | */
41 | static {
42 | Properties property = new Properties();
43 | try {
44 | property.load(new FileInputStream(
45 | "./src/main/java/com/edmund/properties"));
46 | } catch (IOException e) {
47 | e.printStackTrace();
48 | }
49 | localdriver = property.getProperty("LocalChromedriver");
50 | localexport = property.getProperty("LocalExportPath");
51 |
52 | }
53 |
54 | public static void main(String[] args) {
55 | for (String strkey : keys) {
56 | initLists(strkey);
57 | }
58 |
59 | for (int i = 0; i < THREAD_NUMBER; i++) {
60 | new JobCrawler().new crawThread().start();
61 | }
62 | }
63 |
64 | /**
65 | * Worker thread that crawls the data
66 | * @author Edmund
67 | *
68 | */
69 | class crawThread extends Thread {
70 | ChromeDriver driver = initBrowser();
71 |
72 | @Override
73 | public void run() {
74 | while (true) {
75 | String[] urls = getURL();
76 | if (urls == null) {
77 | break;
78 | }
79 | String key = whichKey(urls[1]);
80 |
81 | List<Job> jobs = null;
82 | try {
83 | jobs = crawJobs(urls, key, driver);
84 | } catch (Exception e) {
85 | pushIntoLists(urls);
86 | }
87 | DBUtils.writeToFile(jobs,
88 | localexport + "/" + key + "/" + this.getName() + "/"
89 | + urls[0] + "-" + key + "-info.txt");
90 | }
91 | }
92 | }
93 |
94 | /**
95 | * Fetch a url/city pair, synchronized across threads
96 | * @return urls[0] holds the city, urls[1] holds the url
97 | */
98 | private synchronized static String[] getURL() {
99 | if (cities == null || cities.isEmpty()) {
100 | return null;
101 | }
102 | if (roots == null || roots.isEmpty()) {
103 | return null;
104 | }
105 | String[] urls = { cities.get(0), roots.get(0) };
106 | cities.remove(0);
107 | roots.remove(0);
108 |
109 | return urls;
110 | }
111 |
112 | /**
113 | * Statically initialize the job info, loading everything into memory
114 | * @param strkey the search keyword
115 | */
116 | private static void initLists(String strkey) {
117 | try {
118 | infos = DBUtils
119 | .readFromFile("./result-sources/EdmundDXu/files/emp.txt");
120 | } catch (IOException e) {
121 | }
122 | List<String> newroot = new ArrayList<String>();
123 | cities = infos.get("cities");
124 |
125 | for (String root : infos.get("roots")) {
126 | newroot.add(root.replace("#", strkey));
127 | }
128 | roots = newroot;
129 | }
130 |
131 | /**
132 | * Initialize the browser driver
133 | * @return the browser driver instance
134 | */
135 | private static ChromeDriver initBrowser() {
136 | System.setProperty("webdriver.chrome.driver", localdriver);
137 | ChromeDriver driver = new ChromeDriver();
138 | return driver;
139 | }
140 |
141 | /**
142 | * If an exception slips past the other catch blocks, push the url back onto the lists to be reprocessed
143 | * @param urls
144 | */
145 | private synchronized static void pushIntoLists(String[] urls) {
146 | cities.add(urls[0]);
147 | roots.add(urls[1]);
148 | }
149 |
150 | /**
151 | * Determine which keyword the given url belongs to
152 | * @param url
153 | * @return the keyword, or null if none matches
154 | */
155 | private static String whichKey(String url) {
156 | for (String key : keys) {
157 | if (url.contains(key)) {
158 | return key;
159 | }
160 | }
161 | return null;
162 | }
163 |
164 | /**
165 | * Crawl job info from the given root site (58同城) for the given keyword, writing entries to file one by one from multiple threads
166 | * This method is deprecated for now
167 | */
168 |
169 | /**
170 | * Crawl job info from the given root site (58同城) for the given keyword
171 | * @param urls array holding the url and city info; urls[0] holds the city, urls[1] holds the url
172 | * @param key the keyword to crawl
173 | * @param driver the browser driver instance
174 | * @return a list of job info
175 | */
176 | public static List<Job> crawJobs(String[] urls, String key,
177 | ChromeDriver driver) {
178 |
179 | if (pretreatment(urls[1], driver) == -1) {
180 | return null;
181 | }
182 |
183 | List<Job> jobs = new ArrayList<Job>();
184 | while (true) {
185 | WebElement list = driver.findElementById("list_con");
186 | List<WebElement> positions = list.findElements(By.tagName("li"));
187 | for (WebElement webElement : positions) {
188 | // this element means the results below are unrelated to the search keyword, so drop the remaining jobs
189 | if (webElement.getAttribute("class").contains("noData")) {
190 | break;
191 | }
192 | jobs.add(createJobVo(webElement, urls[0], key));
193 | }
194 | if (nextPage(driver) == -1) {
195 | break;
196 | }
197 | }
198 | return jobs;
199 |
200 | }
201 |
202 | /**
203 | * Pre-processing performed before crawling the data
204 | * @param url the url to crawl
205 | * @param driver the browser driver instance
206 | * @return 0 if pre-processing succeeded, -1 if it failed
207 | */
208 | private static int pretreatment(String url, ChromeDriver driver) {
209 | driver.get(url);
210 | // maximize the window
211 | // driver.manage().window().maximize();
212 |
213 | WebDriverWait wait = new WebDriverWait(driver, 10);
214 |
215 | // wait for the job list and the pagination list to finish loading
216 | try {
217 | wait.until(ExpectedConditions
218 | .presenceOfElementLocated(By.id("list_con")));
219 | } catch (Exception e) {
220 | // if the page has no list_con element, treat it as having no job info and leave the page
221 | return -1;
222 | }
223 | // wait.until(ExpectedConditions.presenceOfElementLocated(By.className("next")));
224 |
225 | return 0;
226 | }
227 |
228 | /**
229 | * Turn to the next page after crawling the current one
230 | * @param driver the browser driver instance
231 | * @return 0 if paging can proceed, -1 if paging cannot continue
232 | */
233 | public static int nextPage(ChromeDriver driver) {
234 | // findElements avoids the exception thrown when the page has no 'next' element
235 | List<WebElement> nextlist = driver.findElementsByClassName("next");
236 | // if there is no next element, skip the click and exit this iteration
237 | if (nextlist == null || nextlist.isEmpty()) {
238 | return -1;
239 | }
240 |
241 | WebElement next = nextlist.get(0);
242 |
243 | // once the paging button is disabled we are on the last page, so exit the loop
244 | if (next.getAttribute("class").contains("disabled")) {
245 | return -1;
246 | }
247 | next.click();
248 | return 0;
249 | }
250 |
251 | /**
252 | * Build the job info wrapper object
253 | * @param webElement
254 | * @param city the city info
255 | * @param key the keyword
256 | * @return a Job object wrapping the job info
257 | */
258 | private static Job createJobVo(WebElement webElement, String city,
259 | String key) {
260 | String title = webElement.findElement(By.className("job_name"))
261 | .getText();
262 | String job_name = webElement.findElement(By.className("cate"))
263 | .getText();
264 | String salary = webElement.findElement(By.className("job_salary"))
265 | .getText();
266 | String company = webElement.findElement(By.className("comp_name"))
267 | .getText();
268 | String education = webElement.findElement(By.className("xueli"))
269 | .getText();
270 | String experience = webElement.findElement(By.className("jingyan"))
271 | .getText();
272 |
273 | Job job = new Job(null, city, key, title, salary.split("元/月")[0],
274 | company.split(" ")[0], job_name, education, experience);
275 | return job;
276 | }
277 | }
278 |
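// Illustrative note (not part of the original source): each crawl thread writes
// its results to one file per (city, key, thread) under LocalExportPath, following
//   localexport + "/" + key + "/" + getName() + "/" + urls[0] + "-" + key + "-info.txt"
// e.g. C:/Users/admin/Desktop/export/web/Thread-0/深圳-web-info.txt
// (the city and thread name in this example are hypothetical).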
--------------------------------------------------------------------------------
/src/main/java/com/edmund/crawler/KeyMapMerger.java:
--------------------------------------------------------------------------------
1 | package com.edmund.crawler;
2 |
3 | import java.util.HashMap;
4 | import java.util.List;
5 | import java.util.Map;
6 | import java.util.Set;
7 |
8 | import com.edmund.utils.DataBaseConnection;
9 | import com.edmund.utils.LGDBUtils;
10 | import com.edmund.vo.KeyMap;
11 |
12 | /**
13 | * Merges the keyword maps
14 | * @author Edmund
15 | *
16 | */
17 | public class KeyMapMerger {
18 |
19 | private DataBaseConnection dbc = new DataBaseConnection();
20 | private LGDBUtils utils = new LGDBUtils(dbc);
21 | private static String[] keys = { "web", "java", "python", "c++", "c#",
22 | "android", "linux" };
23 |
24 | public static void main(String[] args) {
25 | new KeyMapMerger().merge();
26 | }
27 |
28 | /**
29 | * Merge the keyword Maps of every entry in the database into a single Map and export the result (written to MySQL via writeKeyMapToMysql)
30 | */
31 | private void merge() {
32 | for (String keyword : keys) {
33 | Map<String, Integer> kwMerge = new HashMap<String, Integer>();
34 | // List jobs = utils.getLGJob(keyword);
35 | // for (LGJob job : jobs) {
36 | List<KeyMap> kms = utils.getKeyMap(keyword);
37 | for (KeyMap km : kms) {
38 | // Map<String, Integer> kwMap = job.getKeywords();
39 | Map<String, Integer> kwMap = km.getKeywords();
40 | Set<String> keyset = kwMap.keySet();
41 | for (String key : keyset) {
42 | if (kwMerge.containsKey(key)) {
43 | kwMerge.put(key, kwMerge.get(key) + kwMap.get(key));
44 | } else {
45 | if (key.contains("/")) {
46 | String[] keys = key.split("/");
47 | for (String inner_key : keys) {
48 | if (kwMerge.containsKey(inner_key)) {
49 | kwMerge.put(inner_key,
50 | kwMerge.get(inner_key)
51 | + kwMap.get(key));
52 | } else {
53 | kwMerge.put(inner_key, kwMap.get(key));
54 | }
55 | }
56 | } else {
57 | kwMerge.put(key, kwMap.get(key));
58 | }
59 |
60 | }
61 | }
62 | }
63 | utils.writeKeyMapToMysql(kwMerge, keyword);
64 | }
65 | dbc.close();
66 | }
67 | }
68 |
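// Worked example (illustrative, not in the original source): merging an entry
//   {"java/python": 2} into a kwMerge that already holds {"java": 3}
// yields {"java": 5, "python": 2} -- a key containing '/' is split and each
// part receives the compound key's count.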
--------------------------------------------------------------------------------
/src/main/java/com/edmund/crawler/LGJobCleaner.java:
--------------------------------------------------------------------------------
1 | package com.edmund.crawler;
2 |
3 | import com.edmund.utils.DataBaseConnection;
4 | import com.edmund.utils.LGCleanUtils;
5 |
6 | /**
7 | * Cleaner for the Lagou data table
8 | * @author Edmund
9 | *
10 | */
11 | public class LGJobCleaner {
12 | private DataBaseConnection dbc = new DataBaseConnection();
13 | private LGCleanUtils utils = new LGCleanUtils(dbc);
14 |
15 | private static final int LAGOU = 0;
16 | private static final int BOSS = 1;
17 |
18 | public static void main(String[] args) {
19 | new LGJobCleaner().clean();
20 | }
21 |
22 | private void clean() {
23 | utils.JobClean(LAGOU);
24 | dbc.close();
25 | }
26 | }
27 |
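// Pipeline overview, pieced together from the stage notes across this package
// (the ordering is inferred; the repo does not state it in one place):
//   1. LGJobUrlGenerator  -- fills city_url from the keyword x city combinations
//   2. LGJobCrawler       -- reads city_url, writes job-detail hrefs into ready_url
//   3. LGJobCrawlerThread -- reads ready_url, parses pages into the lagou table
//   4. LGJobCleaner       -- cleans lagou into job_data_result
//   5. KeyMapMerger       -- merges per-job keyword maps into key_map_export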
--------------------------------------------------------------------------------
/src/main/java/com/edmund/crawler/LGJobCrawler.java:
--------------------------------------------------------------------------------
1 | package com.edmund.crawler;
2 |
3 | import java.io.FileInputStream;
4 | import java.io.IOException;
5 | import java.util.List;
6 | import java.util.Properties;
7 |
8 | import org.openqa.selenium.By;
9 | import org.openqa.selenium.Keys;
10 | import org.openqa.selenium.WebElement;
11 | import org.openqa.selenium.chrome.ChromeDriver;
12 | import org.openqa.selenium.support.ui.ExpectedConditions;
13 | import org.openqa.selenium.support.ui.WebDriverWait;
14 |
15 | import com.edmund.utils.DataBaseConnection;
16 | import com.edmund.utils.LGDBUtils;
17 |
18 | /**
19 | * Lagou crawler class
20 | * Currently reads every pending url from the city_url table, then saves all scraped hrefs into the ready_url table
21 | * Crawler stage 2
22 | * @author Edmund
23 | *
24 | */
25 | public class LGJobCrawler {
26 | private static String[] keys = { "web", "java", "python", "c++", "c#",
27 | "android", "linux" };
28 |
29 | private static String localdriver = null; // local path to the browser driver
30 | private DataBaseConnection dbc = new DataBaseConnection();
31 | private LGDBUtils utils = new LGDBUtils(dbc);
32 |
33 | /**
34 | * Load the configuration file
35 | */
36 | static {
37 | Properties property = new Properties();
38 | try {
39 | property.load(new FileInputStream(
40 | "./src/main/java/com/edmund/properties"));
41 | } catch (IOException e) {
42 | e.printStackTrace();
43 | }
44 | localdriver = property.getProperty("LocalChromedriver");
45 |
46 | }
47 |
48 | public static void main(String[] args) throws Exception {
49 | LGJobCrawler lgCrawler = new LGJobCrawler();
50 | ChromeDriver driver = initBrowser();
51 |
52 | // for (int i = 0; i < THREAD_NUMBER; i++) {
53 | // new LGJobCrawler().new LGJobCrawlerThread().start();
54 | // }
55 | String url = null;
56 |
57 | while ((url = lgCrawler.read()) != null) {
58 | lgCrawler.crawJobs(url, driver);
59 | }
60 | }
61 |
62 | private String read() {
63 | return utils.readFromCityURL();
64 | }
65 |
66 | /**
67 | * Initialize the browser driver
68 | * @return the browser driver instance
69 | */
70 | private static ChromeDriver initBrowser() {
71 | System.setProperty("webdriver.chrome.driver", localdriver);
72 | ChromeDriver driver = new ChromeDriver();
73 | return driver;
74 | }
75 |
76 | /**
77 | * Pre-processing performed before crawling the data
78 | * @param url the url to crawl
79 | * @param driver the browser driver instance
80 | * @return 0 if pre-processing succeeded, -1 if it failed
81 | */
82 | private static int pretreatment(String url, ChromeDriver driver) {
83 | driver.get(url);
84 | // driver.manage().window().maximize();
85 |
86 | WebDriverWait wait = new WebDriverWait(driver, 5);
87 |
88 | try {
89 | wait.until(ExpectedConditions
90 | .presenceOfElementLocated(By.id("s_position_list")));
91 | } catch (Exception e) {
92 | return -1;
93 | }
94 |
95 | return 0;
96 | }
97 |
98 | /**
99 | * Crawl job info from the given url
100 | * @param url the page url
101 | * @param driver the browser driver
102 | * (the scraped hrefs are written into the ready_url table rather than returned)
103 | * @throws Exception
104 | */
105 | public void crawJobs(String url, ChromeDriver driver) throws Exception {
106 |
107 | try {
108 | if (pretreatment(url, driver) == -1) {
109 | return;
110 | }
111 |
112 | while (true) {
113 | WebElement list = driver.findElementById("s_position_list");
114 | WebElement list_ul = list.findElement(By.tagName("ul"));
115 | List positions = list_ul
116 | .findElements(By.tagName("li"));
117 | for (WebElement webElement : positions) {
118 | String href = webElement.findElement(By.tagName("a"))
119 | .getAttribute("href");
120 | utils.writeIntoReadyURL(href, whichKey(url));
121 | }
122 |
123 | if (nextPage(driver) == -1) {
124 | break;
125 | }
126 | }
127 | } catch (Exception e) {
128 | restart(url);
129 | e.printStackTrace();
130 | }
131 | }
132 |
133 | /**
134 | * When processing a url fails, restore its state in the database and sleep for 10 seconds
135 | * @param url
136 | */
137 | private void restart(String url) {
138 | utils.restoreReadyURL(url);
139 | System.out.println("正在回滚数据");
140 | try {
141 | Thread.sleep(10000);
142 | } catch (InterruptedException e) {
143 | e.printStackTrace();
144 | }
145 | }
146 |
147 | /**
148 | * Turn to the next page after crawling the current one
149 | * @param driver the browser driver instance
150 | * @return 0 if paging can proceed, -1 if paging cannot continue
151 | * @throws InterruptedException
152 | */
153 | private static int nextPage(ChromeDriver driver)
154 | throws InterruptedException {
155 | // findElements avoids the exception thrown when the page has no 'next' element
156 | List<WebElement> nextlist = driver.findElements(
157 | By.cssSelector("#s_position_list span.pager_next"));
158 |
159 | // if there is no next element, skip the click and exit this iteration
160 | if (nextlist == null || nextlist.isEmpty()) {
161 | return -1;
162 | }
163 |
164 | WebElement next = nextlist.get(0);
165 | driver.getKeyboard().sendKeys(Keys.END);
166 | Thread.sleep(2000);
167 | // once the paging button is disabled we are on the last page, so exit the loop
168 | if (next.getAttribute("class").contains("pager_next_disabled")) {
169 | return -1;
170 | }
171 | next.click();
172 | Thread.sleep(2000);
173 | return 0;
174 | }
175 |
176 | /**
177 | * Determine which keyword the given url belongs to
178 | * @param url
179 | * @return the keyword, or null if none matches
180 | */
181 | private static String whichKey(String url) {
182 | for (String key : keys) {
183 | if (url.contains(key)) {
184 | return key;
185 | }
186 | }
187 | return null;
188 | }
189 |
190 | }
191 |
--------------------------------------------------------------------------------
/src/main/java/com/edmund/crawler/LGJobCrawlerThread.java:
--------------------------------------------------------------------------------
1 | package com.edmund.crawler;
2 |
3 | import java.io.FileInputStream;
4 | import java.io.IOException;
5 | import java.util.HashMap;
6 | import java.util.Map;
7 | import java.util.Properties;
8 |
9 | import org.jsoup.Jsoup;
10 | import org.jsoup.nodes.Document;
11 | import org.openqa.selenium.By;
12 | import org.openqa.selenium.chrome.ChromeDriver;
13 | import org.openqa.selenium.support.ui.ExpectedConditions;
14 | import org.openqa.selenium.support.ui.WebDriverWait;
15 |
16 | import com.edmund.utils.DataBaseConnection;
17 | import com.edmund.utils.LGDBUtils;
18 | import com.edmund.vo.LGJob;
19 |
20 | import jeasy.analysis.MMAnalyzer;
21 |
22 | /**
23 | * Thread class for statically crawling job details with multiple threads
24 | * Currently reads pending urls from the ready_url table, then stores the results into the lagou table
25 | * Crawler stage 3
26 | * @author Edmund
27 | *
28 | */
29 | class LGJobCrawlerThread extends Thread {
30 |
31 | private DataBaseConnection dbc = new DataBaseConnection();
32 | private LGDBUtils utils = new LGDBUtils(dbc);
33 |
34 | private static String localdriver = null; // local path to the browser driver
35 |
36 | private static final String USER_AGENT = "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36";
37 |
38 | /**
39 | * Load the configuration file
40 | */
41 | static {
42 | Properties property = new Properties();
43 | try {
44 | property.load(new FileInputStream(
45 | "./src/main/java/com/edmund/properties"));
46 | } catch (IOException e) {
47 | e.printStackTrace();
48 | }
49 | localdriver = property.getProperty("LocalChromedriver");
50 | }
51 |
52 | public static void main(String[] args) throws InterruptedException {
53 | // for (int i = 0; i < THREAD_NUMBER; i++) {
54 | // new LGJobCrawlerThread().start();
55 | // Thread.sleep(5000);
56 | // }
57 | }
58 |
59 | @Override
60 | public void run() {
61 | ChromeDriver driver = initBrowser();
62 | while (true) {
63 | try {
64 | String[] infos = null;
65 | if ((infos = utils.readFromReadyURL()) == null) {
66 | try {
67 | Thread.sleep(6000);
68 | } catch (InterruptedException e) {
69 | e.printStackTrace();
70 | }
71 | } else {
72 | LGJob job = getJobDetails_Dynamic(infos, driver);
73 | utils.insertLGJob(job);
74 | }
75 | } catch (Exception e) {
76 | e.printStackTrace();
77 | }
78 | }
79 | }
80 |
81 | /**
82 | * Initialize the browser driver
83 | * @return the browser driver instance
84 | */
85 | private static ChromeDriver initBrowser() {
86 | System.setProperty("webdriver.chrome.driver", localdriver);
87 | ChromeDriver driver = new ChromeDriver();
88 | return driver;
89 | }
90 |
91 | /**
92 | * Pre-processing performed before crawling the data
93 | * @param url the url to crawl
94 | * @param driver the browser driver instance
95 | * @return 0 if pre-processing succeeded, -1 if it failed
96 | */
97 | private int pretreatment(String url, ChromeDriver driver) {
98 | driver.get(url);
99 | // driver.manage().window().maximize();
100 |
101 | WebDriverWait wait = new WebDriverWait(driver, 5);
102 |
103 | try {
104 | wait.until(ExpectedConditions
105 | .presenceOfElementLocated(By.className("position-head")));
106 | wait.until(ExpectedConditions
107 | .presenceOfElementLocated(By.id("job_detail")));
108 |
109 | } catch (Exception e) {
110 | e.printStackTrace();
111 | restart(url);
112 | return -1;
113 | }
114 |
115 | return 0;
116 | }
117 |
118 | private LGJob getJobDetails_Dynamic(String[] infos, ChromeDriver driver) {
119 | LGJob job = null;
120 | String url = infos[0];
121 | // filter: only urls that actually contain data may pass
122 | if (url.matches(".*lagou\\.com/jobs/[0-9]+\\..?html")) {
123 | if (pretreatment(url, driver) == -1) {
124 | return null;
125 | }
126 | String key = infos[1];
127 | String[] job_request = driver.findElementByClassName("job_request")
128 | .getText().split("/");
129 | String salary = job_request[0].trim();
130 | String city = job_request[1].trim();
131 | String experience = job_request[2].trim();
132 | String education = job_request[3].trim();
133 |
134 | String company = driver.findElementByClassName("company").getText();
135 | String keywords = driver.findElementByClassName("job_bt")
136 | .findElement(By.tagName("div")).getText();
137 |
138 | job = new LGJob(null, key, null, salary, city, experience,
139 | education, company.substring(0, company.length() - 2),
140 | getKeywordsMap(keywords));
141 |
142 | } else {
143 | return null;
144 | }
145 |
146 | return job;
147 | }
148 |
149 | /**
150 | * Fetch the job details from the infos array; infos[0] holds the url, infos[1] holds the keyword
151 | * @param infos array holding the url and keyword
152 | * @return the job info wrapper object
153 | */
154 | private LGJob getJobDetails(String[] infos) {
155 | Document doc = null;
156 | LGJob job = null;
157 | String url = infos[0];
158 | try {
159 | // filter: only urls that actually contain data may pass
160 | if (url.matches(".*lagou\\.com/jobs/[0-9]+\\..?html")) {
161 | doc = Jsoup.connect(url).userAgent(USER_AGENT).get();
162 | } else {
163 | return null;
164 | }
165 | } catch (IOException e) {
166 | e.printStackTrace();
167 | return null;
168 | }
169 | String key = infos[1];
170 | String[] job_request = null;
171 | try {
172 | job_request = doc.getElementsByClass("job_request").first().text()
173 | .split("/");
174 | } catch (Exception e) {
175 | restart(url);
176 | e.printStackTrace();
177 | }
178 | String salary = job_request[0].trim();
179 | String city = job_request[1].trim();
180 | String experience = job_request[2].trim();
181 | String education = job_request[3].trim();
182 |
183 | String company = doc.getElementsByClass("company").first().text();
184 | String keywords = doc.getElementsByClass("job_bt").first()
185 | .getElementsByTag("div").text();
186 |
187 | job = new LGJob(null, key, null, salary, city, experience, education,
188 | company, getKeywordsMap(keywords));
189 |
190 | return job;
191 | }
192 |
193 | /**
194 | * When processing a url fails, restore its state in the database and sleep for 10 seconds
195 | * @param url
196 | */
197 | private void restart(String url) {
198 | utils.restoreReadyURL(url);
199 | try {
200 | Thread.sleep(10000);
201 | } catch (InterruptedException e) {
202 | e.printStackTrace();
203 | }
204 | }
205 |
206 | /**
207 | * Segment the given text, extract the English words, and record each word's occurrence count in a map
208 | * @param keywords the text to segment
209 | * @return the segmented words and their occurrence counts
210 | */
211 | private static Map<String, Integer> getKeywordsMap(String keywords) {
212 | Map<String, Integer> kwMap = new HashMap<String, Integer>();
213 | MMAnalyzer mm = new MMAnalyzer();
214 | MMAnalyzer.addWord("C#");
215 | MMAnalyzer.addWord("c#");
216 | try {
217 | String[] kwStrs = mm.segment(keywords, "|").split("\\|");
218 | for (String kwStr : kwStrs) {
219 | if (!kwStr.matches("[a-zA-Z/#\\\\]+")) {
220 | continue;
221 | }
222 | if (kwMap.containsKey(kwStr)) {
223 | kwMap.put(kwStr, kwMap.get(kwStr) + 1);
224 | } else {
225 | kwMap.put(kwStr, 1);
226 | }
227 | }
228 | } catch (IOException e) {
229 | e.printStackTrace();
230 | }
231 | return kwMap;
232 | }
233 | }
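// Worked example (illustrative; actual MMAnalyzer segmentation may differ):
//   getKeywordsMap("熟悉java和python, java优先")
// keeps only tokens matching [a-zA-Z/#\\]+ after segmentation, so Chinese words
// are dropped and the result would be roughly {java=2, python=1}.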
--------------------------------------------------------------------------------
/src/main/java/com/edmund/crawler/LGJobUrlGenerator.java:
--------------------------------------------------------------------------------
1 | package com.edmund.crawler;
2 |
3 | import java.io.IOException;
4 | import java.util.List;
5 |
6 | import com.edmund.utils.DataBaseConnection;
7 | import com.edmund.utils.LGDBUtils;
8 | import com.edmund.vo.LGJob;
9 |
10 | /**
11 | * Generates every url that needs processing from the keywords and cities, and stores them into the city_url table
12 | * Crawler stage 1
13 | * @author Edmund
14 | *
15 | */
16 | public class LGJobUrlGenerator {
17 |
18 | private DataBaseConnection dbc = new DataBaseConnection();
19 | private LGDBUtils utils = new LGDBUtils(dbc);
20 | private static String[] keys = { "web", "java", "python", "c++", "c#",
21 | "android", "linux" };
22 | private static String root = "https://www.lagou.com/jobs/list_%KW%?px=default&city=%CT%#filterBox";
23 |
24 | public static void main(String[] args) throws IOException {
25 | new LGJobUrlGenerator().initURLList();
26 | // Test test = new Test();
27 | // String line = null;
28 | // List<LGJob> jobs = test.read();
29 | // for (LGJob lgJob : jobs) {
30 | // System.out.println(lgJob.getKeywords());
31 | // }
32 | }
33 |
34 | private void initURLList() throws IOException {
35 | List<String> cities = utils
36 | .readFromFile("C:/Users/admin/Desktop/files/lagou.txt");
37 | for (String key : keys) {
38 | for (String city : cities) {
39 | String url = root.replace("%KW%", key).replace("%CT%", city);
40 | utils.writeIntoCityURL(url);
41 | }
42 | }
43 | }
44 |
45 | private List<LGJob> read() {
46 | return utils.getLGJob("web");
47 | }
48 | }
49 |
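// Example of a generated url (substituting key "java" and a city read from
// lagou.txt into the root template; the city value here is only illustrative):
//   https://www.lagou.com/jobs/list_java?px=default&city=北京#filterBox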
--------------------------------------------------------------------------------
/src/main/java/com/edmund/properties:
--------------------------------------------------------------------------------
1 | LocalChromedriver=D:/utils/chromedriver.exe
2 | LocalExportPath=C:/Users/admin/Desktop/export
--------------------------------------------------------------------------------
/src/main/java/com/edmund/test/Test.java:
--------------------------------------------------------------------------------
1 | package com.edmund.test;
2 |
3 | import java.io.IOException;
4 |
5 | import com.edmund.utils.DBUtils;
6 |
7 | public class Test {
8 | public static void main(String[] args) throws IOException {
9 | DBUtils.readFromFile("emp.txt");
10 |
11 | }
12 | }
13 |
--------------------------------------------------------------------------------
/src/main/java/com/edmund/utils/DBUtils.java:
--------------------------------------------------------------------------------
1 | package com.edmund.utils;
2 |
3 | import java.io.BufferedReader;
4 | import java.io.File;
5 | import java.io.FileInputStream;
6 | import java.io.FileNotFoundException;
7 | import java.io.FileOutputStream;
8 | import java.io.IOException;
9 | import java.io.InputStreamReader;
10 | import java.io.PrintWriter;
11 | import java.util.ArrayList;
12 | import java.util.HashMap;
13 | import java.util.List;
14 | import java.util.Map;
15 |
16 | import com.edmund.vo.Job;
17 |
18 | /**
19 | * File I/O utility class supporting the job crawler
20 | *
21 | * @author Edmund
22 | *
23 | */
24 | public class DBUtils {
25 | private static PrintWriter pw = null;
26 | private static int count = 1;
27 |
28 | /**
29 | * Read the site root urls and cities from a file
30 | *
31 | * @param filepath
32 | * the file path
33 | * @return a map containing the city list and the site root list; get("cities") returns the city list, get("roots") returns the root list, and the two lists' indices correspond one-to-one
34 | * @throws IOException
35 | */
36 | public static Map<String, List<String>> readFromFile(String filepath)
37 | throws IOException {
38 | Map<String, List<String>> infos = new HashMap<String, List<String>>();
39 | List<String> cities = new ArrayList<String>();
40 | List<String> roots = new ArrayList<String>();
41 |
42 | File file = new File(filepath);
43 | FileInputStream in = new FileInputStream(file);
44 | BufferedReader reader = new BufferedReader(
45 | new InputStreamReader(in, "UTF-8"));
46 | String line = null;
47 |
48 | while ((line = reader.readLine()) != null) {
49 | cities.add(line.split("\\t")[1]);
50 | roots.add(line.split("\\t")[2]);
51 | }
52 | infos.put("cities", cities);
53 | infos.put("roots", roots);
54 | reader.close();
55 | return infos;
56 | }
57 |
58 | /**
59 | * Write one job info entry to a file
60 | *
61 | * @param job
62 | * the job info
63 | * @param filepath
64 | * the file path to save to
65 | * @throws FileNotFoundException
66 | */
67 | public static void writeToFile(Job job, String filepath)
68 | throws FileNotFoundException {
69 | PrintWriter pw = new PrintWriter(
70 | new FileOutputStream(new File(filepath), true));
71 | pw.print(job.getCity() + "\t");
72 | pw.print(job.getKey() + "\t");
73 | pw.print(job.getTitle() + "\t");
74 | pw.print(job.getSalary() + "\t");
75 | pw.print(job.getCompany() + "\t");
76 | pw.print(job.getJob() + "\t");
77 | pw.print(job.getEducation() + "\t");
78 | pw.println(job.getExperience());
79 | pw.flush();
80 | pw.close();
81 | }
82 |
83 | /**
84 | * Write every job info entry in the list to a file
85 | *
86 | * @param jobs
87 | * the job info list
88 | * @param filepath
89 | * the file path
90 | */
91 | public static void writeToFile(List<Job> jobs, String filepath) {
92 | if (jobs == null || jobs.isEmpty()) {
93 | return;
94 | }
95 | try {
96 | initWriter(filepath);
97 | } catch (FileNotFoundException e) {
98 | e.printStackTrace();
99 | }
100 | for (Job job : jobs) {
101 | System.out.println("正在处理: " + job + ",已处理: " + count++);
102 | pw.print(job.getCity() + "\t");
103 | pw.print(job.getKey() + "\t");
104 | pw.print(job.getTitle() + "\t");
105 | pw.print(job.getSalary() + "\t");
106 | pw.print(job.getCompany() + "\t");
107 | pw.print(job.getJob() + "\t");
108 | pw.print(job.getEducation() + "\t");
109 | pw.println(job.getExperience());
110 | }
111 | pw.flush();
112 | closeAll();
113 |
114 | }
115 |
116 | /**
117 | * Close the writer
118 | */
119 | public static void closeAll() {
120 | if (pw != null) {
121 | pw.close();
122 | pw = null;
123 | }
124 | }
125 |
126 | /**
127 | * Open the writer
128 | * @param filepath the file path
129 | * @throws FileNotFoundException
130 | */
131 | public static void initWriter(String filepath)
132 | throws FileNotFoundException {
133 | if (pw == null) {
134 | File file = new File(filepath);
135 | if (!file.getParentFile().exists()) {
136 | file.getParentFile().mkdirs();
137 | }
138 | pw = new PrintWriter(new FileOutputStream(file, true));
139 | }
140 | }
141 | }
142 |
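// Assumed input format for readFromFile (tab-separated; fields [1] and [2] are
// taken as city and root url; the '#' in the root is replaced with the search
// keyword in JobCrawler.initLists). An illustrative line, not taken from the repo:
//   1<TAB>北京<TAB>http://bj.58.com/job/?key=#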
--------------------------------------------------------------------------------
/src/main/java/com/edmund/utils/DataBaseConnection.java:
--------------------------------------------------------------------------------
1 | package com.edmund.utils;
2 |
3 | import java.sql.Connection;
4 | import java.sql.DriverManager;
5 | import java.sql.SQLException;
6 |
7 | /**
8 | * Database connection manager
9 | *
10 | * @author Edmund
11 | *
12 | */
13 | public class DataBaseConnection {
14 | private static String DBDRIVER = "org.gjt.mm.mysql.Driver";
15 | private static String DBURL = "jdbc:mysql://10.60.72.28:3306/test";
16 | private static String DBUSER = "root";
17 | private static String DBPASSWORD = "redhat";
18 |
19 | private Connection conn = null;
20 |
21 | public DataBaseConnection() {
22 | super();
23 | }
24 |
25 | /**
26 | * Return a database connection
27 | *
28 | * @return
29 | */
30 | public Connection getConn() {
31 | try {
32 | if (conn == null || conn.isClosed()) {
33 | Class.forName(DBDRIVER);
34 | conn = DriverManager.getConnection(DBURL, DBUSER, DBPASSWORD);
35 | }
36 | } catch (SQLException e) {
37 | e.printStackTrace();
38 | } catch (ClassNotFoundException e) {
39 | e.printStackTrace();
40 | }
41 | return conn;
42 | }
43 |
44 | /**
45 | * Close the database connection
46 | */
47 | public void close() {
48 | if (conn != null) {
49 | try {
50 | conn.close();
51 | } catch (SQLException e) {
52 | // TODO Auto-generated catch block
53 | e.printStackTrace();
54 | }
55 | }
56 | }
57 |
58 | }
59 |
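// Typical usage (sketch):
//   DataBaseConnection dbc = new DataBaseConnection();
//   Connection conn = dbc.getConn(); // lazily opens, and reopens if closed
//   ... execute statements ...
//   dbc.close();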
--------------------------------------------------------------------------------
/src/main/java/com/edmund/utils/LGCleanUtils.java:
--------------------------------------------------------------------------------
1 | package com.edmund.utils;
2 |
3 | import java.sql.PreparedStatement;
4 | import java.sql.ResultSet;
5 | import java.sql.SQLException;
6 |
7 | /**
8 | * Cleaning utilities for the Lagou data table
9 | * @author Edmund
10 | *
11 | */
12 | public class LGCleanUtils {
13 | public DataBaseConnection dbc = null;
14 |
15 | public LGCleanUtils(DataBaseConnection dbc) {
16 | this.dbc = dbc;
17 | }
18 |
19 | /**
20 | * Cleaning method for the experience field
21 | * @param experience
22 | * @return -1 if cleaning failed, otherwise a value >= 0
23 | */
24 | private int experienceClean(String experience) {
25 | String strClean = experience.substring(2);
26 | int expClean = -1;
27 | if (strClean.matches("[0-9]+-[0-9]+年")) {
28 | expClean = Integer.parseInt(strClean.split("-")[0]);
29 | } else if (strClean.contains("不限") || strClean.contains("应届毕业生")
30 | || strClean.matches("[0-9]+年以下")) {
31 | expClean = 0;
32 | } else if (strClean.matches("[0-9]+年以上")) {
33 | expClean = Integer.parseInt(strClean.split("年")[0]);
34 | }
35 | return expClean;
36 | }
37 |
38 | /**
39 | * Cleaning method for the education field
40 | * @param education
41 | * @return -1 if cleaning failed, otherwise a value >= 0
42 | */
43 | private int educationClean(String education) {
44 | int edu = -1;
45 | if (education.matches("学历不限")) {
46 | edu = 0;
47 | } else if (education.matches("大专及以上")) {
48 | edu = 1;
49 | } else if (education.matches("本科及以上")) {
50 | edu = 2;
51 | } else if (education.matches("硕士及以上")) {
52 | edu = 3;
53 | }
54 | return edu;
55 | }
56 |
57 | /**
58 | * Cleaning method for the salary field
59 | * @param salary
60 | * @return {min, max, avg}; all three zero means cleaning failed, otherwise three positive values
61 | */
62 | private int[] salaryClean(String salary) {
63 | int[] cleanSal = new int[3];
64 | int min_salary = 0;
65 | int max_salary = 0;
66 | int avg_salary = 0;
67 | if (salary.matches("[0-9]+[kK]-[0-9]+[kK]")) {
68 | String[] sals = salary.split("-");
69 | min_salary = Integer
70 | .parseInt(sals[0].replace("k", "000").replace("K", "000"));
71 | max_salary = Integer
72 | .parseInt(sals[1].replace("k", "000").replace("K", "000"));
73 | } else if (salary.matches("[0-9]+[kK]以上")) {
74 | String[] sals = salary.split("以上");
75 | min_salary = Integer
76 | .parseInt(sals[0].replace("k", "000").replace("K", "000"));
77 | max_salary = Integer.parseInt(
78 | sals[0].replace("k", "000").replace("K", "000")) + 5000;
79 | }
80 | avg_salary = (min_salary + max_salary) / 2;
81 | cleanSal[0] = min_salary;
82 | cleanSal[1] = max_salary;
83 | cleanSal[2] = avg_salary;
84 | return cleanSal;
85 | }
86 |
87 | /**
88 | * Cleaning method for whole entries of the Lagou data table
89 | * @param data_from 0 means cleaning Lagou data, 1 means cleaning BOSS data
90 | */
91 | public void JobClean(int data_from) {
92 | String query_sql = "SELECT id,key_word,job,salary,province,city,experience,education,company FROM lagou";
93 | String insert_sql = "INSERT INTO job_data_result(data_from,province,city,key_word,company_or_team,min_salary,max_salary,avg_salary,min_experience,min_education) VALUES(?,?,?,?,?,?,?,?,?,?)";
94 | try {
95 | PreparedStatement pst = dbc.getConn().prepareStatement(query_sql);
96 | ResultSet rs = pst.executeQuery();
97 | while (rs.next()) {
98 | int id = rs.getInt(1);
99 | String key_word = rs.getString(2);
100 | String job = rs.getString(3);
101 | String salary = rs.getString(4);
102 | String province = rs.getString(5);
103 | String city = rs.getString(6);
104 | String experience = rs.getString(7);
105 | String education = rs.getString(8);
106 | String company_or_team = rs.getString(9);
107 |
108 | int min_education = educationClean(education);
109 | int min_experience = experienceClean(experience);
110 | int[] cleanSal = salaryClean(salary);
111 | int min_salary = cleanSal[0];
112 | int max_salary = cleanSal[1];
113 | int avg_salary = cleanSal[2];
114 |
115 | pst = dbc.getConn().prepareStatement(insert_sql);
116 | pst.setInt(1, data_from);
117 | pst.setString(2, province);
118 | pst.setString(3, city);
119 | pst.setString(4, key_word);
120 | pst.setString(5, company_or_team);
121 | pst.setInt(6, min_salary);
122 | pst.setInt(7, max_salary);
123 | pst.setInt(8, avg_salary);
124 | pst.setInt(9, min_experience);
125 | pst.setInt(10, min_education);
126 |
127 | pst.executeUpdate();
128 | }
129 |
130 | rs.close();
131 | pst.close();
132 |
133 | } catch (SQLException e) {
134 | e.printStackTrace();
135 | }
136 | }
137 | }
138 |
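// Worked examples, derived from the rules above (inputs are illustrative):
//   experienceClean("经验3-5年")  -> 3
//   experienceClean("经验不限")   -> 0
//   educationClean("本科及以上")  -> 2
//   salaryClean("15k-25k")        -> {15000, 25000, 20000}
//   salaryClean("20k以上")        -> {20000, 25000, 22500}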
--------------------------------------------------------------------------------
/src/main/java/com/edmund/utils/LGDBUtils.java:
--------------------------------------------------------------------------------
1 | package com.edmund.utils;
2 |
3 | import java.io.BufferedReader;
4 | import java.io.File;
5 | import java.io.FileInputStream;
6 | import java.io.FileNotFoundException;
7 | import java.io.FileOutputStream;
8 | import java.io.IOException;
9 | import java.io.InputStreamReader;
10 | import java.io.ObjectInputStream;
11 | import java.io.PrintWriter;
12 | import java.sql.Blob;
13 | import java.sql.PreparedStatement;
14 | import java.sql.ResultSet;
15 | import java.sql.SQLException;
16 | import java.util.ArrayList;
17 | import java.util.List;
18 | import java.util.Map;
19 | import java.util.Set;
20 |
21 | import com.edmund.vo.KeyMap;
22 | import com.edmund.vo.LGJob;
23 |
24 | /**
25 | * Lagou database utility class
26 | * @author Edmund
27 | *
28 | */
29 | public class LGDBUtils {
30 | private PrintWriter pw = null;
31 |
32 | public DataBaseConnection dbc = null;
33 |
34 | public LGDBUtils(DataBaseConnection dbc) {
35 | this.dbc = dbc;
36 | }
37 |
38 | /**
39 | * Write a url that needs processing into the database
40 | * @param url
41 | */
42 | public void writeIntoReadyURL(String url, String keyword) {
43 | String sql = "INSERT INTO ready_url (url,state,keyword) VALUES (?,0,?)";
44 | try {
45 | dbc.getConn().setAutoCommit(false);
46 | PreparedStatement pst = dbc.getConn().prepareStatement(sql);
47 | pst.setString(1, url);
48 | pst.setString(2, keyword);
49 | pst.executeUpdate();
50 | dbc.getConn().commit();
51 | pst.close();
52 |
53 | } catch (SQLException e) {
54 | try {
55 | dbc.getConn().rollback();
56 | } catch (SQLException e1) {
57 | e1.printStackTrace();
58 | }
59 | e.printStackTrace();
60 | }
61 | }
62 |
63 | /**
64 | * Read an unprocessed url from the database and set its state to 1; the keyword is fetched as well since the job's keyword must be recorded
65 | * @return
66 | * @throws SQLException
67 | */
68 | public String[] readFromReadyURL() {
69 | try {
70 | dbc.getConn().setAutoCommit(false);
71 | } catch (SQLException e2) {
72 | e2.printStackTrace();
73 | }
74 | String sql = "SELECT id,url,keyword FROM ready_url WHERE state=0 LIMIT 1";
75 | String updateSql = "UPDATE ready_url SET state=1 WHERE id=?";
76 | String[] infos = null;
77 | try {
78 | dbc.getConn().setAutoCommit(false);
79 | PreparedStatement pst = dbc.getConn().prepareStatement(sql);
80 | ResultSet rs = pst.executeQuery();
81 | if (rs.next()) {
82 | String id = rs.getString(1);
83 | String url = rs.getString(2);
84 | String keyword = rs.getString(3);
85 | infos = new String[2];
86 | infos[0] = url;
87 | infos[1] = keyword;
88 | pst = dbc.getConn().prepareStatement(updateSql);
89 | pst.setInt(1, Integer.parseInt(id));
90 | pst.executeUpdate();
91 | System.out.println("正在处理: " + url);
92 | }
93 |
94 | dbc.getConn().commit();
95 | rs.close();
96 | pst.close();
97 | } catch (SQLException e) {
98 | try {
99 | dbc.getConn().rollback();
100 | } catch (SQLException e1) {
101 | e1.printStackTrace();
102 | }
103 | e.printStackTrace();
104 | }
105 | return infos;
106 |
107 | }
108 |
109 | /**
110 | * After processing a url fails, reset its state in the database to 0
111 | * @param url
112 | */
113 | public void restoreReadyURL(String url) {
114 | String sql = "UPDATE ready_url SET state=0 WHERE url=? AND state=1";
115 | try {
116 | dbc.getConn().setAutoCommit(false);
117 | PreparedStatement pst = dbc.getConn().prepareStatement(sql);
118 | pst = dbc.getConn().prepareStatement(sql);
119 | pst.setString(1, url);
120 | pst.executeUpdate();
121 | dbc.getConn().commit();
122 | pst.close();
123 | System.out.println("正在回滚: " + url);
124 | } catch (SQLException e) {
125 | try {
126 | dbc.getConn().rollback();
127 | } catch (SQLException e1) {
128 | e1.printStackTrace();
129 | }
130 | e.printStackTrace();
131 | }
132 | }
133 |
134 | /**
135 | * Write a url that needs processing into the database
136 | * @param url
137 | */
138 | public void writeIntoCityURL(String url) {
139 | String sql = "INSERT INTO city_url (url,state) VALUES (?,0)";
140 | try {
141 | dbc.getConn().setAutoCommit(false);
142 | PreparedStatement pst = dbc.getConn().prepareStatement(sql);
143 | pst.setString(1, url);
144 | pst.executeUpdate();
145 | dbc.getConn().commit();
146 | pst.close();
147 | } catch (SQLException e) {
148 | try {
149 | dbc.getConn().rollback();
150 | } catch (SQLException e1) {
151 | e1.printStackTrace();
152 | }
153 | e.printStackTrace();
154 | }
155 | }
156 |
157 | /**
158 | * Read an unprocessed url from the database and set its state to 1
159 | * @return
160 | */
161 | public String readFromCityURL() {
162 | String sql = "SELECT id,url FROM city_url WHERE state=0 LIMIT 1";
163 | String updateSql = "UPDATE city_url SET state=1 WHERE id=?";
164 | String url = null;
165 | try {
166 | dbc.getConn().setAutoCommit(false);
167 | PreparedStatement pst = dbc.getConn().prepareStatement(sql);
168 | ResultSet rs = pst.executeQuery();
169 | if (rs.next()) {
170 | String id = rs.getString(1);
171 | url = rs.getString(2);
172 | pst = dbc.getConn().prepareStatement(updateSql);
173 | pst.setInt(1, Integer.parseInt(id));
174 | pst.executeUpdate();
175 | }
176 |
177 | dbc.getConn().commit();
178 | rs.close();
179 | pst.close();
180 | } catch (SQLException e) {
181 | try {
182 | dbc.getConn().rollback();
183 | } catch (SQLException e1) {
184 | e1.printStackTrace();
185 | }
186 | e.printStackTrace();
187 | }
188 | return url;
189 |
190 | }
191 |
192 | /**
193 | * After processing a url fails, reset its state in the database to 0
194 | * @param url
195 | */
196 | public void restoreCityURL(String url) {
197 | String sql = "UPDATE city_url SET state=0 WHERE url=? AND state=1";
198 | try {
199 | dbc.getConn().setAutoCommit(false);
200 | PreparedStatement pst = dbc.getConn().prepareStatement(sql);
201 | pst = dbc.getConn().prepareStatement(sql);
202 | pst.setString(1, url);
203 | pst.executeUpdate();
204 | dbc.getConn().commit();
205 | pst.close();
206 | } catch (SQLException e) {
207 | try {
208 | dbc.getConn().rollback();
209 | } catch (SQLException e1) {
210 | e1.printStackTrace();
211 | }
212 | e.printStackTrace();
213 | }
214 | }
215 |
216 | /**
217 | * Insert one job info record into the database
218 | * @param job the LGJob job info object
219 | */
220 | public void insertLGJob(LGJob job) {
221 | String sql = "INSERT INTO lagou (key_word,job,salary,city,experience,education,company,key_words) VALUES (?,?,?,?,?,?,?,?)";
222 | try {
223 | dbc.getConn().setAutoCommit(false);
224 | PreparedStatement pst = dbc.getConn().prepareStatement(sql);
225 | pst.setString(1, job.getKeyword());
226 | pst.setString(2, null);
227 | pst.setString(3, job.getSalary());
228 | pst.setString(4, job.getCity());
229 | pst.setString(5, job.getExperience());
230 | pst.setString(6, job.getEducation());
231 | pst.setString(7, job.getCompany());
232 | pst.setObject(8, job.getKeywords());
233 |
234 | pst.executeUpdate();
235 | dbc.getConn().commit();
236 | pst.close();
237 | System.out.println("正在写入: " + job);
238 | } catch (SQLException e) {
239 | try {
240 | dbc.getConn().rollback();
241 | } catch (SQLException e1) {
242 | e1.printStackTrace();
243 | }
244 | e.printStackTrace();
245 | }
246 |
247 | }
248 |
249 | /**
250 | * Read all job info records for a keyword from the database and wrap them into an object list
251 | * @return the list of job info objects
252 | */
253 | public List<LGJob> getLGJob(String keyword) {
254 | String sql = "SELECT key_word,job,salary,city,experience,education,company,key_words FROM lagou WHERE key_word=?";
255 | List<LGJob> jobs = new ArrayList<LGJob>();
256 | try {
257 | PreparedStatement pst = dbc.getConn().prepareStatement(sql);
258 | pst.setString(1, keyword);
259 | ResultSet rs = pst.executeQuery();
260 | while (rs.next()) {
261 | Blob kwBlob = rs.getBlob(8);
262 | ObjectInputStream objIn = new ObjectInputStream(
263 | kwBlob.getBinaryStream());
264 | Map<String, Integer> keywords = (Map<String, Integer>) objIn
265 | .readObject();
266 | LGJob job = new LGJob(null, rs.getString(1), null,
267 | rs.getString(3), rs.getString(4), rs.getString(5),
268 | rs.getString(6), rs.getString(7), keywords);
269 | jobs.add(job);
270 | objIn.close();
271 | }
272 | rs.close();
273 | pst.close();
274 | } catch (SQLException e) {
275 | e.printStackTrace();
276 | } catch (IOException e) {
277 | e.printStackTrace();
278 | } catch (ClassNotFoundException e) {
279 | e.printStackTrace();
280 | }
281 |
282 | return jobs;
283 | }
284 |
285 | /**
286 | * Read all keyword maps for a keyword from the database, wrap them into KeyMap objects, and collect them into a KeyMap list
287 | * @return the KeyMap list
288 | */
289 | public List<KeyMap> getKeyMap(String keyword) {
290 | String sql = "SELECT id,key_word,key_words FROM lagou WHERE key_word=?";
291 | List<KeyMap> kmaps = new ArrayList<KeyMap>();
292 | try {
293 | PreparedStatement pst = dbc.getConn().prepareStatement(sql);
294 | pst.setString(1, keyword);
295 | ResultSet rs = pst.executeQuery();
296 | while (rs.next()) {
297 | Blob kwBlob = rs.getBlob(3);
298 | ObjectInputStream objIn = new ObjectInputStream(
299 | kwBlob.getBinaryStream());
300 | Map<String, Integer> keywords = (Map<String, Integer>) objIn
301 | .readObject();
302 | KeyMap kmap = new KeyMap(rs.getInt(1), rs.getString(2),
303 | keywords);
304 | kmaps.add(kmap);
305 | objIn.close();
306 | }
307 | rs.close();
308 | pst.close();
309 | } catch (SQLException e) {
310 | e.printStackTrace();
311 | } catch (IOException e) {
312 | e.printStackTrace();
313 | } catch (ClassNotFoundException e) {
314 | e.printStackTrace();
315 | }
316 |
317 | return kmaps;
318 | }
319 |
320 | /**
321 | * Export the analysis report to MySQL
322 | * @param kwMap the map produced by the merge
323 | * @param key_word the keyword this map belongs to
324 | */
325 | public void writeKeyMapToMysql(Map<String, Integer> kwMap,
326 | String key_word) {
327 | String sql = "INSERT INTO key_map_export(word,value,key_word) VALUES(?,?,?)";
328 | try {
329 | Set<String> keyset = kwMap.keySet();
330 | for (String key : keyset) {
331 | PreparedStatement pst = dbc.getConn().prepareStatement(sql);
332 | pst.setString(1, key);
333 | pst.setInt(2, kwMap.get(key));
334 | pst.setString(3, key_word);
335 | pst.executeUpdate();
336 | pst.close();
337 | }
338 | } catch (SQLException e) {
339 | e.printStackTrace();
340 | }
341 | }
342 |
343 | /**
344 | * Write the analysis report to a file
345 | * @param kwMap
346 | * @param filepath
347 | * @throws FileNotFoundException
348 | */
349 | public static void writeToFile(Map<String, Integer> kwMap,
350 | String filepath) {
351 | int i = 1;
352 | try {
353 | PrintWriter pw = new PrintWriter(
354 | new FileOutputStream(new File(filepath), true));
355 | Set<String> keyset = kwMap.keySet();
356 | for (String key : keyset) {
357 | pw.println(key + "\t" + kwMap.get(key));
358 | System.out.println("已处理: " + i++);
359 | }
360 | pw.flush();
361 | pw.close();
362 | } catch (FileNotFoundException e) {
363 | e.printStackTrace();
364 | }
365 | }
366 |
367 | /**
368 | * Read a file from the given file path
369 | * @param filepath the file path
370 | * @return a list holding the file contents line by line
371 | * @throws IOException
372 | */
373 | public List<String> readFromFile(String filepath) throws IOException {
374 | List<String> cities = new ArrayList<String>();
375 |
376 | File file = new File(filepath);
377 | FileInputStream in = new FileInputStream(file);
378 | BufferedReader reader = new BufferedReader(
379 | new InputStreamReader(in, "UTF-8"));
380 | String line = null;
381 |
382 | while ((line = reader.readLine()) != null) {
383 | cities.add(line);
384 | }
385 | reader.close();
386 | return cities;
387 | }
388 |
389 | /**
390 | * Close the writer
391 | */
392 | public void closeAll() {
393 | if (pw != null) {
394 | pw.close();
395 | pw = null;
396 | }
397 | }
398 |
399 | /**
400 | * Open the writer
401 | * @param filepath the file path
402 | * @throws FileNotFoundException
403 | */
404 | public void initWriter(String filepath) throws FileNotFoundException {
405 | if (pw == null) {
406 | File file = new File(filepath);
407 | if (!file.getParentFile().exists()) {
408 | file.getParentFile().mkdirs();
409 | }
410 | pw = new PrintWriter(new FileOutputStream(file, true));
411 | }
412 | }
413 |
414 | }
415 |
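// Assumed minimal schema for the two url-queue tables, inferred from the SQL in
// this class (the real DDL lives in city_url.sql/ready_url.sql, which are not
// shown in this dump; column types here are guesses):
//   CREATE TABLE city_url  (id INT AUTO_INCREMENT PRIMARY KEY, url VARCHAR(512), state INT);
//   CREATE TABLE ready_url (id INT AUTO_INCREMENT PRIMARY KEY, url VARCHAR(512), state INT, keyword VARCHAR(32));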
--------------------------------------------------------------------------------
/src/main/java/com/edmund/vo/Job.java:
--------------------------------------------------------------------------------
1 | package com.edmund.vo;
2 |
3 | /**
4 | * Job info wrapper class
5 | *
6 | * @author Edmund
7 | *
8 | */
9 | public class Job {
10 | private Integer jid;
11 | private String city;
12 | private String key;
13 | private String title;
14 | private String salary;
15 | private String company;
16 | private String job;
17 | private String education;
18 | private String experience;
19 |
20 | public Job() {
21 | super();
22 | }
23 |
24 | public Job(Integer jid, String city, String key, String title, String salary, String company, String job,
25 | String education, String experience) {
26 | super();
27 | this.jid = jid;
28 | this.city = city;
29 | this.key = key;
30 | this.title = title;
31 | this.salary = salary;
32 | this.company = company;
33 | this.job = job;
34 | this.education = education;
35 | this.experience = experience;
36 | }
37 |
38 | public Integer getJid() {
39 | return jid;
40 | }
41 |
42 | public void setJid(Integer jid) {
43 | this.jid = jid;
44 | }
45 |
46 | public String getCity() {
47 | return city;
48 | }
49 |
50 | public void setCity(String city) {
51 | this.city = city;
52 | }
53 |
54 | public String getKey() {
55 | return key;
56 | }
57 |
58 | public void setKey(String key) {
59 | this.key = key;
60 | }
61 |
62 | public String getTitle() {
63 | return title;
64 | }
65 |
66 | public void setTitle(String title) {
67 | this.title = title;
68 | }
69 |
70 | public String getSalary() {
71 | return salary;
72 | }
73 |
74 | public void setSalary(String salary) {
75 | this.salary = salary;
76 | }
77 |
78 | public String getCompany() {
79 | return company;
80 | }
81 |
82 | public void setCompany(String company) {
83 | this.company = company;
84 | }
85 |
86 | public String getJob() {
87 | return job;
88 | }
89 |
90 | public void setJob(String job) {
91 | this.job = job;
92 | }
93 |
94 | public String getEducation() {
95 | return education;
96 | }
97 |
98 | public void setEducation(String education) {
99 | this.education = education;
100 | }
101 |
102 | public String getExperience() {
103 | return experience;
104 | }
105 |
106 | public void setExperience(String experience) {
107 | this.experience = experience;
108 | }
109 |
110 | @Override
111 | public String toString() {
112 | return "Job [jid=" + jid + ", city=" + city + ", key=" + key + ", title=" + title + ", salary=" + salary
113 | + ", company=" + company + ", job=" + job + ", education=" + education + ", experience=" + experience
114 | + "]";
115 | }
116 |
117 | }
118 |
--------------------------------------------------------------------------------
/src/main/java/com/edmund/vo/KeyMap.java:
--------------------------------------------------------------------------------
1 | package com.edmund.vo;
2 |
3 | import java.util.Map;
4 |
5 | public class KeyMap {
6 | private int id;
7 | private String keyword;
8 | private Map<String, Integer> keywords;
9 |
10 | public KeyMap() {
11 | super();
12 | }
13 |
14 | public KeyMap(int id, String keyword, Map<String, Integer> keywords) {
15 | super();
16 | this.id = id;
17 | this.keyword = keyword;
18 | this.keywords = keywords;
19 | }
20 |
21 | public int getId() {
22 | return id;
23 | }
24 |
25 | public void setId(int id) {
26 | this.id = id;
27 | }
28 |
29 | public String getKeyword() {
30 | return keyword;
31 | }
32 |
33 | public void setKeyword(String keyword) {
34 | this.keyword = keyword;
35 | }
36 |
37 | public Map<String, Integer> getKeywords() {
38 | return keywords;
39 | }
40 |
41 | public void setKeywords(Map<String, Integer> keywords) {
42 | this.keywords = keywords;
43 | }
44 |
45 | @Override
46 | public String toString() {
47 | return "KeyMap [id=" + id + ", keyword=" + keyword + ", keywords="
48 | + keywords + "]";
49 | }
50 |
51 | }
52 |
--------------------------------------------------------------------------------
/src/main/java/com/edmund/vo/LGJob.java:
--------------------------------------------------------------------------------
1 | package com.edmund.vo;
2 |
3 | import java.util.Map;
4 |
5 | public class LGJob {
6 | private Integer id;
7 | private String keyword;
8 | private String job;
9 | private String salary;
10 | private String city;
11 | private String experience;
12 | private String education;
13 | private String company;
14 | private Map<String, Integer> keywords;
15 |
16 | public LGJob() {
17 | super();
18 | }
19 |
20 | public LGJob(Integer id, String keyword, String job, String salary,
21 | String city, String experience, String education, String company,
22 | Map<String, Integer> keywords) {
23 | super();
24 | this.id = id;
25 | this.keyword = keyword;
26 | this.job = job;
27 | this.salary = salary;
28 | this.city = city;
29 | this.experience = experience;
30 | this.education = education;
31 | this.company = company;
32 | this.keywords = keywords;
33 | }
34 |
35 | public Integer getId() {
36 | return id;
37 | }
38 |
39 | public void setId(Integer id) {
40 | this.id = id;
41 | }
42 |
43 | public String getKeyword() {
44 | return keyword;
45 | }
46 |
47 | public void setKeyword(String keyword) {
48 | this.keyword = keyword;
49 | }
50 |
51 | public String getJob() {
52 | return job;
53 | }
54 |
55 | public void setJob(String job) {
56 | this.job = job;
57 | }
58 |
59 | public String getSalary() {
60 | return salary;
61 | }
62 |
63 | public void setSalary(String salary) {
64 | this.salary = salary;
65 | }
66 |
67 | public String getCity() {
68 | return city;
69 | }
70 |
71 | public void setCity(String city) {
72 | this.city = city;
73 | }
74 |
75 | public String getExperience() {
76 | return experience;
77 | }
78 |
79 | public void setExperience(String experience) {
80 | this.experience = experience;
81 | }
82 |
83 | public String getEducation() {
84 | return education;
85 | }
86 |
87 | public void setEducation(String education) {
88 | this.education = education;
89 | }
90 |
91 | public String getCompany() {
92 | return company;
93 | }
94 |
95 | public void setCompany(String company) {
96 | this.company = company;
97 | }
98 |
99 | public Map<String, Integer> getKeywords() {
100 | return keywords;
101 | }
102 |
103 | public void setKeywords(Map<String, Integer> keywords) {
104 | this.keywords = keywords;
105 | }
106 |
107 | @Override
108 | public String toString() {
109 | return "LGJob [id=" + id + ", keyword=" + keyword + ", job=" + job
110 | + ", salary=" + salary + ", city=" + city + ", experience="
111 | + experience + ", education=" + education + ", company="
112 | + company + ", keywords=" + keywords + "]";
113 | }
114 |
115 | }
116 |
--------------------------------------------------------------------------------
/src/main/java/com/radish/HDFSUtil/HDFSTest.java:
--------------------------------------------------------------------------------
1 | package com.radish.HDFSUtil;
2 |
3 | import org.apache.hadoop.conf.Configuration;
4 | import org.apache.hadoop.fs.FSDataOutputStream;
5 | import org.apache.hadoop.fs.FileSystem;
6 | import org.apache.hadoop.fs.Path;
7 |
8 | public class HDFSTest {
9 |
10 | public static void main(String[] args) {
11 | // TODO Auto-generated method stub
12 | Configuration configuration = new Configuration();
13 | Path path = new Path("hdfs://192.168.199.233:9000/input/H2.txt");
14 | try {
15 | FileSystem fs = path.getFileSystem(configuration);
16 | FSDataOutputStream os = fs.create(path);
17 | os.writeUTF("Ni Hao ~");
18 | os.close();
19 | } catch (Exception e) {
20 | System.out.println("catch a exception");
21 | }
22 | }
23 | }
24 |
--------------------------------------------------------------------------------
/src/main/java/com/radish/analysis/DataAnalysiser.java:
--------------------------------------------------------------------------------
1 | package com.radish.analysis;
2 |
3 | import java.sql.Connection;
4 | import java.sql.DriverManager;
5 | import java.sql.PreparedStatement;
6 | import java.sql.ResultSet;
7 | import java.util.ArrayList;
8 | import java.util.List;
9 |
10 | /**
11 | * Produces the final data to be displayed
12 | * @author radish
13 | *
14 | */
15 | public class DataAnalysiser {
16 | private static Connection conn;
17 | private static String[] provinceArray = new String[] { "北京", "天津", "河北", "山西", "内蒙古", "辽宁", "吉林", "黑龙江", "上海", "江苏",
18 | "浙江", "安徽", "福建", "江西", "山东", "河南", "湖北", "湖南", "广东", "广西", "海南", "重庆", "四川", "贵州", "云南", "西藏", "陕西", "甘肃",
19 | "青海", "宁夏", "新疆" };
20 | private static String[] keyWordArray = new String[] { "java", "C#", "linux", "python", "web", "c++", "android" };
21 |
22 | static {
23 | try {
24 | Class.forName("com.mysql.jdbc.Driver");
25 | String url = "jdbc:mysql://localhost:3306/crawler_db?characterEncoding=utf-8";
26 | String username = "root";
27 | String password = "admin";
28 | conn = DriverManager.getConnection(url, username, password);
29 | } catch (Exception e) {
30 | // TODO Auto-generated catch block
31 | e.printStackTrace();
32 | }
33 | }
34 |
35 | 	/*
36 | 	 * Generate the output data
37 | 	 */
38 | public static void main(String[] args) throws Exception {
39 |
40 | for (int i = 0; i < keyWordArray.length; i++) {
41 | String key = keyWordArray[i];
42 | for (int strNum = 1; strNum <= 6; strNum++) {
43 | 				System.out.print("dataMap." + keyWordArray[i] + "data" + strNum + "=dataFormatter({");// emits e.g. dataMap.javadata1=...
44 | 				if (strNum == 1) {// Line 1: number of postings per province
45 | System.out.print("2018:[");
46 | List countList = countJobRequestNumber(key);
47 | for (int k = 0; k < countList.size() - 1; k++) {
48 | System.out.print(countList.get(k) + ",");
49 | }
50 | System.out.print(countList.get(countList.size() - 1));
51 | System.out.print("]});");
52 | System.out.println();
53 | }
54 | 				// Line 2: average salary
55 | if (strNum == 2) {
56 | System.out.print("2018:[");
57 | List countList = countAvgSalary(key);
58 | for (int k = 0; k < countList.size() - 1; k++) {
59 | System.out.print(countList.get(k) + ",");
60 | }
61 | System.out.print(countList.get(countList.size() - 1));
62 | System.out.print("]});");
63 | System.out.println();
64 | }
65 | 				// Line 3: average of the maximum salaries
66 | if (strNum == 3) {
67 | System.out.print("2018:[");
68 | List countList = countMaxSalary(key);
69 | for (int k = 0; k < countList.size() - 1; k++) {
70 | System.out.print(countList.get(k) + ",");
71 | }
72 | System.out.print(countList.get(countList.size() - 1));
73 | System.out.print("]});");
74 | System.out.println();
75 | }
76 | 				// Line 4: average of the minimum salaries
77 | if (strNum == 4) {
78 | System.out.print("2018:[");
79 | List countList = countMinSalary(key);
80 | for (int k = 0; k < countList.size() - 1; k++) {
81 | System.out.print(countList.get(k) + ",");
82 | }
83 | System.out.print(countList.get(countList.size() - 1));
84 | System.out.print("]});");
85 | System.out.println();
86 | }
87 | 				// Line 5: share of postings requiring a bachelor's degree or above
88 | if (strNum == 5) {
89 | System.out.print("2018:[");
90 | List countList = countEducationOver2Percent(key);
91 | for (int k = 0; k < countList.size() - 1; k++) {
92 | System.out.print(countList.get(k) + ",");
93 | }
94 | System.out.print(countList.get(countList.size() - 1));
95 | System.out.print("]});");
96 | System.out.println();
97 | }
98 | 				// Line 6: share of postings with no experience requirement
99 | if (strNum == 6) {
100 | System.out.print("2018:[");
101 | List countList = countExperienceIn0(key);
102 | for (int k = 0; k < countList.size() - 1; k++) {
103 | System.out.print(countList.get(k) + ",");
104 | }
105 | System.out.print(countList.get(countList.size() - 1));
106 | System.out.print("]});");
107 | System.out.println();
108 | }
109 | }
110 | }
111 |
112 | }
113 |
114 | 	/**
115 | 	 *
116 | 	 * @param key search keyword
117 | 	 *
118 | 	 * @return per-province counts of postings in the database matching key
119 | 	 */
120 | 	public static List<Integer> countJobRequestNumber(String key) throws Exception {
121 | 		List<Integer> countList = new ArrayList<Integer>();
122 | String sql = null;
123 | PreparedStatement stmt = null;
124 | ResultSet rs = null;
125 | for (String province : provinceArray) {
126 | sql = "SELECT COUNT(id) FROM job_data_result where key_word=? and province=?";
127 | stmt = conn.prepareStatement(sql);
128 | stmt.setString(1, key);
129 | stmt.setString(2, province);
130 | rs = stmt.executeQuery();
131 | rs.next();
132 | int jobRequestNumber = rs.getInt(1);
133 | countList.add(jobRequestNumber);
134 | }
135 | if (rs != null) {
136 | rs.close();
137 | }
138 | if (stmt != null) {
139 | stmt.close();
140 | }
141 | return countList;
142 | }
143 |
144 | 	/**
145 | 	 * Compute the per-province average salary
146 | 	 * @param key search keyword
147 | 	 * @return per-province averages of avg_salary for postings matching key
148 | 	 * @throws Exception
149 | 	 */
150 | 	public static List<Double> countAvgSalary(String key) throws Exception {
151 | 		List<Double> countList = new ArrayList<Double>();
152 | String sql = null;
153 | PreparedStatement stmt = null;
154 | ResultSet rs = null;
155 | for (String province : provinceArray) {
156 | sql = "SELECT AVG(avg_salary) FROM job_data_result where key_word=? and province=?";
157 | stmt = conn.prepareStatement(sql);
158 | stmt.setString(1, key);
159 | stmt.setString(2, province);
160 | rs = stmt.executeQuery();
161 | rs.next();
162 | double jobRequestNumber = Double.parseDouble(String.format("%.2f", rs.getDouble(1)));
163 | if (!Double.isNaN(jobRequestNumber)) {
164 | countList.add(jobRequestNumber);
165 | } else {
166 | countList.add(0.0);
167 | }
168 | }
169 | if (rs != null) {
170 | rs.close();
171 | }
172 | if (stmt != null) {
173 | stmt.close();
174 | }
175 | return countList;
176 | }
177 |
178 | 	/**
179 | 	 * Compute the per-province average of the maximum salary
180 | 	 * @param key search keyword
181 | 	 * @return per-province averages of max_salary for postings matching key
182 | 	 * @throws Exception
183 | 	 */
184 | 	public static List<Double> countMaxSalary(String key) throws Exception {
185 | 		List<Double> countList = new ArrayList<Double>();
186 | String sql = null;
187 | PreparedStatement stmt = null;
188 | ResultSet rs = null;
189 | for (String province : provinceArray) {
190 | sql = "SELECT AVG(max_salary) FROM job_data_result where key_word=? and province=?";
191 | stmt = conn.prepareStatement(sql);
192 | stmt.setString(1, key);
193 | stmt.setString(2, province);
194 | rs = stmt.executeQuery();
195 | rs.next();
196 | double jobRequestNumber = Double.parseDouble(String.format("%.2f", rs.getDouble(1)));
197 | if (!Double.isNaN(jobRequestNumber)) {
198 | countList.add(jobRequestNumber);
199 | } else {
200 | countList.add(0.0);
201 | }
202 | }
203 | if (rs != null) {
204 | rs.close();
205 | }
206 | if (stmt != null) {
207 | stmt.close();
208 | }
209 | return countList;
210 | }
211 |
212 | 	/**
213 | 	 * Compute the per-province average of the minimum salary
214 | 	 * @param key search keyword
215 | 	 * @return per-province averages of min_salary for postings matching key
216 | 	 * @throws Exception
217 | 	 */
218 | 	public static List<Double> countMinSalary(String key) throws Exception {
219 | 		List<Double> countList = new ArrayList<Double>();
220 | String sql = null;
221 | PreparedStatement stmt = null;
222 | ResultSet rs = null;
223 | for (String province : provinceArray) {
224 | sql = "SELECT AVG(min_salary) FROM job_data_result where key_word=? and province=?";
225 | stmt = conn.prepareStatement(sql);
226 | stmt.setString(1, key);
227 | stmt.setString(2, province);
228 | rs = stmt.executeQuery();
229 | rs.next();
230 | double jobRequestNumber = Double.parseDouble(String.format("%.2f", rs.getDouble(1)));
231 | if (!Double.isNaN(jobRequestNumber)) {
232 | countList.add(jobRequestNumber);
233 | } else {
234 | countList.add(0.0);
235 | }
236 | }
237 | if (rs != null) {
238 | rs.close();
239 | }
240 | if (stmt != null) {
241 | stmt.close();
242 | }
243 | return countList;
244 | }
245 |
246 | 	/**
247 | 	 * Compute the share of postings requiring a bachelor's degree or above
248 | 	 * @param key search keyword
249 | 	 * @return per-province ratios of postings with min_education > 1
250 | 	 * @throws Exception
251 | 	 */
252 | 	public static List<Double> countEducationOver2Percent(String key) throws Exception {
253 | 		List<Double> countList = new ArrayList<Double>();
254 | String sql = null;
255 | PreparedStatement stmt = null;
256 | ResultSet rs = null;
257 | for (String province : provinceArray) {
258 | 			// First count all postings for this keyword and province
259 | sql = "SELECT COUNT(id) FROM job_data_result WHERE key_word=? and province=?";
260 | stmt = conn.prepareStatement(sql);
261 | stmt.setString(1, key);
262 | stmt.setString(2, province);
263 | rs = stmt.executeQuery();
264 | rs.next();
265 | int staffCount = rs.getInt(1);
266 | 			// Then count those requiring a bachelor's degree or above
267 | sql = "SELECT COUNT(id) FROM job_data_result WHERE key_word=? AND province=? AND " + "min_education>1 ";
268 | stmt = conn.prepareStatement(sql);
269 | stmt.setString(1, key);
270 | stmt.setString(2, province);
271 | rs = stmt.executeQuery();
272 | rs.next();
273 | int staffOver2 = rs.getInt(1);
274 | 			double percent = (staffOver2 * 1.0) / staffCount;
275 | 			double jobRequestNumber = Double.parseDouble(String.format("%.2f", percent));
276 | if (!Double.isNaN(jobRequestNumber)) {
277 | countList.add(jobRequestNumber);
278 | } else {
279 | countList.add(0.0);
280 | }
281 | }
282 | if (rs != null) {
283 | rs.close();
284 | }
285 | if (stmt != null) {
286 | stmt.close();
287 | }
288 | return countList;
289 | }
290 |
291 | 	/**
292 | 	 * Compute the share of postings with no experience requirement
293 | 	 * @param key search keyword
294 | 	 * @return per-province ratios of postings with min_experience = 1
295 | 	 * @throws Exception
296 | 	 */
297 | 	public static List<Double> countExperienceIn0(String key) throws Exception {
298 | 		List<Double> countList = new ArrayList<Double>();
299 | String sql = null;
300 | PreparedStatement stmt = null;
301 | ResultSet rs = null;
302 | for (String province : provinceArray) {
303 | 			// First count all postings for this keyword and province
304 | sql = "SELECT COUNT(id) FROM job_data_result WHERE key_word=? AND province=?";
305 | stmt = conn.prepareStatement(sql);
306 | stmt.setString(1, key);
307 | stmt.setString(2, province);
308 | rs = stmt.executeQuery();
309 | rs.next();
310 | int staffCount = rs.getInt(1);
311 | 			// Then count those with no experience requirement (min_experience = 1)
312 | sql = "SELECT COUNT(id) FROM job_data_result WHERE key_word=? AND province=? AND " + "min_experience=1 ";
313 | stmt = conn.prepareStatement(sql);
314 | stmt.setString(1, key);
315 | stmt.setString(2, province);
316 | rs = stmt.executeQuery();
317 | rs.next();
318 | int staffSelected = rs.getInt(1);
319 | 			double percent = (staffSelected * 1.0) / staffCount;
320 | 			double jobRequestNumber = Double.parseDouble(String.format("%.2f", percent));
321 | if (!Double.isNaN(jobRequestNumber)) {
322 | countList.add(jobRequestNumber);
323 | } else {
324 | countList.add(0.0);
325 | }
326 | }
327 | if (rs != null) {
328 | rs.close();
329 | }
330 | if (stmt != null) {
331 | stmt.close();
332 | }
333 | return countList;
334 | }
335 | }
336 |
--------------------------------------------------------------------------------
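
A note on the file above: the six count* methods in DataAnalysiser are the same per-province aggregation with a different SQL template. A hedged consolidation sketch (not in the repository; it assumes the class's static conn and provinceArray fields and a template whose two parameters are key_word and province):

	private static List<Double> aggregateByProvince(String sqlTemplate, String key) throws Exception {
		List<Double> values = new ArrayList<Double>();
		for (String province : provinceArray) {
			try (PreparedStatement stmt = conn.prepareStatement(sqlTemplate)) {
				stmt.setString(1, key);
				stmt.setString(2, province);
				try (ResultSet rs = stmt.executeQuery()) {
					rs.next();
					// AVG over an empty group is SQL NULL, which getDouble maps to 0.0,
					// so the NaN guard in the original methods becomes unnecessary
					double v = rs.getDouble(1);
					values.add(Math.round(v * 100.0) / 100.0); // round to 2 decimals
				}
			}
		}
		return values;
	}

	// Usage, mirroring countAvgSalary:
	// aggregateByProvince("SELECT AVG(avg_salary) FROM job_data_result WHERE key_word=? AND province=?", key);
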
/src/main/java/com/radish/crawler/BOSSCrawlerManager.java:
--------------------------------------------------------------------------------
1 | package com.radish.crawler;
2 |
3 | import java.io.BufferedReader;
4 | import java.io.File;
5 | import java.io.FileInputStream;
6 | import java.io.InputStreamReader;
7 | import java.util.ArrayList;
8 | import java.util.List;
9 | import java.util.StringTokenizer;
10 |
11 | import org.openqa.selenium.By;
12 | import org.openqa.selenium.WebElement;
13 | import org.openqa.selenium.chrome.ChromeDriver;
14 | import org.openqa.selenium.support.ui.ExpectedConditions;
15 | import org.openqa.selenium.support.ui.WebDriverWait;
16 |
17 | import com.radish.vo.BOSSUrlVO;
18 |
19 | /**
20 |  * Singleton that manages the crawl queue
21 |  * @author admin
22 |  *
23 |  */
24 | public class BOSSCrawlerManager {
25 | 	// Crawl queue; every VO holds a directly accessible url
26 | 	private List<BOSSUrlVO> urlList = new ArrayList<BOSSUrlVO>();
27 | 	// Keywords to crawl: java python web linux
28 | 	private String[] keys = new String[] { "java", "python", "web", "linux" };
29 | 	// Lock the crawlers contend on when updating runningCrawler
30 | 	private Object obj = new Object();
31 | 	// Once every crawler has finished, the last one notifies the main thread via mainThread
32 | 	public Object mainThread = new Object();
33 | 	// Number of crawlers currently working
34 | 	private Integer runningCrawler = 0;
35 |
36 | private static BOSSCrawlerManager instance = new BOSSCrawlerManager();
37 |
38 | 	// Private constructor: singleton
39 | private BOSSCrawlerManager() {
40 |
41 | }
42 |
43 | public static BOSSCrawlerManager getInstance() {
44 | return instance;
45 | }
46 |
47 | 	/**
48 | 	 * Initialize the crawl queue from a file given by absolute path.
49 | 	 * Returns true on success, false otherwise.
50 | 	 * @return
51 | 	 */
52 | public boolean init(String filePath) {
53 | boolean flag = false;
54 | try {
55 | 			// If the file does not exist
56 | if (!new File(filePath).exists()) {
57 | return false;
58 | }
59 | BufferedReader reader = new BufferedReader(
60 | new InputStreamReader(new FileInputStream(new File(filePath)), "UTF-8"));
61 | 			// Read line by line
62 | String line = null;
63 | while ((line = reader.readLine()) != null) {
64 | StringTokenizer tokens = new StringTokenizer(line);
65 | String province = null;
66 | String city = null;
67 | String url = null;
68 | if (tokens.hasMoreTokens()) {
69 | province = tokens.nextToken();
70 | }
71 | if (tokens.hasMoreTokens()) {
72 | city = tokens.nextToken();
73 | }
74 | if (tokens.hasMoreTokens()) {
75 | url = tokens.nextToken();
76 | }
77 | 				// Expand the line into one VO per keyword
78 | for (int i = 0; i < keys.length; i++) {
79 | urlList.add(new BOSSUrlVO(province, city, url, keys[i]));
80 | }
81 | }
82 |
83 | reader.close();
84 | flag=true;
85 | } catch (Exception e) {
86 | 			// On any exception, return false
87 | return false;
88 | }
89 | 		// flag is true only if the whole file was parsed
90 | return flag;
91 | }
92 |
93 | 	/**
94 | 	 * Synchronized method from which crawlers claim tasks.
95 | 	 * Returns null once the queue is exhausted.
96 | 	 * @return
97 | 	 */
98 | public synchronized BOSSUrlVO getVO() {
99 | 		// Queue is empty
100 | if (urlList.size() == 0) {
101 | return null;
102 | }
103 | BOSSUrlVO vo = urlList.get(0);
104 | urlList.remove(0);
105 | return vo;
106 | }
107 |
108 | public void buildWorker() {
109 | new WorkerThread().start();
110 | }
111 |
112 | 	/**
113 | 	 * Worker crawler thread
114 | 	 * @author admin
115 | 	 *
116 | 	 */
117 | class WorkerThread extends Thread {
118 | private BOSSUrlVO vo;
119 | private ChromeDriver driver;
120 |
121 | 		// Constructor
122 | 		public WorkerThread() {
123 | 			// Initialize the browser driver
124 | driver = new ChromeDriver();
125 | }
126 |
127 | 		/**
128 | 		 * Core work loop:
129 | 		 * keeps claiming VOs from the queue and crawling their urls
130 | 		 */
131 | @Override
132 | public void run() {
133 | 			synchronized (obj) { // lock on obj: ++ rebinds the Integer, so it is not a stable lock
134 | 				runningCrawler++;
135 | 			}
136 | 			// Loop until the queue is drained
137 | 			while (true) {
138 | 				synchronized (obj) {
139 | 					vo = getVO();
140 | 					// Queue is empty
141 | 					if (vo == null) {
142 | 						// One fewer crawler working
143 | 						runningCrawler--;
144 | 						// If this is the last crawler to finish, wake the main thread
145 | 						synchronized (mainThread) {
146 | if (runningCrawler == 0) {
147 | 								System.out.println("Last crawler going to sleep");
148 | 								System.out.println("About to wake the main thread");
149 | mainThread.notify();
150 | }
151 | }
152 | 						// Exit run()
153 | 						return;
154 | 					}
155 | 				}
156 | 				// We have a task
157 | try {
158 | work();
159 | } catch (Exception e) {
160 | 					// One crawl iteration threw an exception
161 | 					System.out.println(Thread.currentThread().getName() + ":------------- crawler error ---------------");
162 | }
163 | }
164 | }
165 |
166 | 		/**
167 | 		 * Worker body:
168 | 		 * crawls the result pages behind the VO's url
169 | 		 */
170 | private void work(){
171 |
172 | try {
173 | String province = vo.getProvince();
174 | String city = vo.getCity();
175 | String url = vo.getUrl();
176 | System.out.println("url:" + url);
177 | WebDriverWait wait = new WebDriverWait(driver, 8);
178 | 			// Open the page
179 | driver.get(url);
180 | while (true) {
181 | 					// Wait for the page to load
182 | //wait.until(ExpectedConditions.presenceOfElementLocated(By.id("footer")));
183 | wait.until(ExpectedConditions.presenceOfElementLocated(By.cssSelector("#wrap")));
184 | 					// Scrape the page
185 | 					// First collect all div.job-list div.job-primary blocks
186 | 					List<WebElement> divList = driver.findElementsByCssSelector("div.job-list div.job-primary");
187 | 					for (WebElement jobDiv : divList) {
188 | 						// Extract title salary city experience education company
189 | 						// Title
190 | 						String title = jobDiv.findElement(By.cssSelector("div.job-title")).getText();
191 | 						// Salary
192 | 						String salary = jobDiv.findElement(By.cssSelector("span.red")).getText();
193 | 						// Company
194 | 						String company = jobDiv.findElement(By.cssSelector("div.company-text h3")).getText();
195 | 						// Experience and education
196 | 						String text = jobDiv.findElement(By.cssSelector("div.info-primary p")).getText();
197 | String experience = text.substring(text.indexOf(" "));
198 | String education = text.substring(text.length()-2);
199 | 						// Print one record as a test
200 | System.out.printf("title:%s\t%s\t%s\t%s\t%s\t%s", title, city, salary, company, experience,
201 | education+"\r\n");
202 | }
203 | WebElement nextElement = null;
204 | 					// If there is a next page, click it (findElement throws if absent; caught below)
205 | 					if ((nextElement = driver.findElement(By.cssSelector("div.page a.next"))) != null) {
206 | if(nextElement.getAttribute("class").contains("disabled")){
207 | return;
208 | }else{
209 | nextElement.click();
210 | }
211 | 					} else {// nothing found: we are done
212 | return;
213 | }
214 | }
215 | } catch(Exception e){
216 | 				System.out.println("url:" + vo.getUrl() + " error: crawl did not finish normally");
217 | 				try {
218 | 					Thread.sleep(10 * 1000);
219 | 				} catch (InterruptedException e1) {
220 | 					System.out.println("sleep failed");
221 | 				}
222 | 				System.out.println("Enter the captcha as soon as possible");
223 | }
224 | }
225 | }
226 |
227 | }
228 |
--------------------------------------------------------------------------------
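
A note on the file above: BOSSCrawlerManager guards runningCrawler with a separate lock object in run(). A hedged alternative sketch (not in the repository; WorkerCount is hypothetical): java.util.concurrent.atomic.AtomicInteger makes the counter itself thread-safe and drops the explicit lock entirely.

import java.util.concurrent.atomic.AtomicInteger;

class WorkerCount {
	private final AtomicInteger runningCrawler = new AtomicInteger(0);

	void onWorkerStart() {
		runningCrawler.incrementAndGet(); // atomic ++, no lock needed
	}

	// Returns true exactly once: for the last worker to exit
	boolean onWorkerExit() {
		return runningCrawler.decrementAndGet() == 0;
	}
}
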
/src/main/java/com/radish/crawler/BOSSProvinceCrawler.java:
--------------------------------------------------------------------------------
1 | package com.radish.crawler;
2 |
3 | import java.io.File;
4 | import java.io.FileOutputStream;
5 | import java.io.FileWriter;
6 | import java.io.OutputStreamWriter;
7 | import java.io.PrintWriter;
8 | import java.util.List;
9 |
10 | import org.jsoup.Jsoup;
11 | import org.jsoup.nodes.Document;
12 | import org.jsoup.nodes.Element;
13 | import org.jsoup.select.Elements;
14 | import org.openqa.selenium.By;
15 | import org.openqa.selenium.WebElement;
16 | import org.openqa.selenium.chrome.ChromeDriver;
17 | import org.openqa.selenium.support.ui.ExpectedConditions;
18 | import org.openqa.selenium.support.ui.WebDriverWait;
19 |
20 | /**
21 |  * Crawls https://www.zhipin.com/job_detail/?query=java&scity=101090100&industry=&position=100101
22 |  * for the BOSS Zhipin province/city codes.
23 |  * The keyword slot is left as # in the generated urls.
24 |  * Output format:
25 |  * province	city	url
26 |  * @author admin
27 |  *
28 |  */
29 | public class BOSSProvinceCrawler {
30 |
31 | public static void main(String[] args) throws Exception {
32 | //
33 | // work();
34 | //
35 | jsoupWork();
36 | }
37 |
38 | 	/**
39 | 	 * Open the target page with selenium, scrape the main data, and save it to /result-sources/radish/BossUrl.txt
40 | 	 */
41 | public static void work() throws Exception {
42 | String url = "https://www.zhipin.com/job_detail/?query=java&scity=101090100&industry=&position=100101";
43 | 		// Location of the Chrome driver binary
44 | 		System.setProperty("webdriver.chrome.driver", "D:/chrome_driver/chromedriver.exe");
45 | 		ChromeDriver driver = new ChromeDriver();
46 | 
47 | 		WebDriverWait wait = new WebDriverWait(driver, 5);
48 | 		// Open the page
49 | driver.get(url);
50 |
51 | 		// Wait until the page has loaded
52 | 		wait.until(ExpectedConditions.presenceOfElementLocated(By.id("wrap")));
53 | 		// System.out.println(bodyText);
54 | 		// Click the city dropdown
55 | 		driver.findElement(By.cssSelector("span.label-text")).click();
56 | 		// Find the province entries inside city-box
57 | 		List<WebElement> provinceList = driver.findElements(By.cssSelector("div.city-box ul.dorpdown-province li"));
58 | 		// Drop the leading "hot" entry
59 | 		provinceList.remove(0);
60 | 		// Find the city <ul> for each province
61 | 		List<WebElement> cityList = driver.findElements(By.cssSelector("div.dorpdown-city ul"));
62 | 		// Drop the leading "hot" entry
63 | cityList.remove(0);
64 | 		// Iterate over the provinces; i tracks the matching city <ul>
65 | 		int i = 0;
66 | 		for (WebElement provinceEL : provinceList) {
67 | 			// Province name
68 | 			String provinceName = provinceEL.getText();
69 | 			// Walk the city <ul> belonging to this province
70 | 			WebElement ulEL = cityList.get(i);
71 | 			List<WebElement> liList = ulEL.findElements(By.tagName("li"));
72 | 			for (WebElement li : liList) {
73 | 				System.out.println(provinceName + "\t" + li.getText() + "\t" + li.getAttribute("data-val"));
74 | 			}
75 | 			i++;
76 | 		}
77 |
78 | }
79 |
80 | public static void jsoupWork() {
81 | try {
82 | // String url =
83 | // "https://www.zhipin.com/job_detail/?query=java&scity=101090100&industry=&position=100101";
84 | // String
85 | // userAgent="Opera11.11–WindowsUser-Agent:Opera/9.80(WindowsNT6.1;U;en)Presto/2.8.131Version/11.11";
86 | String url = "https://www.zhipin.com/job_detail/?query=java&scity=101090100&industry=&position=100101";
87 | 			// Location of the Chrome driver binary
88 | System.setProperty("webdriver.chrome.driver", "D:/chrome_driver/chromedriver.exe");
89 | ChromeDriver driver = new ChromeDriver();
90 | WebDriverWait wait = new WebDriverWait(driver, 5);
91 | 			// Open the page
92 | 			driver.get(url);
93 | 			// Wait until the page has loaded
94 | 			wait.until(ExpectedConditions.presenceOfElementLocated(By.id("footer")));
95 |
96 | Document document = Jsoup.parse(driver.getPageSource());
97 | 			// Find the provinces
98 | 			Elements provinceList = document.select("div.city-box ul.dorpdown-province li");
99 | 			// Drop the "hot" entry
100 | 			provinceList.remove(0);
101 | 			// City <ul> list
102 | 			Elements cityULList = document.select("div.dorpdown-city ul");
103 | 			// Drop the "hot" entry
104 | 			cityULList.remove(0);
105 | StringBuilder builder = new StringBuilder();
106 | for (int i = 0; i < provinceList.size(); i++) {
107 | 				// Province name
108 | 				String provinceName = provinceList.get(i).text();
109 | 				// Find the city <ul> and take its <li> entries
110 | 				Elements cityList = cityULList.get(i).select("li");
111 | 				// Iterate the <li> entries
112 | 				for (Element cityLi : cityList) {
113 | 					// Written to /result-sources/radish/BossUrl.txt
114 | 					// provinceName + "\t" + cityLi.text() + "\t" + cityLi.attr("data-val")
115 | 					// url=https://www.zhipin.com/job_detail/?query=java&scity=101281900
116 | String line = provinceName + "\t" + cityLi.text() + "\t"
117 | +"https://www.zhipin.com/job_detail/?query=#&scity="
118 | + cityLi.attr("data-val")+"\r\n";
119 | System.out.println(">>"+line);
120 | builder.append(line);
121 | }
122 | }
123 | 			// Write the result to the output file
124 | File result = new File("D:/eclipse_2/git/Job_Analysis/result-sources/radish/BossUrl.txt");
125 | result.createNewFile();
126 | PrintWriter writer = new PrintWriter(new OutputStreamWriter(new FileOutputStream(result),"UTF-8"));
127 | writer.print(builder.toString());
128 | writer.flush();
129 | writer.close();
130 | } catch (Exception e) {
131 | // TODO Auto-generated catch block
132 | e.printStackTrace();
133 | }
134 | }
135 | }
136 |
--------------------------------------------------------------------------------
/src/main/java/com/radish/crawler/Test.java:
--------------------------------------------------------------------------------
1 | package com.radish.crawler;
2 |
3 | public class Test {
4 | public static void main(String[] args) {
5 | System.setProperty("webdriver.chrome.driver", "D:/chrome_driver/chromedriver.exe");
6 | 		// Obtain the singleton instance
7 | 		BOSSCrawlerManager instance = BOSSCrawlerManager.getInstance();
8 | 		// Initialize the crawl queue
9 | 		if (!instance.init("C:/Users/admin/Desktop/BossUrl.txt")) {
10 | 			System.out.println("init failed! exiting");
11 | 			System.exit(0);
12 | 		}
13 | 		System.out.println("urlList initialized! --------------------");
14 | instance.buildWorker();
15 | //instance.buildWorker();
16 | //instance.buildWorker();
17 | 		System.out.println("Crawler(s) started successfully");
18 | 
19 | 		// After starting the worker(s), main sleeps until the last crawler to finish wakes it
20 | Object mainThread = instance.mainThread;
21 | synchronized (mainThread) {
22 | try {
23 | 				System.out.println("main thread sleeping");
24 | 				mainThread.wait();
25 | 				System.out.println("main thread woken up");
26 | } catch (InterruptedException e) {
27 | e.printStackTrace();
28 | }
29 | }
30 | 		// Once woken, main can run any follow-up work here
31 | }
32 | }
33 |
--------------------------------------------------------------------------------
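
A note on the file above: the wait/notify handshake between main and the last crawler can be expressed with a CountDownLatch. A hedged sketch (not in the repository; LatchDemo is hypothetical):

import java.util.concurrent.CountDownLatch;

public class LatchDemo {
	public static void main(String[] args) throws InterruptedException {
		int workers = 3;
		CountDownLatch done = new CountDownLatch(workers);
		for (int i = 0; i < workers; i++) {
			new Thread(() -> {
				try {
					// ... crawl until the shared queue is empty ...
				} finally {
					done.countDown(); // each worker counts down exactly once
				}
			}).start();
		}
		done.await(); // main blocks here until every worker has finished
		System.out.println("all crawlers finished");
	}
}
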
/src/main/java/com/radish/crawler/distributed/DistributedCrawler.java:
--------------------------------------------------------------------------------
1 | package com.radish.crawler.distributed;
2 |
3 | import java.sql.Connection;
4 | import java.sql.DriverManager;
5 | import java.sql.SQLException;
6 |
7 | import org.openqa.selenium.chrome.ChromeDriver;
8 |
9 | /**
10 |  * Distributed crawler.
11 |  *
12 |  * Tasks:
13 |  * 1. Fetch one url from database table url_list
14 |  *
15 |  * 2. Store the crawled data into table lagou.
16 |  * Each crawler claims one task at a time, setting the task's status to 1;
17 |  * once finished it writes the result into lagou and sets the task's status in url_list to 2.
18 |  * @author admin
19 |  *
20 |  */
21 | public class DistributedCrawler {
22 | private Connection conn;
23 | private ChromeDriver driver;
24 |
25 | 	// Constructor: initializes the browser driver and the database connection
26 | public DistributedCrawler() {
27 |
28 | System.setProperty("webdriver.chrome.driver", "D:/chrome_driver/chromedriver.exe");
29 | driver = new ChromeDriver();
30 | try {
31 | Class.forName("com.mysql.jdbc.Driver");
32 | 			// Note: the IP may point at whichever host runs the shared database
33 | 			conn = DriverManager.getConnection("jdbc:mysql://localhost:3306/crawler_db?characterEncoding=utf-8", "root",
34 | 					"admin");
35 | 		} catch (Exception e) {
36 | 			System.out.println("Database connection initialization failed");
37 | }
38 | }
39 |
40 | 	// Close the internal database connection
41 | public void closeConnection() throws Exception {
42 | conn.close();
43 | }
44 |
45 | public Connection getConn() {
46 | return conn;
47 | }
48 |
49 | public void setConn(Connection conn) {
50 | this.conn = conn;
51 | }
52 |
53 | public ChromeDriver getDriver() {
54 | return driver;
55 | }
56 |
57 | public void setDriver(ChromeDriver driver) {
58 | this.driver = driver;
59 | }
60 | }
61 |
--------------------------------------------------------------------------------
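
A hypothetical usage sketch for the class above (not in the repository): treat DistributedCrawler as a resource holder and release both the browser and the connection when done.

	static void runOnce() throws Exception {
		DistributedCrawler crawler = new DistributedCrawler();
		try {
			// claim a url_list row via crawler.getConn(),
			// crawl it via crawler.getDriver(), insert the results into lagou
		} finally {
			crawler.getDriver().quit(); // shut down Chrome and the driver process
			crawler.closeConnection();  // release the JDBC connection
		}
	}
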
/src/main/java/com/radish/crawler/distributed/Test.java:
--------------------------------------------------------------------------------
1 | package com.radish.crawler.distributed;
2 |
3 | import java.sql.Connection;
4 | import java.sql.PreparedStatement;
5 | import java.sql.ResultSet;
6 | import java.sql.SQLException;
7 | import java.util.List;
8 |
9 | import org.openqa.selenium.By;
10 | import org.openqa.selenium.WebElement;
11 | import org.openqa.selenium.chrome.ChromeDriver;
12 | import org.openqa.selenium.support.ui.ExpectedConditions;
13 | import org.openqa.selenium.support.ui.WebDriverWait;
14 |
15 | import com.radish.vo.BOSSUrlVO;
16 | import com.radish.vo.JobDataVO;
17 |
18 | public class Test {
19 | private static Connection conn;
20 | private static ChromeDriver driver;
21 | private static Integer count = 1;
22 |
23 | static {
24 | DistributedCrawler crawler = new DistributedCrawler();
25 | 		System.out.println("Created the crawler object");
26 | 		System.out.println("Set the system property");
27 | driver = crawler.getDriver();
28 | conn = crawler.getConn();
29 | try {
30 | conn.setAutoCommit(false);
31 | } catch (SQLException e) {
32 | 			System.out.println("Failed to disable auto-commit");
33 | 		}
34 | 		System.out.println("Static initializer finished");
35 | }
36 |
37 | public static void main(String[] args) {
38 | try {
39 | 			// Disable auto-commit so claiming a task is transactional
40 | 			conn.setAutoCommit(false);
41 | 		} catch (SQLException e2) {
42 | 			System.out.println("Failed to disable auto-commit");
43 | e2.printStackTrace();
44 | }
45 |
46 | while (true) {
47 | try {
48 | 				// SELECT one unclaimed task
49 | String sql = "SELECT id,province,city,url,key_word,status " + "FROM url_list "
50 | + "WHERE status=0 order by id limit 1 ";
51 | PreparedStatement stmt = conn.prepareStatement(sql);
52 | ResultSet rs = stmt.executeQuery();
53 | 				//System.out.println("select executed");
54 | 				//System.out.println(rs.wasNull());
55 | 				//System.out.println("next():" + rs.next());
56 | 				// A row was found
57 | 				if (rs.next()) {
58 | 					System.out.println("Fetched a task");
59 | 					// Extract it into a VO
60 | BOSSUrlVO vo = null;
61 |
62 | int id = rs.getInt(1);
63 | String province = rs.getString(2);
64 | String city = rs.getString(3);
65 | String url = rs.getString(4);
66 | System.out.println("url----" + url);
67 | String keyWord = rs.getString(5);
68 | int status = rs.getInt(6);
69 | vo = new BOSSUrlVO(id, province, city, url, keyWord, status);
70 |
71 | 					// Claim it: set status = 1
72 | 					sql = "UPDATE url_list set status =1 where id=" + vo.getId();
73 | 					stmt = conn.prepareStatement(sql);
74 | 					stmt.executeUpdate();
75 | 					System.out.println("update executed");
76 | 					conn.commit();
77 | 					System.out.println("select + status-update transaction committed");
78 | 					rs.close();
79 | 					stmt.close();
80 | 
81 | 					// The VO is now in hand
82 | driver.get(vo.getUrl());
83 | Thread.sleep(1500);
84 | work(driver, vo);
85 | 				} else {// no task found: exit the program
86 | 					rs.close();
87 | 					stmt.close();
88 | 					System.out.println("No tasks left in the database");
89 | conn.close();
90 | System.exit(0);
91 | }
92 | } catch (Exception e) {
93 | 				System.out.println("Transaction failed, rolling back");
94 | 				try {
95 | 					conn.rollback();
96 | 					continue;
97 | 				} catch (Exception e1) {
98 | 					System.out.println("rollback failed");
99 | continue;
100 | }
101 | }
102 | } // while
103 |
104 | }// main
105 |
106 | public static void work(ChromeDriver driver, BOSSUrlVO urlVo) {
107 | WebDriverWait wait = new WebDriverWait(driver, 12);
108 | try {
109 | s: while (true) {
110 | 				// Wait for the page to load
111 | 				// wait.until(ExpectedConditions.presenceOfElementLocated(By.id("footer")));
112 | 				wait.until(ExpectedConditions.presenceOfElementLocated(By.cssSelector("#wrap")));
113 | 				// Scrape the page
114 | 				// First collect all div.job-list div.job-primary blocks
115 | 				List<WebElement> divList = driver.findElementsByCssSelector("div.job-list div.job-primary");
116 | for (WebElement jobDiv : divList) {
117 | 					// Extract title salary city experience education company
118 | 					// Title
119 | 					String title = jobDiv.findElement(By.cssSelector("div.job-title")).getText();
120 | 
121 | 					String requestUrl = jobDiv.findElement(By.cssSelector("h3.name a")).getAttribute("href");
122 | 					// Salary
123 | 					String salary = jobDiv.findElement(By.cssSelector("span.red")).getText();
124 | 					// Company
125 | 					String company = jobDiv.findElement(By.cssSelector("div.company-text h3")).getText();
126 | 					// Experience and education
127 | String text = jobDiv.findElement(By.cssSelector("div.info-primary p")).getText();
128 | String city = text.substring(0, text.indexOf(" "));
129 | 					// If this row's city does not match the task's city, skip to the next url
130 | if (!city.equals(urlVo.getCity()))
131 | break s;
132 |
133 | String experience = text.substring(text.indexOf(" "));
134 | String education = text.substring(text.length() - 2);
135 | JobDataVO jobData = new JobDataVO(urlVo.getId(), urlVo.getCity(), urlVo.getKey(), title, company,
136 | null, salary, experience, education, null, null, requestUrl);
137 | insertData(jobData);
138 | }
139 | WebElement nextElement = null;
140 | 				// If there is a next page, click it
141 | if ((nextElement = driver.findElement(By.cssSelector("div.page a.next"))) != null) {
142 | if (nextElement.getAttribute("class").contains("disabled")) {
143 | return;
144 | } else {
145 | nextElement.click();
146 | Thread.sleep(1000);
147 | }
148 | 				} else {// nothing found: we are done
149 | return;
150 | }
151 | }
152 | } catch (Exception e) {
153 | 			// The loop failed, e.g. the site demanded a captcha
154 | 
155 | 			System.out.println("while loop failed; a captcha may be required");
156 | 			try {
157 | 				Thread.sleep(20 * 1000);
158 | 			} catch (InterruptedException e1) {
159 | 				System.out.println("sleep failed");
160 | }
161 | }
162 | }// work()
163 |
164 | public static void insertData(JobDataVO dataVO) {
165 | try {
166 | String sql = "INSERT INTO job_data(id,city,key_word,title,company,salary,experience,education,job_request_url) "
167 | + "VALUES(?,?,?,?,?,?,?,?,?)";
168 | PreparedStatement stmt = conn.prepareStatement(sql);
169 | stmt.setInt(1, dataVO.getId());
170 | stmt.setString(2, dataVO.getCity());
171 | stmt.setString(3, dataVO.getKeyWord());
172 | stmt.setString(4, dataVO.getTitle());
173 | stmt.setString(5, dataVO.getCompany());
174 | stmt.setString(6, dataVO.getSalary());
175 | stmt.setString(7, dataVO.getExperience());
176 | stmt.setString(8, dataVO.getEducation());
177 | stmt.setString(9, dataVO.getJobRequestUrl());
178 | stmt.executeUpdate();
179 | conn.commit();
180 | stmt.close();
181 | 			// System.out.println("Inserted one row; total rows inserted: " + count++);
182 | 		} catch (Exception e) {
183 | 			try {
184 | 				conn.rollback();
185 | 			} catch (SQLException e1) {
186 | 				System.out.println("rollback failed");
187 | 			}
188 | 			System.out.println("Failed to insert result row");
189 | }
190 | }
191 | }
192 |
--------------------------------------------------------------------------------
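
A note on the file above: the claim step SELECTs a status=0 row and then UPDATEs it in one transaction, but without a row lock two distributed crawlers can read the same row before either commits. A hedged hardening sketch (not in the repository) using MySQL's SELECT ... FOR UPDATE, assuming auto-commit is already off:

	static BOSSUrlVO claimTask(Connection conn) throws SQLException {
		String select = "SELECT id,province,city,url,key_word,status FROM url_list "
				+ "WHERE status=0 ORDER BY id LIMIT 1 FOR UPDATE";
		try (PreparedStatement stmt = conn.prepareStatement(select);
				ResultSet rs = stmt.executeQuery()) {
			if (!rs.next()) {
				conn.commit();
				return null; // no unclaimed tasks left
			}
			BOSSUrlVO vo = new BOSSUrlVO(rs.getInt(1), rs.getString(2), rs.getString(3),
					rs.getString(4), rs.getString(5), rs.getInt(6));
			try (PreparedStatement upd = conn.prepareStatement("UPDATE url_list SET status=1 WHERE id=?")) {
				upd.setInt(1, vo.getId());
				upd.executeUpdate();
			}
			conn.commit(); // releases the row lock with status already set to 1
			return vo;
		}
	}
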
/src/main/java/com/radish/dataclean/DataCleaner.java:
--------------------------------------------------------------------------------
1 | package com.radish.dataclean;
2 |
3 | import java.sql.Connection;
4 | import java.sql.DriverManager;
5 | import java.sql.PreparedStatement;
6 | import java.sql.ResultSet;
7 | import java.sql.SQLException;
8 |
9 | import com.edmund.test.Test;
10 | /**
11 |  * Data-cleaning helper class
12 |  * @author radish
13 |  *
14 |  */
15 | public class DataCleaner {
16 | private Connection conn;
17 |
18 | 	/**
19 | 	 * Constructor: initializes the database connection
20 | 	 */
21 | public DataCleaner() throws Exception {
22 | Class.forName("com.mysql.jdbc.Driver");
23 | String url = "jdbc:mysql://localhost:3306/crawler_db?characterEncoding=utf-8";
24 | String username = "root";
25 | String password = "admin";
26 | conn = DriverManager.getConnection(url, username, password);
27 | }
28 |
29 | 	/**
30 | 	 *
31 | 	 * Using url_list, fill in province for every lagou record based on its city
32 | 	 * @throws Exception
33 | 	 */
34 | public void insertProvince() throws Exception {
35 | String sql = null;
36 | PreparedStatement stmt = null;
37 | ResultSet rs = null;
38 | while (true) {
39 | try {
40 | sql = "SELECT id,city FROM lagou WHERE province IS NULL LIMIT 1";
41 | stmt = conn.prepareStatement(sql);
42 | rs = stmt.executeQuery();
43 | 				// If a row was found
44 | 				if (rs.next()) {
45 | 					int id = rs.getInt(1);
46 | 					String city = rs.getString(2);
47 | 					// Look up the province in url_list by city
48 | 					sql = "SELECT province FROM url_list WHERE city=? LIMIT 1";
49 | 					stmt = conn.prepareStatement(sql);
50 | 					stmt.setString(1, city);
51 | 					rs = stmt.executeQuery();
52 | 					// If a province was found
53 | if (rs.next()) {
54 | String province = rs.getString(1);
55 | sql = "UPDATE lagou SET province=? WHERE city = ?";
56 | stmt = conn.prepareStatement(sql);
57 | stmt.setString(1, province);
58 | stmt.setString(2, city);
59 | stmt.executeUpdate();
60 | } else {
61 | String province = "null";
62 | sql = "UPDATE lagou SET province=? WHERE id = ?";
63 | stmt = conn.prepareStatement(sql);
64 | stmt.setString(1, province);
65 | stmt.setInt(2, id);
66 | stmt.executeUpdate();
67 | }
68 | } else {
69 | 					System.out.println("Processing finished");
70 | if (stmt != null) {
71 | stmt.close();
72 | }
73 | if (rs != null) {
74 | rs.close();
75 | }
76 | System.exit(0);
77 | }
78 | } catch (Exception e) {
79 | if (stmt != null) {
80 | stmt.close();
81 | }
82 | if (rs != null) {
83 | rs.close();
84 | }
85 | e.printStackTrace();
86 | System.exit(0);
87 | }
88 | }
89 | }
90 |
91 | 	/**
92 | 	 * Using url_list, fill in province for every job_data record based on its city
93 | 	 *
94 | 	 * @throws Exception
95 | 	 */
96 | public void insertProvinceToBoss() throws Exception {
97 | String sql = null;
98 | PreparedStatement stmt = null;
99 | ResultSet rs = null;
100 | while (true) {
101 | try {
102 | sql = "SELECT id,city FROM job_data WHERE province IS NULL LIMIT 1";
103 | stmt = conn.prepareStatement(sql);
104 | rs = stmt.executeQuery();
105 | 				// If a row was found
106 | 				if (rs.next()) {
107 | 					int id = rs.getInt(1);
108 | 					String city = rs.getString(2);
109 | 					// Look up the province in url_list by city
110 | 					sql = "SELECT province FROM url_list WHERE city=? LIMIT 1";
111 | 					stmt = conn.prepareStatement(sql);
112 | 					stmt.setString(1, city);
113 | 					rs = stmt.executeQuery();
114 | 					// If a province was found
115 | if (rs.next()) {
116 | String province = rs.getString(1);
117 | sql = "UPDATE job_data SET province=? WHERE city = ?";
118 | stmt = conn.prepareStatement(sql);
119 | stmt.setString(1, province);
120 | stmt.setString(2, city);
121 | stmt.executeUpdate();
122 | } else {
123 | String province = "null";
124 | sql = "UPDATE job_data SET province=? WHERE id = ?";
125 | stmt = conn.prepareStatement(sql);
126 | stmt.setString(1, province);
127 | stmt.setInt(2, id);
128 | stmt.executeUpdate();
129 | }
130 | } else {
131 | 					System.out.println("Processing finished");
132 | if (stmt != null) {
133 | stmt.close();
134 | }
135 | if (rs != null) {
136 | rs.close();
137 | }
138 | System.exit(0);
139 | }
140 | } catch (Exception e) {
141 | if (stmt != null) {
142 | stmt.close();
143 | }
144 | if (rs != null) {
145 | rs.close();
146 | }
147 | e.printStackTrace();
148 | System.exit(0);
149 | }
150 | }
151 | }
152 |
153 | 	/**
154 | 	 * Clean all Lagou data into table job_data_result
155 | 	 *
156 | 	 */
157 | public void moveLagouDataToResult() throws Exception {
158 |
159 | }
160 |
161 | 	/**
162 | 	 * Clean the BOSS Zhipin data and insert it into table job_data_result
163 | 	 *
164 | 	 */
165 | public void moveBossDataToResult() throws Exception {
166 | int count=0;
167 | String sql = null;
168 | PreparedStatement stmt = null;
169 | ResultSet rs = null;
170 |
171 | sql = "SELECT province,"// 1
172 | + "city,"// 2
173 | + "key_word,"// 3
174 | + "company,"// 4
175 | + "salary,"// 5
176 | + "experience,"// 6
177 | + "education" // 7
178 | + " FROM job_data";
179 | stmt = conn.prepareStatement(sql);
180 | rs = stmt.executeQuery();
181 | 		/*
182 | 		 * Tested the rs length: it printed 46353, all extractable: rs.last(); System.out.println(rs.getRow());
183 | 		 */
184 | 		// With the 46353 records fetched
185 | 		// data_from in the result table: 1 marks data from BOSS Zhipin
186 | int dataFrom = 1;
187 | String province = null;
188 | String city = null;
189 | String keyWord = null;
190 | String companyOrTeam = null;
191 | double minSalary = 0.0;
192 | double maxSalary = 0.0;
193 | double avgSalary = 0.0;
194 | int minExperience=0;
195 | int minEducation=0;
196 | 		// key_words_map is left empty
197 | while (rs.next()) {
198 | try {
199 | 				// Extract province
200 | 				province = rs.getString(1);
201 | 				// Extract city
202 | 				city = rs.getString(2);
203 | 				// Extract the keyword
204 | 				keyWord = rs.getString(3);
205 | 				// Extract the company / team name
206 | 				companyOrTeam = rs.getString(4);
207 | 				/*
208 | 				 * Extract salary and normalize it
209 | 				 */
210 | String salaryStr = rs.getString(5);
211 | try {
212 | String[] salaryArray = salaryStr.trim().split("-");
213 | String minSalaryStr = salaryArray[0];
214 | minSalary = Double.parseDouble(minSalaryStr.substring(0, minSalaryStr.indexOf("k")));
215 | String maxSalaryStr = salaryArray[1];
216 | maxSalary = Double.parseDouble(maxSalaryStr.substring(0, maxSalaryStr.indexOf("k")));
217 | minSalary=minSalary*1000;
218 | maxSalary=maxSalary*1000;
219 | avgSalary = (minSalary + maxSalary) / 2;
220 | } catch (Exception e) {
221 | 					// If salary parsing fails or throws, zero all salary fields
222 | maxSalary = 0;
223 | minSalary = 0;
224 | avgSalary = 0;
225 | }
226 | 				/*
227 | 				 * Extract experience, normalized into min_experience; parse failures become 0
228 | 				 */
229 | String experienceStr = rs.getString(6);
230 | try {
231 | minExperience = Integer.parseInt(String.valueOf(experienceStr.trim().charAt(0)));
232 | } catch (Exception e) {
233 | 					// If minimum experience cannot be parsed, set it to 0
234 | minExperience = 0;
235 | }
236 | 				/*
237 | 				 * Extract education, normalized into min_education;
238 | 				 * if parsing fails it is set to -1.
239 | 				 * Minimum education levels:
240 | 				 */
241 | String educationStr=rs.getString(7);
242 | try {
243 | if(educationStr.contains("专")){
244 | minEducation=1;
245 | }else if(educationStr.contains("本")){
246 | minEducation=2;
247 | }else if(educationStr.contains("硕")){
248 | minEducation=3;
249 | }else if(educationStr.contains("博")){
250 | minEducation=4;
251 | }else {
252 | minEducation=0;
253 | }
254 | } catch (Exception e) {
255 | 					// Education parsing failed: set to -1
256 | 					minEducation = -1;
257 | 				}
258 | 			} catch (Exception e) {
259 | 				System.out.println("getString failed");
260 | e.printStackTrace();
261 | }
262 | sql="INSERT INTO job_data_result(data_from,province,city,key_word"
263 | + ",company_or_team,min_salary,max_salary,avg_salary"
264 | + ",min_experience,min_education) "
265 | + "VALUES(?,?,?,?"
266 | + ",?,?,?,?"
267 | + ",?,?"
268 | + ")";
269 | stmt=conn.prepareStatement(sql);
270 | stmt.setInt(1, dataFrom);
271 | stmt.setString(2, province);
272 | stmt.setString(3, city);
273 | stmt.setString(4, keyWord);
274 | stmt.setString(5, companyOrTeam);
275 | stmt.setDouble(6, minSalary);
276 | stmt.setDouble(7, maxSalary);
277 | stmt.setDouble(8, avgSalary);
278 | stmt.setInt(9, minExperience);
279 | stmt.setInt(10, minEducation);
280 | stmt.executeUpdate();
281 | count++;
282 | }
283 | 		System.out.println("Rows inserted: " + count);
284 | }
285 |
286 | public static void main(String[] args) {
287 | try {
288 | DataCleaner cleaner = new DataCleaner();
289 | cleaner.moveBossDataToResult();
290 | // cleaner.moveBossDataToResult();
291 | // cleaner.moveLagouDataToResult();
292 | // String string = " 3-5年本科";
293 | // string = string.trim();
294 | // System.out.println(string.charAt(0));
295 | //test();
296 | } catch (Exception e) {
297 | e.printStackTrace();
298 | }
299 | }
300 |
301 | public static void test() {
302 | String string = "10k-20k";
303 | string = string.trim();
304 | for (String string2 : string.split("-")) {
305 | System.out.println(string2);
306 | }
307 | String experienceStr = "";
308 |
309 | }
310 | }
311 |
--------------------------------------------------------------------------------
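
A note on the file above: the salary handling inside moveBossDataToResult is worth isolating so it can be unit-tested. A hedged helper sketch (not in the repository) with the same semantics, where "10k-20k" yields min=10000, max=20000, avg=15000 and anything unparsable yields zeros:

	static double[] parseSalary(String salaryStr) {
		try {
			String[] parts = salaryStr.trim().split("-");
			double min = Double.parseDouble(parts[0].substring(0, parts[0].indexOf("k"))) * 1000;
			double max = Double.parseDouble(parts[1].substring(0, parts[1].indexOf("k"))) * 1000;
			return new double[] { min, max, (min + max) / 2 };
		} catch (Exception e) {
			return new double[] { 0, 0, 0 }; // mirror the cleaner's fallback
		}
	}
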
/src/main/java/com/radish/util/MyUtil.java:
--------------------------------------------------------------------------------
1 | package com.radish.util;
2 |
3 | public class MyUtil {
4 |
5 | public static void main(String[] args) {
6 | // TODO Auto-generated method stub
7 |
8 | }
9 |
10 | }
11 |
--------------------------------------------------------------------------------
/src/main/java/com/radish/util/UrlListIniter.java:
--------------------------------------------------------------------------------
1 | package com.radish.util;
2 |
3 | import java.io.BufferedReader;
4 | import java.io.File;
5 | import java.io.FileInputStream;
6 | import java.io.InputStreamReader;
7 | import java.sql.Connection;
8 | import java.sql.DriverManager;
9 | import java.sql.PreparedStatement;
10 | import java.util.ArrayList;
11 | import java.util.List;
12 | import java.util.StringTokenizer;
13 |
14 | import com.radish.vo.BOSSUrlVO;
15 | /**
16 |  * Initializes table url_list.
17 |  * Table structure:
18 |  * `id` int(11) NOT NULL AUTO_INCREMENT, task id
19 |  `province` varchar(50) NOT NULL,
20 |  `city` varchar(100) NOT NULL,
21 |  `url` varchar(500) NOT NULL,
22 |  `key_word` varchar(50) NOT NULL,
23 |  status int not null -- 0: claimable, 1: claimed by a crawler, 2: claimed and successfully committed
24 |  * @author admin
25 |  *
26 |  */
27 | public class UrlListIniter {
28 | 	// Crawl queue; every VO holds a directly accessible url
29 | 	private static List<BOSSUrlVO> urlList = new ArrayList<BOSSUrlVO>();
30 | 	// Keywords to crawl: java python web linux C# (C%23 is the url-encoded form)
31 | 	private static String[] keys = new String[] { "java", "python", "web", "linux", "C%23" };
32 | 	// Database connection
33 | 	private static Connection conn;
34 | 	private static String sql = "";
35 | static{
36 | try {
37 | Class.forName("com.mysql.jdbc.Driver");
38 | String username="root";
39 | String password="admin";
40 | String url="jdbc:mysql://localhost:3306/crawler_db?characterEncoding=utf8";
41 | conn=DriverManager.getConnection(url, username, password);
42 | } catch (Exception e) {
43 | 			System.out.println("Static initializer failed");
44 | }
45 | }
46 | public static void main(String[] args) throws Exception {
47 | String filePath = "C:/Users/admin/Desktop/BossUrl.txt";
48 | BufferedReader reader = new BufferedReader(
49 | new InputStreamReader(new FileInputStream(new File(filePath)), "UTF-8"));
50 | 		// Read line by line
51 | String line = null;
52 | while ((line = reader.readLine()) != null) {
53 | StringTokenizer tokens = new StringTokenizer(line);
54 | String province = null;
55 | String city = null;
56 | String url = null;
57 | if (tokens.hasMoreTokens()) {
58 | province = tokens.nextToken();
59 | }
60 | if (tokens.hasMoreTokens()) {
61 | city = tokens.nextToken();
62 | }
63 | if (tokens.hasMoreTokens()) {
64 | url = tokens.nextToken();
65 | }
66 | 			// Expand the line into one VO per keyword
67 | for (int i = 0; i < keys.length; i++) {
68 | urlList.add(new BOSSUrlVO(province, city, url, keys[i]));
69 | }
70 | }
71 | reader.close();
72 | 		// If the list was initialized successfully
73 | if (urlList.size() != 0) {
74 | for (BOSSUrlVO vo : urlList) {
75 | insertVO(vo);
76 | }
77 | } else {
78 | 			System.out.println("list initialization failed; exiting");
79 | System.exit(0);
80 | }
81 | conn.close();
82 | }// main
83 | 	/**
84 | 	 * Insert one row into the database
85 | 	 * @param vo
86 | 	 */
87 | public static void insertVO(BOSSUrlVO vo){
88 | try {
89 | //conn.setTransactionIsolation(conn.TRANSACTION_SERIALIZABLE);
90 | //conn.setAutoCommit(false);
91 | sql="INSERT INTO url_list(province,city,url,key_word,status) VALUES(?,?,?,?,0)";
92 | PreparedStatement stmt = conn.prepareStatement(sql);
93 | stmt.setString(1, vo.getProvince());
94 | stmt.setString(2, vo.getCity());
95 | stmt.setString(3, vo.getUrl());
96 | stmt.setString(4, vo.getKey());
97 | stmt.executeUpdate();
98 | } catch (Exception e) {
99 | System.out.println("insertVO error!");
100 | e.printStackTrace();
101 | }
102 |
103 | }
104 | }
105 |
--------------------------------------------------------------------------------
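
A note on the file above: UrlListIniter prepares one INSERT per row. A hedged batching sketch (not in the repository; assumes the same table and java.sql imports) that prepares the statement once and sends the province-by-keyword rows in one round trip:

	static void insertAll(Connection conn, List<BOSSUrlVO> urlList) throws SQLException {
		String sql = "INSERT INTO url_list(province,city,url,key_word,status) VALUES(?,?,?,?,0)";
		try (PreparedStatement stmt = conn.prepareStatement(sql)) {
			for (BOSSUrlVO vo : urlList) {
				stmt.setString(1, vo.getProvince());
				stmt.setString(2, vo.getCity());
				stmt.setString(3, vo.getUrl());
				stmt.setString(4, vo.getKey());
				stmt.addBatch(); // queue the row instead of executing immediately
			}
			stmt.executeBatch(); // one round trip for the whole queue
		}
	}
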
/src/main/java/com/radish/vo/BOSSUrlVO.java:
--------------------------------------------------------------------------------
1 | package com.radish.vo;
2 |
3 | /**
4 |  * Corresponds to one line such as:
5 |  * 北京	北京	https://www.zhipin.com/job_detail/?query=#&scity=101010100
6 |  * @author admin
7 |  *
8 |  */
9 | public class BOSSUrlVO {
10 | private Integer id;
11 | private String province;
12 | private String city;
13 | private String url;
14 | private String key;
15 | private Integer status;
16 |
17 | public BOSSUrlVO() {
18 | super();
19 | }
20 |
21 | public BOSSUrlVO(Integer id, String province, String city, String url, String key, Integer status) {
22 | this.id = id;
23 | this.province = province;
24 | this.city = city;
25 | this.key = key;
26 | this.url = url.replace("#", this.key);
27 | this.status = status;
28 | }
29 | public BOSSUrlVO(String province, String city, String url, String key) {
30 | this.province = province;
31 | this.city = city;
32 | this.key = key;
33 | this.url = url.replace("#", this.key);
34 | }
35 | public Integer getId() {
36 | return id;
37 | }
38 |
39 | public void setId(Integer id) {
40 | this.id = id;
41 | }
42 |
43 | public String getProvince() {
44 | return province;
45 | }
46 |
47 | public void setProvince(String province) {
48 | this.province = province;
49 | }
50 |
51 | public String getCity() {
52 | return city;
53 | }
54 |
55 | public Integer getStatus() {
56 | return status;
57 | }
58 |
59 | public void setStatus(Integer status) {
60 | this.status = status;
61 | }
62 |
63 | public void setCity(String city) {
64 | this.city = city;
65 | }
66 |
67 | public String getUrl() {
68 | return url;
69 | }
70 |
71 | public void setUrl(String url) {
72 | this.url = url;
73 | }
74 |
75 | @Override
76 | public int hashCode() {
77 | final int prime = 31;
78 | int result = 1;
79 | result = prime * result + ((city == null) ? 0 : city.hashCode());
80 | result = prime * result + ((province == null) ? 0 : province.hashCode());
81 | result = prime * result + ((url == null) ? 0 : url.hashCode());
82 | return result;
83 | }
84 |
85 | @Override
86 | public boolean equals(Object obj) {
87 | if (this == obj)
88 | return true;
89 | if (obj == null)
90 | return false;
91 | if (getClass() != obj.getClass())
92 | return false;
93 | BOSSUrlVO other = (BOSSUrlVO) obj;
94 | if (city == null) {
95 | if (other.city != null)
96 | return false;
97 | } else if (!city.equals(other.city))
98 | return false;
99 | if (province == null) {
100 | if (other.province != null)
101 | return false;
102 | } else if (!province.equals(other.province))
103 | return false;
104 | if (url == null) {
105 | if (other.url != null)
106 | return false;
107 | } else if (!url.equals(other.url))
108 | return false;
109 | return true;
110 | }
111 |
112 | public String getKey() {
113 | return key;
114 | }
115 |
116 | public void setKey(String key) {
117 | this.key = key;
118 | }
119 |
120 | @Override
121 | public String toString() {
122 | return "BOSSUrlVO [province=" + province + ", city=" + city + ", url=" + url + "]";
123 | }
124 |
125 | }
126 |
--------------------------------------------------------------------------------
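
A note on the file above: BOSSUrlVO's equals/hashCode ignore the key field, so two VOs for the same city but different keywords compare equal; that matters if VOs are ever deduplicated in a Set. A hedged key-aware variant (not the project's code), using java.util.Objects:

	@Override
	public boolean equals(Object obj) {
		if (this == obj)
			return true;
		if (!(obj instanceof BOSSUrlVO))
			return false;
		BOSSUrlVO other = (BOSSUrlVO) obj;
		return Objects.equals(province, other.province)
				&& Objects.equals(city, other.city)
				&& Objects.equals(url, other.url)
				&& Objects.equals(key, other.key); // key now participates in identity
	}

	@Override
	public int hashCode() {
		return Objects.hash(province, city, url, key); // must stay consistent with equals
	}
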
/src/main/java/com/radish/vo/JobDataVO.java:
--------------------------------------------------------------------------------
1 | package com.radish.vo;
2 |
3 | import java.util.Map;
4 |
5 | public class JobDataVO {
6 | 	// Task id
7 | 	private Integer id;
8 | 	// City
9 | 	private String city;
10 | 	// Keyword
11 | 	private String keyWord;
12 | 	// Title
13 | 	private String title;
14 | 	// Company name
15 | 	private String company;
16 | 	// Job name
17 | 	private String job;
18 | 	// Raw salary string
19 | 	private String salary;
20 | 	// Work experience
21 | 	private String experience;
22 | 	// Education
23 | 	private String education;
24 | 	// Job requirements text
25 | 	private String jobRequestMessage;
26 | 	// Url from which the job requirements can be fetched
27 | 	private String jobRequestUrl;
28 | 	// Keyword frequency map
29 | 	private Map<String, Integer> keyMap;
30 |
31 | 	// No-arg constructor
32 | public JobDataVO() {
33 | super();
34 | }
35 |
36 | 	public JobDataVO(Integer id, String city, String keyWord, String title, String company, String job, String salary,
37 | 			String experience, String education, String jobRequestMessage, Map<String, Integer> keyMap, String jobRequestUrl) {
38 | this.id = id;
39 | this.city = city;
40 | this.keyWord = keyWord;
41 | this.title = title;
42 | this.company = company;
43 | this.job = job;
44 | this.salary = salary;
45 | this.experience = experience;
46 | this.education = education;
47 | this.jobRequestMessage = jobRequestMessage;
48 | this.keyMap = keyMap;
49 | this.jobRequestUrl=jobRequestUrl;
50 | }
51 |
52 | 	/*
53 | 	 * Getters and setters
54 | 	 */
55 | public Integer getId() {
56 | return id;
57 | }
58 | public void setId(Integer id) {
59 | this.id = id;
60 | }
61 | public String getCity() {
62 | return city;
63 | }
64 | public void setCity(String city) {
65 | this.city = city;
66 | }
67 | public String getKeyWord() {
68 | return keyWord;
69 | }
70 | public void setKeyWord(String keyWord) {
71 | this.keyWord = keyWord;
72 | }
73 | public String getTitle() {
74 | return title;
75 | }
76 | public void setTitle(String title) {
77 | this.title = title;
78 | }
79 | public String getCompany() {
80 | return company;
81 | }
82 | public void setCompany(String company) {
83 | this.company = company;
84 | }
85 | public String getJob() {
86 | return job;
87 | }
88 | public void setJob(String job) {
89 | this.job = job;
90 | }
91 | public String getSalary() {
92 | return salary;
93 | }
94 | public void setSalary(String salary) {
95 | this.salary = salary;
96 | }
97 | public String getExperience() {
98 | return experience;
99 | }
100 | public void setExperience(String experience) {
101 | this.experience = experience;
102 | }
103 | public String getEducation() {
104 | return education;
105 | }
106 | public void setEducation(String education) {
107 | this.education = education;
108 | }
109 |
110 | public String getJobRequestMessage() {
111 | return jobRequestMessage;
112 | }
113 |
114 | public void setJobRequestMessage(String jobRequestMessage) {
115 | this.jobRequestMessage = jobRequestMessage;
116 | }
117 |
118 | public String getJobRequestUrl() {
119 | return jobRequestUrl;
120 | }
121 |
122 | public void setJobRequestUrl(String jobRequestUrl) {
123 | this.jobRequestUrl = jobRequestUrl;
124 | }
125 |
126 | 	public Map<String, Integer> getKeyMap() {
127 | 		return keyMap;
128 | 	}
129 | 	public void setKeyMap(Map<String, Integer> keyMap) {
130 | this.keyMap = keyMap;
131 | }
132 |
133 | @Override
134 | public String toString() {
135 | return "JobDataVO [id=" + id + ", city=" + city + ", keyWord=" + keyWord + ", title=" + title + ", company="
136 | + company + ", job=" + job + ", salary=" + salary + ", experience=" + experience + ", education="
137 | + education + ", jobRequestMessage=" + jobRequestMessage + ", jobRequestUrl=" + jobRequestUrl
138 | + ", keyMap=" + keyMap + "]";
139 | }
140 |
141 |
142 |
143 |
144 | }
145 |
--------------------------------------------------------------------------------
/src/main/java/com/random/crawler/BOSSRequestMessageCrawler.java:
--------------------------------------------------------------------------------
1 | package com.random.crawler;
2 |
3 | import java.io.File;
4 | import java.io.FileInputStream;
5 | import java.io.IOException;
6 | import java.sql.Connection;
7 | import java.sql.DriverManager;
8 | import java.sql.PreparedStatement;
9 | import java.sql.ResultSet;
10 | import java.util.List;
11 | import java.util.Properties;
12 |
13 | import org.openqa.selenium.chrome.ChromeDriver;
14 |
15 | /**
16 |  * Takes the urls out of job_message.txt (file format: key_word	job_request_url)
17 |  * and crawls each url, storing the data into database table job_message
18 |  *
19 |  * @author admin
20 |  *
21 |  */
22 | public class BOSSRequestMessageCrawler {
23 | 	private List<File> fileList;
24 | 	private Connection conn;
25 | 	private static String localdriver = null; // local browser driver location
26 | 	private ChromeDriver driver;
27 |
28 | 	/**
29 | 	 * Load the configuration file
30 | 	 */
31 | static {
32 | Properties property = new Properties();
33 | try {
34 | property.load(new FileInputStream("./src/main/java/com/random/properties"));
35 | } catch (IOException e) {
36 | e.printStackTrace();
37 | }
38 | localdriver = property.getProperty("LocalChromedriver");
39 | }
40 |
41 | 	/*
42 | 	 * Constructor: takes the fileList to process
43 | 	 */
44 | 	public BOSSRequestMessageCrawler(List<File> fileList) {
45 |
46 | try {
47 | Class.forName("org.gjt.mm.mysql.Driver");
48 | conn = DriverManager.getConnection("jdbc:mysql://localhost:3306/crawler_db?characterEncoding=utf-8", "root",
49 | "root");
50 | } catch (Exception e) {
51 | 			System.out.println("Database connection failed");
52 | }
53 | this.fileList = fileList;
54 | }
55 |
56 | 	/*
57 | 	 * Write the key and url pairs from job_message.txt into job_message, with status set to 0
58 | 	 */
59 | public void addAllLineToMySQL() {
60 |
61 | }
62 |
63 | public void crawlerMessage() {
64 | String sql = "SELECT url FROM job_message WHERE status=0 ORDER BY id LIMIT 1";
65 | try {
66 | PreparedStatement stmt = conn.prepareStatement(sql);
67 | ResultSet rs = stmt.executeQuery();
68 | 			// If no row came back, there is nothing to crawl
69 | 			// (the original rs.wasNull() check only inspects the last column read)
70 | 			if (!rs.next()) {
71 | 				return;
72 | 			} else {// the cursor now sits on the row
73 | 				String url = rs.getString(1);
74 | 
75 | }
76 | } catch (Exception e) {
77 |
78 | }
79 |
80 | }
81 | }
82 |
--------------------------------------------------------------------------------
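
A note on the file above: addAllLineToMySQL() is left empty; TaskManager.initData below implements the same contract. A minimal version for this class (a sketch mirroring initData; it additionally needs java.io.BufferedReader and java.io.InputStreamReader imports):

	public void addAllLineToMySQL() {
		for (File file : fileList) {
			try (BufferedReader reader = new BufferedReader(
					new InputStreamReader(new FileInputStream(file), "UTF-8"))) {
				String line;
				while ((line = reader.readLine()) != null) {
					String[] cols = line.split("\t"); // key_word \t job_request_url
					try (PreparedStatement stmt = conn.prepareStatement(
							"INSERT INTO job_message(key_word,url,status) VALUES(?,?,0)")) {
						stmt.setString(1, cols[0]);
						stmt.setString(2, cols[1]);
						stmt.executeUpdate();
					}
				}
			} catch (Exception e) {
				System.out.println("failed to read " + file);
			}
		}
	}
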
/src/main/java/com/random/crawler/TaskManager.java:
--------------------------------------------------------------------------------
1 | package com.random.crawler;
2 |
3 | import java.io.BufferedReader;
4 | import java.io.File;
5 | import java.io.FileInputStream;
6 | import java.io.InputStreamReader;
7 | import java.io.ObjectInputStream;
8 | import java.sql.Blob;
9 | import java.sql.Connection;
10 | import java.sql.DriverManager;
11 | import java.sql.PreparedStatement;
12 | import java.sql.ResultSet;
13 | import java.sql.SQLException;
14 | import java.util.ArrayList;
15 | import java.util.HashMap;
16 | import java.util.List;
17 | import java.util.Map;
18 | import java.util.Set;
19 |
20 | import org.openqa.selenium.By;
21 | import org.openqa.selenium.chrome.ChromeDriver;
22 |
23 | import jeasy.analysis.MMAnalyzer;
24 |
25 | public class TaskManager {
26 | private Connection conn;
27 | private ChromeDriver driver;
28 |
29 | public TaskManager() {
30 | try {
31 | Class.forName("com.mysql.jdbc.Driver");
32 | conn = DriverManager.getConnection("jdbc:mysql://localhost:3306/crawler_db?characterEncoding=utf-8", "root",
33 | "root");
34 |
35 | } catch (Exception e) {
36 | 			System.out.println("Database connection failed");
37 | 		}
38 | 		System.setProperty("webdriver.chrome.driver", "D:/chrome_driver/chromedriver.exe");
39 | 		driver = new ChromeDriver();
40 | 		System.out.println("Constructor finished");
41 | }
42 |
43 | 	/**
44 | 	 *
45 | 	 * Write the key and url pairs from job_message.txt into job_message, with status set to 0
46 | 	 *
47 | 	 * @param fileList
48 | 	 */
49 | 	public void initData(List<File> fileList) {
50 | for (File file : fileList) {
51 | try {
52 | BufferedReader reader = new BufferedReader(new InputStreamReader(new FileInputStream(file), "UTF-8"));
53 | String line = null;
54 | String sql = null;
55 | while ((line = reader.readLine()) != null) {
56 | String key = line.split("\t")[0];
57 | String url = line.split("\t")[1];
58 | sql = "INSERT INTO job_message(key_word,url,status) VALUES(?,?,0)";
59 | PreparedStatement stmt = conn.prepareStatement(sql);
60 | stmt.setString(1, key);
61 | stmt.setString(2, url);
62 | stmt.executeUpdate();
63 | stmt.close();
64 | }
65 | reader.close();
66 | } catch (Exception e) {
67 | 				System.out.println("File read failed");
68 | }
69 | }
70 | }
71 |
72 | 	/**
73 | 	 * The crawler fetches urls from the table and crawls their message,
74 | 	 * setting status to 1 for each url it claims
75 | 	 */
76 | public void startCrawler() {
77 | 		try {// disable transaction auto-commit
78 | 			conn.setAutoCommit(false);
79 | 			System.out.println("auto-commit set to false");
80 | 		} catch (SQLException e1) {
81 | 			System.out.println("failed to set auto-commit to false!");
82 | }
83 | s: while (true) {
84 | String sql = "SELECT url_id,url FROM job_message WHERE status=0 ORDER BY url_id LIMIT 1";
85 | try {
86 | PreparedStatement stmt = conn.prepareStatement(sql);
87 | ResultSet rs = stmt.executeQuery();
88 | 				// If no row came back, we are done
89 | 				// (rs.wasNull() only inspects the last column read, so next() is the real test)
90 | 				if (!rs.next()) {
91 | 					conn.commit();
92 | 					return;
93 | 				} else {// the cursor now sits on the row
94 | 					// System.out.println("fetched a record");
95 | 					int urlId = rs.getInt(1);
96 | 					String url = rs.getString(2);
97 | 					// System.out.println("url_id:" + urlId + " url:" + url);
98 | sql = "UPDATE job_message SET status = 1 WHERE url_id = ?";
99 | stmt = conn.prepareStatement(sql);
100 | stmt.setInt(1, urlId);
101 | stmt.executeUpdate();
102 | conn.commit();
103 | 					// url ---> message; getBossMessage() here (the Lagou variant has its own method)
104 | 					String message = null;
105 | 					// int reTry = 0;
106 | 					// If fetching the message fails, retry up to 2 times
107 | 					// while ((message = getBossMessage(url)) == null && reTry <2) {
108 | 					// reTry++;
109 | 					// }
110 | 					// If it is still null, give up on this url
111 | 					if ((message = getBossMessage(url)) == null) {
112 | 						System.out.println("Could not fetch this message at all; skipping urlID:" + urlId);
113 | continue s;
114 | }
115 | try {
116 | sql = "UPDATE job_message SET message=? WHERE url_id=?";
117 | stmt = conn.prepareStatement(sql);
118 | stmt.setString(1, message);
119 | stmt.setInt(2, urlId);
120 | stmt.executeUpdate();
121 | conn.commit();
122 | } catch (Exception e) {
123 | 						System.out.println("A record was set to status=1 but its message was never written! urlID:" + urlId);
124 | }
125 | }
126 | } catch (Exception e) {
127 | e.printStackTrace();
128 | try {
129 | conn.rollback();
130 | } catch (SQLException e1) {
131 | e1.printStackTrace();
132 | }
133 | continue s;
134 | }
135 | }
136 | }
137 |
138 | 	/*
139 | 	 * Fetch a BOSS Zhipin message dynamically from the given url
140 | 	 */
141 | private String getBossMessage(String url) {
142 | String message = null;
143 | try {
144 | driver.get(url);
145 | Thread.sleep(2000);
146 | message = driver.findElement(By.cssSelector("div.detail-content .job-sec")).getText();
147 | } catch (Exception e) {
148 | try {
149 | Thread.sleep(20 * 1000);
150 | } catch (InterruptedException e1) {
151 | e1.printStackTrace();
152 | }
153 | 			System.out.println("Failed to fetch the message for this url; a captcha may be required");
154 | }
155 | return message;
156 | }
157 |
158 | 	/**
159 | 	 * Exit condition: no record with status=1 remains and messages have been filled in
160 | 	 */
161 | public void pickMapFromMessage() {
162 | s: while (true) {
163 | try {
164 | conn.setAutoCommit(false);
165 | } catch (SQLException e1) {
166 | 			System.out.println("Failed to disable auto-commit");
167 | 		}
168 | 		Map<String, Integer> message_map = new HashMap<String, Integer>();
169 | String sql = "SELECT url_id,message FROM job_message WHERE status=1 ORDER BY url_id LIMIT 1";
170 | try {
171 | PreparedStatement stmt = conn.prepareStatement(sql);
172 | ResultSet rs = stmt.executeQuery();
173 | conn.commit();
174 | 				// Method exit: no status=1 records remain (next() is the real emptiness test)
175 | 				if (!rs.next()) {
176 | 					return;
177 | 				} else {
178 | 					// The cursor now sits on the row
179 | 					int urlId = rs.getInt(1);
180 | 					// Mark the record as processed: status = 2
181 | try {
182 | sql = "update job_message set status=2 where url_id=" + urlId;
183 | stmt = conn.prepareStatement(sql);
184 | stmt.executeUpdate();
185 | conn.commit();
186 | } catch (Exception e1) {
187 | 						System.out.println("Failed to set status=2 for the processed message, url_id:" + urlId);
188 | try {
189 | conn.rollback();
190 | } catch (Exception e) {
191 | e.printStackTrace();
192 | }
193 |
194 | }
195 | String message = rs.getString(2);
196 | MMAnalyzer mm = new MMAnalyzer();
197 | String[] keys = mm.segment(message, "|").split("\\|");
198 | for (String key : keys) {
199 | if (key.matches("[a-zA-Z/#\\\\]+")) {
200 | 						// English token already counted: increment its value
201 | 						if (message_map.containsKey(key)) {
202 | 							message_map.put(key, message_map.get(key) + 1);
203 | 						} else {// first occurrence of this token
204 | message_map.put(key, 1);
205 | }
206 | }
207 | }
208 |
209 | try {
210 | sql = "UPDATE job_message SET message_map = ? WHERE url_id=?";
211 | stmt = conn.prepareStatement(sql);
212 | stmt.setObject(1, message_map);
213 | stmt.setInt(2, urlId);
214 | stmt.executeUpdate();
215 | conn.commit();
216 | } catch (Exception e) {
217 | conn.rollback();
218 | 						System.out.println("Failed to write message_map, url_id:" + urlId);
219 | }
220 | }
221 | } catch (Exception e) {
222 | try {
223 | conn.rollback();
224 | } catch (SQLException e1) {
225 | e1.printStackTrace();
226 | }
227 | e.printStackTrace();
228 | continue s;
229 | }
230 | }
231 | }
232 |
233 | 	public Map<String, Integer> readMap() {
234 |
235 | String sql = "SELECT message_map FROM job_message WHERE message IS NOT NULL ORDER BY url_id LIMIT 1";
236 | try {
237 | PreparedStatement stmt = conn.prepareStatement(sql);
238 | ResultSet rs = stmt.executeQuery();
239 | // System.out.println("rs.size" + rs.wasNull());
240 | rs.next();
241 | Blob message_map = rs.getBlob(1);
242 | ObjectInputStream objIs = new ObjectInputStream(message_map.getBinaryStream());
243 | 			Map<String, Integer> map = (Map<String, Integer>) objIs.readObject();
244 | 			// Set<String> keySet = map.keySet();
245 | // for (String key : keySet) {
246 | // System.out.println("key:" + key + " value:" + map.get(key));
247 | // }
248 |
249 | 			// Be sure to handle the case where the map is null
250 |
251 | } catch (Exception e) {
252 | e.printStackTrace();
253 | }
254 | return null;
255 | }
256 |
257 | public List