├── .gitignore
├── LICENSE
├── README
├── pom.xml
├── result-sources
│   ├── EdmundDXu
│   │   └── files
│   │       ├── city_url.sql
│   │       ├── clean_result_test.txt
│   │       ├── emp.txt
│   │       ├── index.jsp
│   │       ├── lagou.doc
│   │       ├── lagou.jsp
│   │       ├── lagou.sql
│   │       ├── lagou.txt
│   │       └── ready_url.sql
│   ├── radish
│   │   └── BossUrl.txt
│   ├── random
│   │   ├── emp.txt
│   │   └── job_message.sql
│   └── wcy
│       └── emp.txt
├── result
│   ├── 概要设计
│   │   ├── BossUrl.txt
│   │   ├── cities.txt
│   │   ├── emp.txt
│   │   └── 概要设计.doc
│   ├── 模块一_各大省会招聘概况
│   │   ├── 数据可视化-2018-4-26
│   │   │   └── view
│   │   │       ├── js
│   │   │       │   ├── echarts.min.js
│   │   │       │   ├── jquery-1.11.2.js
│   │   │       │   └── js.js
│   │   │       └── view1.html
│   │   └── 数据清洗阶段-2018-4-26
│   │       ├── 清洗前_lagou.sql+job_data.sql
│   │       │   ├── job_data.sql
│   │       │   ├── lagou.sql
│   │       │   └── url_list.sql
│   │       ├── 清洗后_job_data_result.sql
│   │       │   └── job_data_result.sql
│   │       └── 清洗过程.doc
│   ├── 模块二_各大编程语言的工作能力成熟度分析
│   │   ├── file
│   │   ├── 数据可视化-2018-4-27
│   │   │   └── view
│   │   │       ├── dynamic
│   │   │       │   └── EChartsDemo.war
│   │   │       └── static
│   │   │           ├── android.html
│   │   │           ├── auto_scroll.html
│   │   │           ├── c#.html
│   │   │           ├── c++.html
│   │   │           ├── index.html
│   │   │           ├── java.html
│   │   │           ├── js
│   │   │           │   ├── echarts-wordcloud.js
│   │   │           │   └── echarts.js
│   │   │           ├── linux.html
│   │   │           ├── python.html
│   │   │           └── web.html
│   │   └── 数据清洗阶段-2018-4-27
│   │       ├── 清洗前_lagou.sql+job_message.sql
│   │       │   ├── job_message.sql
│   │       │   └── lagou.sql
│   │       ├── 清洗后_key_map.sql
│   │       │   ├── key_map.sql
│   │       │   └── lagou_export
│   │       │       ├── android.txt
│   │       │       ├── c#.txt
│   │       │       ├── c++.txt
│   │       │       ├── java.txt
│   │       │       ├── linux.txt
│   │       │       ├── python.txt
│   │       │       └── web.txt
│   │       └── 清洗过程.doc
│   ├── 源文件
│   │   ├── CrawlerApp-0.0.1-SNAPSHOT-javadoc.jar
│   │   ├── CrawlerApp-0.0.1-SNAPSHOT-sources.jar
│   │   └── CrawlerApp-0.0.1-SNAPSHOT.jar
│   └── 项目演讲.ppt
├── sources
│   ├── hadoop.dll
│   └── winutils.exe
├── src
│   └── main
│       └── java
│           ├── com
│           │   ├── edmund
│           │   │   ├── crawler
│           │   │   │   ├── JobCrawler.java
│           │   │   │   ├── KeyMapMerger.java
│           │   │   │   ├── LGJobCleaner.java
│           │   │   │   ├── LGJobCrawler.java
│           │   │   │   ├── LGJobCrawlerThread.java
│           │   │   │   └── LGJobUrlGenerator.java
│           │   │   ├── properties
│           │   │   ├── test
│           │   │   │   └── Test.java
│           │   │   ├── utils
│           │   │   │   ├── DBUtils.java
│           │   │   │   ├── DataBaseConnection.java
│           │   │   │   ├── LGCleanUtils.java
│           │   │   │   └── LGDBUtils.java
│           │   │   └── vo
│           │   │       ├── Job.java
│           │   │       ├── KeyMap.java
│           │   │       └── LGJob.java
│           │   ├── radish
│           │   │   ├── HDFSUtil
│           │   │   │   └── HDFSTest.java
│           │   │   ├── analysis
│           │   │   │   └── DataAnalysiser.java
│           │   │   ├── crawler
│           │   │   │   ├── BOSSCrawlerManager.java
│           │   │   │   ├── BOSSProvinceCrawler.java
│           │   │   │   ├── Test.java
│           │   │   │   └── distributed
│           │   │   │       ├── DistributedCrawler.java
│           │   │   │       └── Test.java
│           │   │   ├── dataclean
│           │   │   │   └── DataCleaner.java
│           │   │   ├── util
│           │   │   │   ├── MyUtil.java
│           │   │   │   └── UrlListIniter.java
│           │   │   └── vo
│           │   │       ├── BOSSUrlVO.java
│           │   │       └── JobDataVO.java
│           │   ├── random
│           │   │   ├── crawler
│           │   │   │   ├── BOSSRequestMessageCrawler.java
│           │   │   │   ├── TaskManager.java
│           │   │   │   └── Test.java
│           │   │   ├── properties
│           │   │   └── test
│           │   │       └── Demo.java
│           │   └── wcy
│           │       └── test
│           │           └── Test.java
│           └── log4j.properties
└── 概要设计.doc

/.gitignore: --------------------------------------------------------------------------------
1 | /.settings
2 | /.classpath
3 | /.project
4 | /target
5 | /bin/
6 | /~$oject_Info.doc
7 |
-------------------------------------------------------------------------------- /LICENSE: --------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2018 radishT
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
-------------------------------------------------------------------------------- /README: --------------------------------------------------------------------------------
1 | Please use this with discretion, everyone..
2 |
3 |
-------------------------------------------------------------------------------- /pom.xml: --------------------------------------------------------------------------------
1 | <project xmlns="http://maven.apache.org/POM/4.0.0"
2 |     xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
3 |     xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
4 |   <modelVersion>4.0.0</modelVersion>
5 |
6 |   <groupId>com.radish.Crawler</groupId>
7 |   <artifactId>CrawlerApp</artifactId>
8 |   <version>0.0.1-SNAPSHOT</version>
9 |   <packaging>jar</packaging>
10 |   <name>CrawlerApp</name>
11 |   <url>http://maven.apache.org</url>
12 |
13 |   <properties>
14 |     <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
15 |   </properties>
16 |
17 |   <dependencies>
18 |     <dependency>
19 |       <groupId>jdk.tools</groupId>
20 |       <artifactId>jdk.tools</artifactId>
21 |       <version>1.8</version>
22 |       <scope>system</scope>
23 |       <systemPath>${JAVA_HOME}/lib/tools.jar</systemPath>
24 |     </dependency>
25 |     <dependency>
26 |       <groupId>junit</groupId>
27 |       <artifactId>junit</artifactId>
28 |       <version>3.8.1</version>
29 |       <scope>test</scope>
30 |     </dependency>
31 |     <dependency>
32 |       <groupId>org.jsoup</groupId>
33 |       <artifactId>jsoup</artifactId>
34 |       <version>1.10.3</version>
35 |     </dependency>
36 |     <dependency>
37 |       <groupId>mysql</groupId>
38 |       <artifactId>mysql-connector-java</artifactId>
39 |       <version>5.1.29</version>
40 |     </dependency>
41 |     <dependency>
42 |       <groupId>org.apache.hadoop</groupId>
43 |       <artifactId>hadoop-hdfs</artifactId>
44 |       <version>2.7.2</version>
45 |     </dependency>
46 |     <dependency>
47 |       <groupId>org.apache.hadoop</groupId>
48 |       <artifactId>hadoop-common</artifactId>
49 |       <version>2.7.2</version>
50 |       <scope>provided</scope>
51 |     </dependency>
52 |     <dependency>
53 |       <groupId>log4j</groupId>
54 |       <artifactId>log4j</artifactId>
55 |       <version>1.2.17</version>
56 |     </dependency>
57 |     <dependency>
58 |       <groupId>org.seleniumhq.selenium</groupId>
59 |       <artifactId>selenium-chrome-driver</artifactId>
60 |       <version>3.6.0</version>
61 |     </dependency>
62 |     <dependency>
63 |       <groupId>org.seleniumhq.selenium</groupId>
64 |       <artifactId>selenium-java</artifactId>
65 |       <version>3.6.0</version>
66 |     </dependency>
67 |     <dependency>
68 |       <groupId>com.google.guava</groupId>
69 |       <artifactId>guava</artifactId>
70 |       <version>23.0</version>
71 |     </dependency>
72 |     <dependency>
73 |       <groupId>com.alibaba</groupId>
74 |       <artifactId>fastjson</artifactId>
75 |       <version>1.2.37</version>
76 |     </dependency>
77 |     <dependency>
78 |       <groupId>org.apache.lucene</groupId>
79 |       <artifactId>lucene-core</artifactId>
80 |       <version>2.0.0</version>
81 |     </dependency>
82 |     <dependency>
83 |       <groupId>je</groupId>
84 |       <artifactId>analysis</artifactId>
85 |       <version>1.5.3</version>
86 |     </dependency>
87 |   </dependencies>
88 |
89 |   <build>
90 |     <plugins>
91 |       <plugin>
92 |         <groupId>org.apache.maven.plugins</groupId>
93 |         <artifactId>maven-compiler-plugin</artifactId>
94 |         <version>3.5.1</version>
95 |         <configuration>
96 |           <source>1.8</source>
97 |           <target>1.8</target>
98 |         </configuration>
99 |       </plugin>
100 |       <plugin>
101 |         <groupId>org.apache.maven.plugins</groupId>
102 |         <artifactId>maven-javadoc-plugin</artifactId>
103 |         <version>2.7</version>
104 |         <executions>
105 |           <execution>
106 |             <id>attach-javadocs</id>
107 |             <goals>
108 |               <goal>jar</goal>
109 |             </goals>
110 |             <configuration>
111 |               <additionalparam>-Xdoclint:none</additionalparam>
112 |             </configuration>
113 |           </execution>
114 |         </executions>
115 |       </plugin>
116 |       <plugin>
117 |         <groupId>org.apache.maven.plugins</groupId>
118 |         <artifactId>maven-source-plugin</artifactId>
119 |         <version>3.0.1</version>
120 |         <executions>
121 |           <execution>
122 |             <id>attach-sources</id>
123 |             <goals>
124 |               <goal>jar</goal>
125 |             </goals>
126 |           </execution>
127 |         </executions>
128 |       </plugin>
129 |     </plugins>
130 |   </build>
131 | </project>
-------------------------------------------------------------------------------- /result-sources/EdmundDXu/files/clean_result_test.txt: --------------------------------------------------------------------------------
1 | # Run after data cleaning to check whether any bad records slipped through
2 | SELECT
3 | *
4 | FROM
5 | job_data_result
6 | WHERE
7 | min_experience < 0
8 | OR min_education < 0
9 | OR min_salary = 0
10 | OR max_salary = 0
11 | OR avg_salary = 0;
-------------------------------------------------------------------------------- /result-sources/EdmundDXu/files/index.jsp: --------------------------------------------------------------------------------
1 | <%@ page language="java" contentType="text/html; charset=UTF-8"
2 | pageEncoding="UTF-8"%>
3 | <%
4 | String path = request.getContextPath();
5 | String basePath = request.getScheme() + 
"://" 6 | + request.getServerName() + ":" + request.getServerPort() 7 | + path + "/"; 8 | %> 9 | 10 | 11 | 12 | 13 | 14 | Echarts Demo 15 | 16 | 17 | 100 | 101 | 102 |
103 | 104 | -------------------------------------------------------------------------------- /result-sources/EdmundDXu/files/lagou.doc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/radishT/Job_Analysis/24a595ac551756279487c8a6d9f5e8bd03ce4465/result-sources/EdmundDXu/files/lagou.doc -------------------------------------------------------------------------------- /result-sources/EdmundDXu/files/lagou.jsp: -------------------------------------------------------------------------------- 1 | <%@ page language="java" contentType="text/html; charset=UTF-8" 2 | pageEncoding="UTF-8"%> 3 | <% 4 | String path = request.getContextPath(); 5 | String basePath = request.getScheme() + "://" 6 | + request.getServerName() + ":" + request.getServerPort() 7 | + path + "/"; 8 | %> 9 | 10 | 11 | 12 | 13 | 14 | Echarts Demo 15 | 16 | 17 | 265 | 266 | 267 |
268 | 269 | 270 | 271 | 272 | -------------------------------------------------------------------------------- /result-sources/EdmundDXu/files/lagou.txt: -------------------------------------------------------------------------------- 1 | 北京 2 | 上海 3 | 深圳 4 | 广州 5 | 杭州 6 | 成都 7 | 南京 8 | 武汉 9 | 西安 10 | 厦门 11 | 长沙 12 | 苏州 13 | 天津 14 | 安庆 15 | 鞍山 16 | 澳门特别行政区 17 | 安阳 18 | 阿克苏 19 | 北京 20 | 保定 21 | 包头 22 | 北海 23 | 蚌埠 24 | 滨州 25 | 宝鸡 26 | 百色 27 | 巴中 28 | 亳州 29 | 本溪 30 | 保山 31 | 白山 32 | 巴彦淖尔 33 | 白银 34 | 成都 35 | 长沙 36 | 重庆 37 | 长春 38 | 常州 39 | 沧州 40 | 常德 41 | 潮州 42 | 赤峰 43 | 承德 44 | 滁州 45 | 郴州 46 | 楚雄 47 | 池州 48 | 昌吉 49 | 朝阳 50 | 东莞 51 | 德阳 52 | 东营 53 | 达州 54 | 德州 55 | 大庆 56 | 大同 57 | 丹东 58 | 德宏 59 | 定西 60 | 迪庆 61 | 恩施 62 | 鄂尔多斯 63 | 鄂州 64 | 佛山 65 | 福州 66 | 阜阳 67 | 抚顺 68 | 抚州 69 | 防城港 70 | 阜新 71 | 广州 72 | 贵阳 73 | 赣州 74 | 广元 75 | 贵港 76 | 甘孜藏族自治州 77 | 广安 78 | 高雄 79 | 杭州 80 | 合肥 81 | 哈尔滨 82 | 惠州 83 | 海口 84 | 呼和浩特 85 | 湖州 86 | 邯郸 87 | 淮安 88 | 菏泽 89 | 黄冈 90 | 衡水 91 | 河源 92 | 衡阳 93 | 黄石 94 | 汉中 95 | 河池 96 | 淮北 97 | 红河 98 | 怀化 99 | 淮南 100 | 黄山 101 | 贺州 102 | 鹤壁 103 | 黑河 104 | 济南 105 | 金华 106 | 嘉兴 107 | 江门 108 | 济宁 109 | 揭阳 110 | 荆州 111 | 晋中 112 | 九江 113 | 景德镇 114 | 晋城 115 | 酒泉 116 | 焦作 117 | 吉安 118 | 鸡西 119 | 锦州 120 | 佳木斯 121 | 昆明 122 | 开封 123 | 克拉玛依 124 | 廊坊 125 | 兰州 126 | 洛阳 127 | 拉萨 128 | 泸州 129 | 龙岩 130 | 漯河 131 | 乐山 132 | 莱芜 133 | 娄底 134 | 来宾 135 | 绵阳 136 | 梅州 137 | 眉山 138 | 马鞍山 139 | 茂名 140 | 牡丹江 141 | 南京 142 | 宁波 143 | 南昌 144 | 南宁 145 | 南通 146 | 南充 147 | 南阳 148 | 宁德 149 | 内江 150 | 南平 151 | 莆田 152 | 濮阳 153 | 攀枝花 154 | 盘锦 155 | 平顶山 156 | 萍乡 157 | 青岛 158 | 泉州 159 | 秦皇岛 160 | 清远 161 | 衢州 162 | 黔西南 163 | 曲靖 164 | 齐齐哈尔 165 | 庆阳 166 | 黔东南 167 | 黔南 168 | 钦州 169 | 日照 170 | 上海 171 | 深圳 172 | 苏州 173 | 沈阳 174 | 石家庄 175 | 汕头 176 | 绍兴 177 | 三亚 178 | 韶关 179 | 商丘 180 | 十堰 181 | 宿迁 182 | 汕尾 183 | 上饶 184 | 遂宁 185 | 邵阳 186 | 三明 187 | 三沙 188 | 宿州 189 | 随州 190 | 三门峡 191 | 松原 192 | 石嘴山 193 | 商洛 194 | 双鸭山 195 | 朔州 196 | 天津 197 | 太原 198 | 唐山 199 | 台州 200 | 泰安 201 | 泰州 202 | 台北 203 | 天水 204 | 铜仁 205 | 通化 206 | 武汉 207 | 无锡 208 | 温州 209 | 潍坊 210 | 威海 211 | 芜湖 212 | 渭南 213 | 梧州 214 | 吴忠 215 | 武威 216 | 文山 217 | 西安 218 | 厦门 219 | 徐州 220 | 西宁 221 | 咸阳 222 | 香港特别行政区 223 | 新乡 224 | 邢台 225 | 襄阳 226 | 湘潭 227 | 许昌 228 | 咸宁 229 | 信阳 230 | 新余 231 | 宣城 232 | 孝感 233 | 新北 234 | 忻州 235 | 湘西土家族苗族自治州 236 | 烟台 237 | 扬州 238 | 银川 239 | 盐城 240 | 宜昌 241 | 宜春 242 | 宜宾 243 | 岳阳 244 | 永州 245 | 阳江 246 | 运城 247 | 益阳 248 | 阳泉 249 | 雅安 250 | 云浮 251 | 延安 252 | 鹰潭 253 | 玉溪 254 | 延边 255 | 营口 256 | 郑州 257 | 珠海 258 | 中山 259 | 肇庆 260 | 淄博 261 | 镇江 262 | 湛江 263 | 株洲 264 | 漳州 265 | 遵义 266 | 驻马店 267 | 资阳 268 | 舟山 269 | 张家口 270 | 长治 271 | 枣庄 272 | 中卫 273 | 周口 274 | 张家界 275 | 张掖 276 | 昭通 -------------------------------------------------------------------------------- /result-sources/random/job_message.sql: -------------------------------------------------------------------------------- 1 | /* 2 | Navicat MySQL Data Transfer 3 | 4 | Source Server : localDB 5 | Source Server Version : 50554 6 | Source Host : localhost:3306 7 | Source Database : crawler_db 8 | 9 | Target Server Type : MYSQL 10 | Target Server Version : 50554 11 | File Encoding : 65001 12 | 13 | Date: 2018-04-27 20:06:34 14 | */ 15 | 16 | SET FOREIGN_KEY_CHECKS=0; 17 | 18 | -- ---------------------------- 19 | -- Table structure for job_message 20 | -- ---------------------------- 21 | DROP TABLE IF EXISTS `job_message`; 22 | CREATE TABLE `job_message` ( 23 | `url_id` int(11) NOT NULL AUTO_INCREMENT, 24 | `key_word` varchar(255) DEFAULT NULL, 25 | `url` varchar(500) DEFAULT NULL, 26 | `message` 
varchar(2000) DEFAULT NULL, 27 | `status` int(10) DEFAULT NULL, 28 | `message_map` blob, 29 | PRIMARY KEY (`url_id`) 30 | ) ENGINE=InnoDB AUTO_INCREMENT=46354 DEFAULT CHARSET=utf8; 31 | -------------------------------------------------------------------------------- /result/概要设计/概要设计.doc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/radishT/Job_Analysis/24a595ac551756279487c8a6d9f5e8bd03ce4465/result/概要设计/概要设计.doc -------------------------------------------------------------------------------- /result/模块一_各大省会招聘概况/数据可视化-2018-4-26/view/js/js.js: -------------------------------------------------------------------------------- 1 | var dataMap = {}; 2 | function dataFormatter(obj) { 3 | var pList = ['北京','天津','河北','山西','内蒙古','辽宁','吉林','黑龙江','上海','江苏','浙江','安徽','福建','江西','山东','河南','湖北','湖南','广东','广西','海南','重庆','四川','贵州','云南','西藏','陕西','甘肃','青海','宁夏','新疆']; 4 | var temp; 5 | for (var year = 2018; year <= 2018; year++) { 6 | var max = 0; 7 | var sum = 0; 8 | temp = obj[year]; 9 | for (var i = 0, l = temp.length; i < l; i++) { 10 | max = Math.max(max, temp[i]); 11 | sum += temp[i]; 12 | obj[year][i] = { 13 | name : pList[i], 14 | value : temp[i] 15 | } 16 | } 17 | obj[year + 'max'] = Math.floor(max / 100) * 100; 18 | obj[year + 'sum'] = sum; 19 | } 20 | return obj; 21 | } 22 | 23 | dataMap.javadata1 = dataFormatter({ 24 | //max : 60000, 25 | 2018:[994,419,596,336,155,624,314,301,750,2191,999,367,1099,388,888,592,599,1096,1669,246,113,30,366,295,295,7,590,106,28,55,135] 26 | }); 27 | dataMap.javadata2 = dataFormatter({ 28 | //max : 4000, 29 | 2018:[82.44,84.21,956.84,197.8,374.69,590.2,446.17,474.2,79.68,1110.44,685.2,783.66,664.78,535.98,1390,1288.36,707,847.25,1015.08,601.99,222.89,317.87,1047.95,281.1,463.44,39.75,282.21,215.51,47.31,52.95,305] 30 | }); 31 | dataMap.javadata1=dataFormatter({2018:[994,419,596,336,155,624,314,301,750,2191,999,367,1099,388,888,592,599,1096,1669,246,113,30,366,295,295,7,590,106,28,55,135]}); 32 | dataMap.javadata2=dataFormatter({2018:[21764.59,10559.67,8471.48,7636.9,7425.81,8369.39,7033.44,7458.47,17978.67,11696.71,11914.41,9843.32,11484.53,7956.19,8363.74,8919.76,11954.09,10407.85,14360.1,8768.29,9088.5,9700.0,12265.03,8766.1,8584.75,8642.86,11192.37,7830.19,7928.57,7345.45,8633.33]}); 33 | dataMap.javadata3=dataFormatter({2018:[27463.78,13226.73,10619.13,9473.21,9406.45,10479.17,8789.81,9172.76,22457.33,14712.46,15050.05,12247.96,14495.91,10046.39,10451.58,11150.34,14989.98,12933.39,18004.19,10995.93,11345.13,12166.67,15330.6,10867.8,10654.24,10000.0,14125.42,9820.75,10142.86,9145.45,10666.67]}); 34 | dataMap.javadata4=dataFormatter({2018:[16065.39,7892.6,6323.83,5800.6,5445.16,6259.62,5277.07,5744.19,13500.0,8680.97,8778.78,7438.69,8473.16,5865.98,6275.9,6689.19,8918.2,7882.3,10716.0,6540.65,6831.86,7233.33,9199.45,6664.41,6515.25,7285.71,8259.32,5839.62,5714.29,5545.45,6600.0]}); 35 | dataMap.javadata5=dataFormatter({2018:[0.88,0.68,0.46,0.58,0.66,0.55,0.57,0.53,0.75,0.63,0.58,0.61,0.55,0.51,0.53,0.51,0.68,0.52,0.6,0.5,0.65,0.57,0.62,0.66,0.53,0.57,0.73,0.55,0.64,0.6,0.54]}); 36 | dataMap.javadata6=dataFormatter({2018:[0.17,0.38,0.45,0.4,0.52,0.41,0.53,0.49,0.21,0.35,0.47,0.44,0.4,0.51,0.47,0.36,0.29,0.25,0.31,0.41,0.47,0.37,0.32,0.48,0.53,0.29,0.27,0.58,0.5,0.45,0.55]}); 37 | dataMap.cdata1=dataFormatter({2018:[436,263,172,6,11,425,66,2,404,854,635,245,606,8,557,319,238,141,971,64,23,171,384,26,64,2,267,2,1,4,13]}); 38 | 
dataMap.cdata2=dataFormatter({2018:[14083.72,8471.48,7229.65,6833.33,7181.82,8261.18,7242.42,6500.0,13647.28,10152.22,10826.77,9383.67,9854.79,7125.0,8529.62,8716.3,9405.46,9127.66,10754.38,7046.88,7456.52,8418.13,10277.34,7269.23,8242.19,9000.0,9174.16,6500.0,3500.0,7625.0,8038.46]}); 39 | dataMap.cdata3=dataFormatter({2018:[17729.36,10608.37,9069.77,8666.67,8818.18,10294.12,9151.52,8500.0,17178.22,12761.12,13634.65,11730.61,12438.94,8750.0,10673.25,10827.59,11798.32,11326.24,13568.49,8765.63,9260.87,10608.19,12861.98,9038.46,10312.5,10000.0,11531.84,8500.0,5000.0,9250.0,10153.85]}); 40 | dataMap.cdata4=dataFormatter({2018:[10438.07,6334.6,5389.53,5000.0,5545.45,6228.24,5333.33,4500.0,10116.34,7543.33,8018.9,7036.73,7270.63,5500.0,6386.0,6605.02,7012.61,6929.08,7940.27,5328.13,5652.17,6228.07,7692.71,5500.0,6171.88,8000.0,6816.48,4500.0,2000.0,6000.0,5923.08]}); 41 | dataMap.cdata5=dataFormatter({2018:[0.72,0.59,0.44,0.5,0.27,0.45,0.59,0.5,0.62,0.53,0.53,0.61,0.38,0.38,0.46,0.36,0.49,0.35,0.43,0.36,0.3,0.44,0.52,0.42,0.47,0.0,0.64,0.5,1.0,0.5,0.31]}); 42 | dataMap.cdata6=dataFormatter({2018:[0.4,0.48,0.41,0.5,0.36,0.4,0.45,0.0,0.44,0.52,0.52,0.51,0.5,0.63,0.5,0.49,0.5,0.52,0.53,0.58,0.61,0.53,0.5,0.73,0.61,0.0,0.4,1.0,1.0,0.5,0.69]}); 43 | dataMap.linuxdata1=dataFormatter({2018:[697,307,460,17,104,518,185,6,179,957,688,331,682,107,784,361,350,330,1650,207,80,300,352,209,152,22,339,59,19,44,93]}); 44 | dataMap.linuxdata2=dataFormatter({2018:[20027.98,10506.51,8125.0,6941.18,6865.38,9862.93,7364.86,8583.33,17500.0,11242.95,12292.88,9965.26,11005.87,7523.36,9188.14,8860.11,11292.86,10207.58,12750.3,8978.26,8643.75,10453.33,12170.45,8607.66,8434.21,7090.91,11467.55,7322.03,7052.63,7488.64,7629.03]}); 45 | dataMap.linuxdata3=dataFormatter({2018:[25428.98,13172.64,10136.96,8705.88,8557.69,12158.3,9205.41,10833.33,22162.01,14208.99,15475.29,12522.66,13806.45,9308.41,11562.5,11077.56,14220.0,12742.42,16087.88,11309.18,10637.5,13140.0,15292.61,10741.63,10388.16,8545.45,14486.73,9016.95,8947.37,9590.91,9376.34]}); 46 | dataMap.linuxdata4=dataFormatter({2018:[14626.97,7840.39,6113.04,5176.47,5173.08,7567.57,5524.32,6333.33,12837.99,8276.91,9110.47,7407.85,8205.28,5738.32,6813.78,6642.66,8365.71,7672.73,9412.73,6647.34,6650.0,7766.67,9048.3,6473.68,6480.26,5636.36,8448.38,5627.12,5157.89,5386.36,5881.72]}); 47 | dataMap.linuxdata5=dataFormatter({2018:[0.81,0.66,0.45,0.41,0.49,0.66,0.56,0.5,0.71,0.58,0.59,0.58,0.51,0.43,0.54,0.37,0.62,0.5,0.56,0.45,0.51,0.54,0.65,0.64,0.49,0.41,0.77,0.59,0.47,0.48,0.55]}); 48 | dataMap.linuxdata6=dataFormatter({2018:[0.27,0.49,0.43,0.59,0.55,0.38,0.54,0.67,0.25,0.45,0.47,0.52,0.5,0.59,0.46,0.44,0.41,0.4,0.47,0.46,0.51,0.42,0.4,0.56,0.57,0.59,0.35,0.54,0.58,0.64,0.55]}); 49 | dataMap.pythondata1=dataFormatter({2018:[300,263,124,55,20,378,38,78,392,835,528,317,302,99,355,89,349,306,901,48,29,220,308,63,50,2,335,7,1,4,26]}); 50 | dataMap.pythondata2=dataFormatter({2018:[19906.67,11363.12,8475.81,7727.27,8225.0,9935.19,8315.79,9185.9,17931.12,12986.83,15310.61,11348.58,11470.2,8333.33,10349.3,9949.44,12750.72,11196.08,14235.85,11791.67,10017.24,11611.36,12951.3,9896.83,10950.0,10000.0,12594.03,6714.29,7500.0,8125.0,10019.23]}); 51 | dataMap.pythondata3=dataFormatter({2018:[25460.0,14288.97,10532.26,9672.73,10350.0,12423.28,10315.79,11217.95,22609.69,16513.77,19464.02,14441.64,14417.22,10393.94,13064.79,12483.15,16240.69,14068.63,18011.1,15125.0,12448.28,14722.73,16334.42,12285.71,13320.0,12000.0,16074.63,7857.14,10000.0,10750.0,12500.0]}); 52 | 
dataMap.pythondata4=dataFormatter({2018:[14353.33,8437.26,6419.35,5781.82,6100.0,7447.09,6315.79,7153.85,13252.55,9459.88,11157.2,8255.52,8523.18,6272.73,7633.8,7415.73,9260.74,8323.53,10460.6,8458.33,7586.21,8500.0,9568.18,7507.94,8580.0,8000.0,9113.43,5571.43,5000.0,5500.0,7538.46]}); 53 | dataMap.pythondata5=dataFormatter({2018:[0.83,0.82,0.45,0.78,0.7,0.73,0.61,0.67,0.82,0.67,0.71,0.76,0.56,0.65,0.63,0.6,0.76,0.59,0.66,0.58,0.55,0.63,0.73,0.75,0.52,0.0,0.8,0.43,0.0,1.0,0.62]}); 54 | dataMap.pythondata6=dataFormatter({2018:[0.36,0.47,0.43,0.42,0.75,0.4,0.42,0.44,0.39,0.39,0.51,0.44,0.46,0.6,0.43,0.44,0.4,0.39,0.47,0.44,0.59,0.4,0.38,0.51,0.48,1.0,0.32,0.57,0.0,1.0,0.5]}); 55 | dataMap.webdata1=dataFormatter({2018:[745,341,489,307,103,636,257,39,120,1370,1401,365,713,364,998,404,541,342,2663,207,112,30,707,238,267,4,427,66,9,32,59]}); 56 | dataMap.webdata2=dataFormatter({2018:[18572.48,8282.99,6674.85,6778.5,5990.29,7473.27,6250.97,6512.82,13766.67,9156.57,11143.83,8538.36,8714.59,7328.3,7329.16,6997.52,9791.13,8149.12,11408.56,7557.97,8093.75,8183.33,10073.55,8048.32,7569.29,7250.0,9241.22,7507.58,8944.44,6734.38,7177.97]}); 57 | dataMap.webdata3=dataFormatter({2018:[23524.83,10348.97,8370.14,8384.36,7495.15,9319.18,7821.01,8051.28,17175.0,11546.72,14066.38,10753.42,10955.12,9162.09,9174.35,8727.72,12316.08,10146.2,14328.58,9381.64,10053.57,10233.33,12736.92,10079.83,9423.22,8000.0,11665.11,9151.52,11444.44,8593.75,8796.61]}); 58 | dataMap.webdata4=dataFormatter({2018:[13620.13,6217.01,4979.55,5172.64,4485.44,5627.36,4680.93,4974.36,10358.33,6766.42,8221.27,6323.29,6474.05,5494.51,5483.97,5267.33,7266.17,6152.05,8488.55,5734.3,6133.93,6133.33,7410.18,6016.81,5715.36,6500.0,6817.33,5863.64,6444.44,4875.0,5559.32]}); 59 | dataMap.webdata5=dataFormatter({2018:[0.84,0.56,0.36,0.57,0.42,0.55,0.49,0.44,0.72,0.5,0.53,0.58,0.42,0.39,0.45,0.38,0.55,0.37,0.48,0.37,0.54,0.57,0.55,0.57,0.44,0.5,0.65,0.5,0.78,0.44,0.54]}); 60 | dataMap.webdata6=dataFormatter({2018:[0.28,0.54,0.55,0.51,0.66,0.49,0.59,0.56,0.45,0.53,0.53,0.56,0.58,0.6,0.58,0.57,0.42,0.51,0.47,0.55,0.49,0.63,0.43,0.55,0.56,0.5,0.4,0.61,0.22,0.56,0.66]}); 61 | dataMap.cppdata1=dataFormatter({2018:[209,8,4,2,0,0,0,3,15,44,274,25,53,3,14,1,124,42,454,1,1,21,181,2,1,0,45,0,0,0,0]}); 62 | dataMap.cppdata2=dataFormatter({2018:[22866.03,10687.5,8000.0,4750.0,0.0,0.0,0.0,9333.33,19133.33,13500.0,16945.26,11400.0,13141.51,9833.33,9071.43,11500.0,12516.13,11535.71,18411.89,7000.0,1500.0,11928.57,12447.51,9000.0,8000.0,0.0,12088.89,0.0,0.0,0.0,0.0]}); 63 | dataMap.cppdata3=dataFormatter({2018:[29071.77,13875.0,10250.0,5500.0,0.0,0.0,0.0,11333.33,24200.0,17431.82,21653.28,14760.0,16735.85,11666.67,11571.43,15000.0,15903.23,14809.52,23484.58,8000.0,2000.0,15190.48,15585.64,11500.0,10000.0,0.0,15644.44,0.0,0.0,0.0,0.0]}); 64 | dataMap.cppdata4=dataFormatter({2018:[16660.29,7500.0,5750.0,4000.0,0.0,0.0,0.0,7333.33,14066.67,9568.18,12237.23,8040.0,9547.17,8000.0,6571.43,8000.0,9129.03,8261.9,13339.21,6000.0,1000.0,8666.67,9309.39,6500.0,6000.0,0.0,8533.33,0.0,0.0,0.0,0.0]}); 65 | dataMap.cppdata5=dataFormatter({2018:[0.91,0.75,0.5,0.5,0.0,0.0,0.0,0.67,0.87,0.59,0.76,0.8,0.64,1.0,0.64,0.0,0.67,0.64,0.74,0.0,0.0,0.48,0.63,1.0,0.0,0.0,0.78,0.0,0.0,0.0,0.0]}); 66 | dataMap.cppdata6=dataFormatter({2018:[0.27,0.63,0.75,0.5,0.0,0.0,0.0,0.0,0.73,0.41,0.36,0.24,0.42,0.33,0.64,0.0,0.34,0.26,0.31,0.0,0.0,0.48,0.35,0.5,1.0,0.0,0.33,0.0,0.0,0.0,0.0]}); 67 | 
dataMap.androiddata1=dataFormatter({2018:[450,0,7,3,1,0,1,0,434,198,461,62,2,13,34,2,178,56,915,0,4,0,4,7,12,0,74,1,0,0,0]}); 68 | dataMap.androiddata2=dataFormatter({2018:[23723.33,0.0,7500.0,6166.67,8000.0,0.0,9500.0,0.0,18748.85,13020.2,15767.9,9467.74,7000.0,8730.77,8455.88,5750.0,10828.65,10535.71,14014.75,0.0,8250.0,0.0,7750.0,8142.86,9791.67,0.0,10250.0,9000.0,0.0,0.0,0.0]}); 69 | dataMap.androiddata3=dataFormatter({2018:[30215.56,0.0,9857.14,7666.67,9000.0,0.0,12000.0,0.0,23658.99,16585.86,20060.74,12080.65,8500.0,10923.08,10852.94,7000.0,13775.28,13285.71,17634.97,0.0,10500.0,0.0,9250.0,10000.0,12500.0,0.0,13040.54,12000.0,0.0,0.0,0.0]}); 70 | dataMap.androiddata4=dataFormatter({2018:[17231.11,0.0,5142.86,4666.67,7000.0,0.0,7000.0,0.0,13838.71,9454.55,11475.05,6854.84,5500.0,6538.46,6058.82,4500.0,7882.02,7785.71,10394.54,0.0,6000.0,0.0,6250.0,6285.71,7083.33,0.0,7459.46,6000.0,0.0,0.0,0.0]}); 71 | dataMap.androiddata5=dataFormatter({2018:[0.94,0.0,0.14,0.33,0.0,0.0,1.0,0.0,0.79,0.6,0.72,0.58,0.5,0.38,0.29,0.0,0.61,0.55,0.63,0.0,0.5,0.0,0.5,0.57,0.17,0.0,0.65,0.0,0.0,0.0,0.0]}); 72 | dataMap.androiddata6=dataFormatter({2018:[0.14,0.0,0.29,1.0,0.0,0.0,0.0,0.0,0.27,0.46,0.39,0.52,1.0,0.54,0.68,0.5,0.38,0.36,0.42,0.0,0.75,0.0,0.25,0.43,0.5,0.0,0.35,1.0,0.0,0.0,0.0]}); 73 | 74 | option = { 75 | baseOption: { 76 | timeline: { 77 | // y: 0, 78 | axisType: 'category', 79 | // realtime: false, 80 | // loop: false, 81 | autoPlay: false, 82 | // currentIndex: 2, 83 | playInterval: 1000, 84 | // controlStyle: { 85 | // position: 'left' 86 | // }, 87 | data: [ 88 | 'java','C#','linux','python','web','C++','android' 89 | ], 90 | label: { 91 | formatter : function(s) { 92 | return s; 93 | } 94 | } 95 | }, 96 | title: { 97 | subtext: '数据来自第八组萝卜中队' 98 | }, 99 | tooltip: { 100 | 101 | }, 102 | legend: { 103 | x: 'right', 104 | data: ['岗位需求量', '平均薪资', '最高薪资', '最低薪资', '本科及以上员工比例', '工作经验不限的比例'], 105 | //show:false 106 | }, 107 | calculable : true, 108 | grid: { 109 | top: 80, 110 | bottom: 100, 111 | tooltip: { 112 | trigger: 'axis', 113 | axisPointer: { 114 | type: 'shadow', 115 | label: { 116 | show: true, 117 | formatter: function (params) { 118 | return params.value.replace('\n', ''); 119 | } 120 | } 121 | } 122 | } 123 | }, 124 | xAxis: [ 125 | { 126 | 'type':'category', 127 | 'axisLabel':{'interval':0}, 128 | 'data':[ 129 | '北京','\n天津','河北','\n山西','内蒙古','\n辽宁','吉林','\n黑龙江', 130 | '上海','\n江苏','浙江','\n安徽','福建','\n江西','山东','\n河南', 131 | '湖北','\n湖南','广东','\n广西','海南','\n重庆','四川','\n贵州', 132 | '云南','\n西藏','陕西','\n甘肃','青海','\n宁夏','新疆' 133 | ], 134 | splitLine: {show: true} 135 | } 136 | ], 137 | yAxis: [ 138 | { 139 | type: 'value', 140 | name: '' 141 | } 142 | ], 143 | series: [ 144 | {name: '岗位需求量', type: 'bar'}, 145 | {name: '平均薪资', type: 'bar'}, 146 | {name: '最高薪资', type: 'bar'}, 147 | {name: '最低薪资', type: 'bar'}, 148 | {name: '本科及以上员工比例', type: 'bar'}, 149 | {name: '工作经验不限的比例', type: 'bar'}, 150 | ] 151 | }, 152 | options: [ 153 | { 154 | title: {text: '2018-4-java'}, 155 | series: [ 156 | {data: dataMap.javadata1['2018']},// menu4 157 | {data: dataMap.javadata2['2018']},// menu5 158 | {data: dataMap.javadata3['2018']},//menu6 159 | {data: dataMap.javadata4['2018']},// menu1 160 | {data: dataMap.javadata5['2018']},// menu2 161 | {data: dataMap.javadata6['2018']},// menu3 162 | ] 163 | }, 164 | { 165 | title: {text: '2018-4-C#'}, 166 | series: [ 167 | {data: dataMap.cdata1['2018']},// menu4 168 | {data: dataMap.cdata2['2018']},// menu5 169 | {data: dataMap.cdata3['2018']},//menu6 170 | {data: 
dataMap.cdata4['2018']},// menu1 171 | {data: dataMap.cdata5['2018']},// menu2 172 | {data: dataMap.cdata6['2018']},// menu3 173 | ] 174 | }, 175 | { 176 | title: {text: '2018-4-linux'}, 177 | series: [ 178 | {data: dataMap.linuxdata1['2018']},// menu4 179 | {data: dataMap.linuxdata2['2018']},// menu5 180 | {data: dataMap.linuxdata3['2018']},//menu6 181 | {data: dataMap.linuxdata4['2018']},// menu1 182 | {data: dataMap.linuxdata5['2018']},// menu2 183 | {data: dataMap.linuxdata6['2018']},// menu3 184 | ] 185 | }, 186 | { 187 | title: {text: '2018-4-python'}, 188 | series: [ 189 | {data: dataMap.pythondata1['2018']},// menu4 190 | {data: dataMap.pythondata2['2018']},// menu5 191 | {data: dataMap.pythondata3['2018']},//menu6 192 | {data: dataMap.pythondata4['2018']},// menu1 193 | {data: dataMap.pythondata5['2018']},// menu2 194 | {data: dataMap.pythondata6['2018']},// menu3 195 | ] 196 | }, 197 | { 198 | title: {text: '2018-4-web'}, 199 | series: [ 200 | {data: dataMap.webdata1['2018']},// menu4 201 | {data: dataMap.webdata2['2018']},// menu5 202 | {data: dataMap.webdata3['2018']},//menu6 203 | {data: dataMap.webdata4['2018']},// menu1 204 | {data: dataMap.webdata5['2018']},// menu2 205 | {data: dataMap.webdata6['2018']},// menu3 206 | ] 207 | }, 208 | { 209 | title: {text: '2018-4-C++'}, 210 | series: [ 211 | {data: dataMap.cppdata1['2018']},// menu4 212 | {data: dataMap.cppdata2['2018']},// menu5 213 | {data: dataMap.cppdata3['2018']},//menu6 214 | {data: dataMap.cppdata4['2018']},// menu1 215 | {data: dataMap.cppdata5['2018']},// menu2 216 | {data: dataMap.cppdata6['2018']},// menu3 217 | ] 218 | }, 219 | { 220 | title: {text: '2018-4-Android'}, 221 | series: [ 222 | {data: dataMap.androiddata1['2018']},// menu4 223 | {data: dataMap.androiddata2['2018']},// menu5 224 | {data: dataMap.androiddata3['2018']},//menu6 225 | {data: dataMap.androiddata4['2018']},// menu1 226 | {data: dataMap.androiddata5['2018']},// menu2 227 | {data: dataMap.androiddata6['2018']},// menu3 228 | ] 229 | } 230 | ] 231 | }; 232 | $(function () { 233 | chartOutChar = echarts.init(document.getElementById('showChart')); 234 | chartOutChar.setOption(option); 235 | 236 | }); 237 | 238 | // dispatchAction({ 239 | // type: 'legendSelect', 240 | // // 图例名称 241 | // name: string 242 | // }) 243 | -------------------------------------------------------------------------------- /result/模块一_各大省会招聘概况/数据可视化-2018-4-26/view/view1.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 各大省会招聘概况 6 | 7 | 9 | 11 | 30 | 31 | 32 | 33 |

各大省会招聘概况

34 |
35 | 36 |
37 | 38 | -------------------------------------------------------------------------------- /result/模块一_各大省会招聘概况/数据清洗阶段-2018-4-26/清洗过程.doc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/radishT/Job_Analysis/24a595ac551756279487c8a6d9f5e8bd03ce4465/result/模块一_各大省会招聘概况/数据清洗阶段-2018-4-26/清洗过程.doc -------------------------------------------------------------------------------- /result/模块二_各大编程语言的工作能力成熟度分析/file: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/radishT/Job_Analysis/24a595ac551756279487c8a6d9f5e8bd03ce4465/result/模块二_各大编程语言的工作能力成熟度分析/file -------------------------------------------------------------------------------- /result/模块二_各大编程语言的工作能力成熟度分析/数据可视化-2018-4-27/view/dynamic/EChartsDemo.war: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/radishT/Job_Analysis/24a595ac551756279487c8a6d9f5e8bd03ce4465/result/模块二_各大编程语言的工作能力成熟度分析/数据可视化-2018-4-27/view/dynamic/EChartsDemo.war -------------------------------------------------------------------------------- /result/模块二_各大编程语言的工作能力成熟度分析/数据可视化-2018-4-27/view/static/android.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 88 | 89 | 90 |
91 | 92 | 93 | 94 | 95 | -------------------------------------------------------------------------------- /result/模块二_各大编程语言的工作能力成熟度分析/数据可视化-2018-4-27/view/static/auto_scroll.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 职业技能分析图 9 | 10 | 11 |
12 |
13 |
14 | 61 |
62 |
63 |
64 | 65 | -------------------------------------------------------------------------------- /result/模块二_各大编程语言的工作能力成熟度分析/数据可视化-2018-4-27/view/static/c#.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 88 | 89 | 90 |
91 | 92 | 93 | 94 | 95 | -------------------------------------------------------------------------------- /result/模块二_各大编程语言的工作能力成熟度分析/数据可视化-2018-4-27/view/static/c++.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 88 | 89 | 90 |
91 | 92 | 93 | 94 | 95 | -------------------------------------------------------------------------------- /result/模块二_各大编程语言的工作能力成熟度分析/数据可视化-2018-4-27/view/static/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 15 | 职业技能分析图 16 | 17 | 18 |
19 |
20 | 21 |
22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 |
33 |
34 | 35 | -------------------------------------------------------------------------------- /result/模块二_各大编程语言的工作能力成熟度分析/数据可视化-2018-4-27/view/static/java.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 88 | 89 | 90 |
91 | 92 | 93 | 94 | 95 | -------------------------------------------------------------------------------- /result/模块二_各大编程语言的工作能力成熟度分析/数据可视化-2018-4-27/view/static/linux.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 88 | 89 | 90 |
91 | 92 | 93 | 94 | 95 | -------------------------------------------------------------------------------- /result/模块二_各大编程语言的工作能力成熟度分析/数据可视化-2018-4-27/view/static/python.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 88 | 89 | 90 |
91 | 92 | 93 | 94 | 95 | -------------------------------------------------------------------------------- /result/模块二_各大编程语言的工作能力成熟度分析/数据可视化-2018-4-27/view/static/web.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 88 | 89 | 90 |
91 | 92 | 93 | 94 | 95 | -------------------------------------------------------------------------------- /result/模块二_各大编程语言的工作能力成熟度分析/数据清洗阶段-2018-4-27/清洗后_key_map.sql/lagou_export/c#.txt: -------------------------------------------------------------------------------- 1 | year 2 2 | yeas 4 3 | interacting 1 4 | ios 12 5 | remote 1 6 | building 6 7 | pick 1 8 | xml 79 9 | visual 60 10 | understanding 2 11 | diagnostic 1 12 | audience 1 13 | solid 1 14 | zookeeper 2 15 | dealing 1 16 | devices 2 17 | excellent 1 18 | raw 1 19 | google 2 20 | optional 1 21 | analysis 4 22 | ktv 1 23 | cdn 1 24 | lighting 1 25 | standards 2 26 | innovations 2 27 | linq 29 28 | andorid 1 29 | science 2 30 | metrics 1 31 | tfs 6 32 | providing 1 33 | restful 11 34 | cef 5 35 | least 3 36 | manual 1 37 | ceo 2 38 | cet 1 39 | procedures 2 40 | aggressive 1 41 | mssql 34 42 | interbase 1 43 | b 104 44 | website 1 45 | c 1064 46 | learn 2 47 | ooxml 1 48 | accessories 2 49 | i 5 50 | k 6 51 | l 1 52 | vlc 1 53 | knockoutjs 1 54 | n 2 55 | abp 1 56 | o 6 57 | singalr 1 58 | p 1 59 | s 220 60 | t 1 61 | u 7 62 | modbus 2 63 | qualities 1 64 | y 1 65 | information 1 66 | routines 1 67 | standard 2 68 | reports 2 69 | msdn 1 70 | good 12 71 | specifications 2 72 | deploy 2 73 | webservcie 1 74 | spring 5 75 | pad 1 76 | post 1 77 | dapper 7 78 | implement 1 79 | finish 1 80 | nhibenate 2 81 | protocols 1 82 | others 1 83 | its 1 84 | gps 3 85 | arcore 1 86 | ado 1 87 | tests 1 88 | shell 3 89 | guideline 1 90 | http 56 91 | technical 3 92 | proactive 3 93 | nodejs 3 94 | prevent 1 95 | ehchache 1 96 | functionalities 1 97 | provides 3 98 | webforms 2 99 | responsible 3 100 | jetty 1 101 | winformsocket 2 102 | afc 1 103 | effectiveness 1 104 | practicing 1 105 | perform 1 106 | mobile 3 107 | multiple 2 108 | prewritten 1 109 | powerbi 1 110 | powerdesigner 3 111 | pda 2 112 | improvement 1 113 | ivy 4 114 | windowsforms 1 115 | activex 2 116 | environment 1 117 | myql 1 118 | vpn 1 119 | service 31 120 | handling 1 121 | requirejs 1 122 | pdt 1 123 | tomcat 2 124 | focus 1 125 | pkpm 2 126 | years 6 127 | qualifications 4 128 | write 5 129 | flow 2 130 | hololens 1 131 | cli 2 132 | testing 7 133 | clr 3 134 | understand 1 135 | script 7 136 | aeon 1 137 | silverlight 7 138 | angular 5 139 | system 1 140 | windowsce 1 141 | kafka 2 142 | analyze 1 143 | openxml 1 144 | mysq 1 145 | owin 2 146 | gui 4 147 | razor 1 148 | presenting 1 149 | fundamentals 1 150 | algorithms 1 151 | tob 1 152 | against 1 153 | functionality 1 154 | local 1 155 | rails 1 156 | echarts 1 157 | vss 9 158 | deployment 1 159 | erp 41 160 | product 7 161 | robust 1 162 | television 1 163 | spirit 1 164 | produce 2 165 | prototype 1 166 | javasctipt 1 167 | tpl 1 168 | framework 85 169 | php 6 170 | rose 2 171 | entity 21 172 | com 11 173 | environments 1 174 | vue 5 175 | screen 1 176 | employee 1 177 | windorm 1 178 | mode 1 179 | automation 8 180 | optimize 1 181 | sqlsever 1 182 | etc 4 183 | enhancements 1 184 | websocket 7 185 | visualize 1 186 | eth 1 187 | ffmpeg 3 188 | net 514 189 | etl 1 190 | verification 1 191 | new 11 192 | including 5 193 | read 3 194 | already 1 195 | snomed 1 196 | less 2 197 | unit 1 198 | improve 1 199 | basic 4 200 | unix 6 201 | outing 2 202 | financial 1 203 | jquery 129 204 | pki 1 205 | vsto 3 206 | unity 5 207 | weui 2 208 | and 1 209 | winsock 5 210 | design 11 211 | soapui 1 212 | working 6 213 | mongodb 19 214 | crm 10 215 | magento 1 216 | rpc 1 217 | efcore 1 218 | plc 4 219 | ant 4 220 | quartz 2 
221 | contributing 1 222 | lamp 1 223 | requirement 2 224 | sqlit 1 225 | application 7 226 | spoken 4 227 | msmq 1 228 | xslt 6 229 | maintenance 3 230 | wall 1 231 | forms 1 232 | wininet 4 233 | expanding 1 234 | css 123 235 | aop 3 236 | winserver 1 237 | maintain 3 238 | establishing 1 239 | orcle 1 240 | nhibernate 12 241 | advertising 2 242 | ajax 105 243 | professional 2 244 | skills 15 245 | pmp 3 246 | java 55 247 | cordova 1 248 | pacs 1 249 | jave 1 250 | pms 1 251 | english 9 252 | optimizing 1 253 | api 57 254 | nhiberate 1 255 | fully 1 256 | cto 2 257 | app 20 258 | ext 1 259 | using 4 260 | cache 1 261 | xamarin 3 262 | investigating 1 263 | mircosoft 1 264 | javascript 149 265 | nosql 19 266 | geek 1 267 | ooad 1 268 | creating 2 269 | alternatives 1 270 | directx 1 271 | memcached 5 272 | document 1 273 | adonet 1 274 | hibernate 1 275 | cookies 3 276 | aspnet 1 277 | routing 1 278 | knowledgeable 1 279 | vendor 1 280 | does 1 281 | kinect 1 282 | netbpm 2 283 | popular 1 284 | arm 1 285 | perforce 5 286 | cvs 1 287 | winfom 1 288 | boostrap 1 289 | forecasting 1 290 | team 10 291 | services 3 292 | rtp 1 293 | automated 1 294 | mogondb 1 295 | classic 1 296 | ppt 3 297 | asp 13 298 | medical 1 299 | engagement 1 300 | documents 1 301 | developing 2 302 | run 1 303 | unittest 1 304 | incorporate 1 305 | microsoft 19 306 | research 2 307 | restfulapi 1 308 | lifecycle 1 309 | features 2 310 | view 2 311 | white 1 312 | lis 2 313 | atl 1 314 | atm 5 315 | results 3 316 | hooking 4 317 | tech 1 318 | sqlite 13 319 | enity 1 320 | verbal 2 321 | engineering 5 322 | develop 5 323 | sharing 1 324 | technologies 6 325 | colleagues 1 326 | designs 1 327 | compatibility 1 328 | dotnetty 1 329 | next 2 330 | android 14 331 | edition 1 332 | video 1 333 | beijing 1 334 | solidworks 1 335 | winform 191 336 | mono 2 337 | not 1 338 | sqlserver 104 339 | documenting 1 340 | debugging 5 341 | hadoop 1 342 | javascprit 1 343 | news 4 344 | center 2 345 | wap 2 346 | windows 138 347 | engineer 4 348 | anomalies 1 349 | remoting 3 350 | xhtml 4 351 | manage 1 352 | grey 1 353 | guidance 1 354 | risk 1 355 | window 1 356 | finance 1 357 | applications 1 358 | studio 56 359 | play 1 360 | leader 7 361 | developers 1 362 | palm 1 363 | quickly 1 364 | interactive 1 365 | extracting 1 366 | program 5 367 | when 1 368 | required 4 369 | redis 40 370 | proficiency 3 371 | multi 1 372 | jit 1 373 | wcf 77 374 | plan 1 375 | case 1 376 | hardware 2 377 | insightful 1 378 | leveldb 1 379 | philosophy 1 380 | xunit 1 381 | creative 1 382 | npoi 1 383 | socekt 1 384 | feasibility 1 385 | provide 1 386 | opencascade 1 387 | phone 3 388 | style 1 389 | centos 1 390 | boss 1 391 | dotnet 4 392 | angela 1 393 | log 1 394 | enterprise 1 395 | methods 1 396 | testable 2 397 | bugzilla 4 398 | smart 3 399 | controltemplate 1 400 | webserivce 2 401 | computer 3 402 | premium 2 403 | web 207 404 | entityframwork 1 405 | phases 1 406 | enables 1 407 | efficient 2 408 | cosmos 1 409 | opengl 12 410 | reactjs 1 411 | shopify 1 412 | organization 1 413 | webservice 61 414 | levels 1 415 | wfp 1 416 | autofac 3 417 | used 1 418 | experience 49 419 | cloud 1 420 | protocol 1 421 | responsibilities 5 422 | sliverlight 1 423 | group 1 424 | fixing 2 425 | vxworks 1 426 | servicefabric 1 427 | webvr 1 428 | eclipse 1 429 | daily 3 430 | job 6 431 | dhtml 3 432 | soap 2 433 | itil 1 434 | udp 14 435 | contribute 1 436 | webkit 9 437 | candidate 1 438 | database 2 439 | bacnet 1 440 | ltc 1 441 | win 6 442 | designed 2 
443 | rest 2 444 | process 3 445 | requirements 4 446 | chromium 1 447 | debug 3 448 | ddd 4 449 | designer 1 450 | easyui 15 451 | third 1 452 | build 1 453 | visio 3 454 | jqeury 1 455 | lua 1 456 | vuejs 1 457 | jqueryui 1 458 | further 1 459 | user 1 460 | opencv 4 461 | methodology 1 462 | webservices 1 463 | mostly 1 464 | projects 2 465 | emgucv 1 466 | videos 1 467 | executing 2 468 | conducting 1 469 | evaluation 1 470 | dev 9 471 | finds 1 472 | fix 1 473 | bat 2 474 | complex 2 475 | manufacturing 1 476 | knowledge 26 477 | databases 1 478 | documentation 3 479 | ai 3 480 | opportunity 1 481 | engineers 1 482 | personal 1 483 | rtsp 1 484 | javascipt 1 485 | senior 3 486 | bbq 4 487 | ar 11 488 | profiling 1 489 | plans 2 490 | moxa 1 491 | webar 1 492 | looking 2 493 | sdk 11 494 | agile 3 495 | drive 1 496 | attitude 1 497 | strong 24 498 | supermap 2 499 | prototyping 1 500 | bi 2 501 | dubbo 1 502 | skyline 1 503 | coding 3 504 | bs 19 505 | responsibility 2 506 | oculus 1 507 | wms 3 508 | embedded 1 509 | cc 2 510 | innovation 1 511 | rdbms 3 512 | business 4 513 | operational 1 514 | familiar 6 515 | bing 2 516 | integrate 1 517 | cs 13 518 | ct 1 519 | winforms 6 520 | cv 1 521 | partner 1 522 | wpfwinformsilverlight 1 523 | monogodb 1 524 | printstudio 1 525 | db 4 526 | added 1 527 | terabytes 1 528 | arkit 1 529 | language 3 530 | sites 2 531 | interpersonal 3 532 | div 23 533 | aspnetcore 2 534 | reply 1 535 | workbench 1 536 | ea 1 537 | programming 2 538 | info 1 539 | ef 29 540 | test 21 541 | chrome 1 542 | wpf 196 543 | restapi 1 544 | es 1 545 | budgets 1 546 | nunit 2 547 | scrum 2 548 | helps 1 549 | party 6 550 | identifying 1 551 | analytical 1 552 | session 4 553 | golang 2 554 | capable 1 555 | mach 3 556 | yui 1 557 | ft 1 558 | desktop 1 559 | related 2 560 | uml 14 561 | skill 7 562 | await 2 563 | json 38 564 | client 1 565 | gc 3 566 | views 1 567 | reporting 1 568 | knockout 2 569 | billions 1 570 | dll 6 571 | custom 1 572 | asia 1 573 | existing 6 574 | form 10 575 | hub 1 576 | management 3 577 | soket 2 578 | myssql 2 579 | big 2 580 | expert 1 581 | advanced 1 582 | windowsserver 1 583 | improves 1 584 | bim 1 585 | hr 3 586 | invoke 1 587 | halcon 1 588 | hook 2 589 | elasticsearch 3 590 | wss 10 591 | sqllite 3 592 | ia 1 593 | model 1 594 | https 4 595 | prism 2 596 | python 17 597 | il 1 598 | large 2 599 | issue 3 600 | cookie 1 601 | surface 1 602 | im 1 603 | maturity 1 604 | io 3 605 | ip 39 606 | sense 1 607 | wtl 1 608 | slg 1 609 | certification 1 610 | sli 1 611 | traditional 1 612 | flatbuffers 1 613 | field 1 614 | contents 1 615 | wtt 1 616 | slo 1 617 | doc 1 618 | status 1 619 | server 193 620 | clients 1 621 | works 1 622 | dom 12 623 | js 40 624 | tailor 1 625 | thinking 1 626 | products 2 627 | ehcache 1 628 | world 1 629 | jenkins 3 630 | orcal 2 631 | ability 3 632 | together 1 633 | creator 1 634 | ftp 2 635 | may 2 636 | orcale 2 637 | fastreport 1 638 | weblogic 2 639 | willingness 1 640 | health 1 641 | trigger 1 642 | rabbitmq 6 643 | hessian 1 644 | mbp 1 645 | oral 1 646 | complete 1 647 | powerdesign 1 648 | soa 11 649 | webgl 3 650 | usb 3 651 | wwf 2 652 | main 1 653 | serve 3 654 | office 7 655 | supersocket 1 656 | soso 1 657 | swagger 1 658 | ihistorian 1 659 | high 2 660 | solution 1 661 | reviews 1 662 | oracl 1 663 | continuous 2 664 | communication 11 665 | different 2 666 | nsis 1 667 | winrt 1 668 | workflow 6 669 | mq 1 670 | nginx 1 671 | mr 2 672 | ms 27 673 | follows 1 674 | spe 1 675 | photoshop 1 
676 | experiences 1 677 | plus 9 678 | task 2 679 | nb 1 680 | orleans 1 681 | position 2 682 | angularjs 10 683 | problems 1 684 | shader 2 685 | no 1 686 | bom 1 687 | code 7 688 | passionate 1 689 | blend 3 690 | box 3 691 | storage 1 692 | demo 3 693 | glsl 1 694 | sql 397 695 | mef 3 696 | qml 1 697 | oa 5 698 | lamada 1 699 | codefirst 1 700 | postgresql 4 701 | consumers 1 702 | mes 36 703 | ok 2 704 | met 1 705 | experienced 1 706 | highly 10 707 | oo 3 708 | delphi 2 709 | execution 2 710 | processes 2 711 | or 1 712 | initiative 1 713 | determine 1 714 | mfc 6 715 | thread 3 716 | structure 1 717 | ruby 2 718 | master 4 719 | windws 1 720 | accountabilities 2 721 | pc 17 722 | pd 2 723 | winfrom 6 724 | gdal 1 725 | online 4 726 | socket 74 727 | pl 5 728 | httprestfull 1 729 | curd 1 730 | sketchup 1 731 | mangodb 1 732 | discipline 1 733 | ssl 1 734 | uwp 5 735 | objective 1 736 | qa 1 737 | dynamics 5 738 | lambda 1 739 | mvvm 28 740 | devops 1 741 | makefile 4 742 | band 1 743 | goals 1 744 | qq 1 745 | based 2 746 | github 3 747 | qt 8 748 | closely 1 749 | brt 2 750 | stl 2 751 | fluent 2 752 | quality 6 753 | concepts 1 754 | rf 2 755 | cassandra 3 756 | processing 1 757 | device 2 758 | websphere 2 759 | components 2 760 | functional 1 761 | access 5 762 | fundamental 2 763 | industry 4 764 | global 2 765 | jmeter 1 766 | btc 1 767 | josn 1 768 | current 2 769 | xpath 6 770 | mis 4 771 | datatemplate 1 772 | tcpip 3 773 | operating 1 774 | so 2 775 | unreal 1 776 | sp 2 777 | holes 1 778 | key 2 779 | ss 1 780 | activemq 2 781 | expression 1 782 | rocketmq 1 783 | one 2 784 | svn 33 785 | designing 2 786 | releases 1 787 | xaml 16 788 | extensive 1 789 | aiax 2 790 | bug 37 791 | teamwork 5 792 | assist 3 793 | troubleshooting 1 794 | rfid 4 795 | ooa 5 796 | protobuf 1 797 | tv 1 798 | jsonp 1 799 | ood 9 800 | willing 3 801 | project 2 802 | express 2 803 | webapi 38 804 | icd 1 805 | nopcommerce 1 806 | qss 1 807 | oop 15 808 | ui 29 809 | sever 8 810 | monetize 1 811 | romting 2 812 | mysql 168 813 | written 6 814 | thinkpad 1 815 | passion 1 816 | navisworks 1 817 | oracle 190 818 | ensure 2 819 | opc 3 820 | solutions 1 821 | degree 2 822 | ide 5 823 | vb 4 824 | vc 8 825 | assurance 1 826 | vf 1 827 | demonstrate 1 828 | vr 8 829 | vs 15 830 | support 5 831 | mongo 2 832 | implements 1 833 | needed 1 834 | learning 1 835 | inventory 1 836 | scada 2 837 | autocad 2 838 | devexpress 18 839 | docker 5 840 | wf 2 841 | odata 1 842 | datagridview 2 843 | linux 36 844 | ionic 1 845 | ws 1 846 | hbase 2 847 | coverage 1 848 | cases 5 849 | reading 2 850 | orm 23 851 | technology 2 852 | informatics 1 853 | webform 19 854 | extjs 5 855 | windowsform 1 856 | bentley 1 857 | bash 1 858 | relational 1 859 | osg 1 860 | software 18 861 | frameworks 1 862 | gcc 1 863 | binding 1 864 | react 2 865 | desirable 4 866 | moq 1 867 | relation 1 868 | feedback 1 869 | problem 3 870 | review 10 871 | premises 1 872 | freewheel 3 873 | azure 7 874 | wifi 2 875 | teams 2 876 | work 7 877 | visualstudio 4 878 | focusing 1 879 | gdi 7 880 | ott 1 881 | samples 1 882 | innovative 1 883 | vba 4 884 | saas 2 885 | memcahced 1 886 | fabric 1 887 | struts 1 888 | following 1 889 | ifix 1 890 | word 1 891 | internal 1 892 | architect 1 893 | enjoy 1 894 | sharepoint 15 895 | requests 1 896 | bootstrap 28 897 | foundation 2 898 | codeplex 1 899 | tools 3 900 | across 1 901 | iis 11 902 | feature 1 903 | writing 2 904 | collective 1 905 | seajs 2 906 | power 2 907 | firmware 1 908 | include 1 909 
| netcore 2 910 | netframework 1 911 | dicom 4 912 | nice 3 913 | dotnetcore 1 914 | token 1 915 | excel 2 916 | help 1 917 | oracel 1 918 | htmlcssjavascript 1 919 | revit 2 920 | threading 1 921 | minimum 2 922 | architectural 1 923 | first 1 924 | wince 2 925 | data 3 926 | entityframework 11 927 | vuew 1 928 | spec 2 929 | create 2 930 | html 109 931 | memcache 7 932 | matlab 1 933 | development 27 934 | ubuntu 2 935 | maven 2 936 | cmmi 3 937 | mining 1 938 | core 22 939 | qualification 1 940 | onpremises 1 941 | emgu 2 942 | dash 1 943 | solving 3 944 | arcgis 7 945 | configuration 1 946 | cae 1 947 | cad 3 948 | mybatis 2 949 | scale 1 950 | orscle 1 951 | platform 2 952 | gis 18 953 | git 21 954 | operations 1 955 | flex 2 956 | mvc 165 957 | player 1 958 | tcp 56 959 | memorycache 1 960 | will 3 961 | mvp 7 962 | sourcesafe 1 963 | implementation 2 964 | command 1 965 | async 2 966 | enforces 1 967 | enforcer 1 968 | cbs 1 969 | tdd 2 970 | devexpre 1 971 | compliance 1 972 | efw 1 973 | ioc 6 974 | -------------------------------------------------------------------------------- /result/模块二_各大编程语言的工作能力成熟度分析/数据清洗阶段-2018-4-27/清洗后_key_map.sql/lagou_export/python.txt: -------------------------------------------------------------------------------- 1 | stack 2 2 | mentor 2 3 | greenlet 1 4 | theano 2 5 | ios 7 6 | icmp 1 7 | without 1 8 | offer 3 9 | fit 1 10 | bat 1 11 | xml 2 12 | rabbit 1 13 | understanding 1 14 | visual 1 15 | lxml 2 16 | near 1 17 | vim 11 18 | uwsgi 3 19 | nova 1 20 | knowledge 1 21 | databases 1 22 | zookeeper 9 23 | excellent 1 24 | ai 6 25 | lvs 5 26 | google 6 27 | infra 1 28 | gunicorn 5 29 | cdn 27 30 | am 1 31 | easy 1 32 | sqlalchemy 13 33 | grpc 1 34 | linq 1 35 | flexible 1 36 | turning 1 37 | science 2 38 | detail 1 39 | agile 1 40 | sdk 1 41 | sdn 3 42 | pony 1 43 | ba 2 44 | schema 1 45 | strong 1 46 | restful 30 47 | bf 1 48 | bi 1 49 | least 3 50 | dubbo 1 51 | bottle 3 52 | ceo 4 53 | coding 4 54 | adaptive 1 55 | search 1 56 | selenium 5 57 | spark 11 58 | systems 2 59 | responsibility 2 60 | beautifulsoup 3 61 | apscheduler 1 62 | systemtap 1 63 | seo 1 64 | querytype 1 65 | innovation 1 66 | cd 2 67 | b 5 68 | kvm 4 69 | c 117 70 | bdd 1 71 | salt 1 72 | ci 4 73 | familiar 1 74 | l 9 75 | practices 1 76 | logical 1 77 | resiful 1 78 | flv 4 79 | qian 2 80 | r 2 81 | rdd 1 82 | s 5 83 | fast 2 84 | u 16 85 | pylint 1 86 | imac 1 87 | cloudfoundry 2 88 | haproxy 1 89 | timeline 1 90 | db 5 91 | gvent 1 92 | rds 2 93 | cgi 1 94 | nltk 1 95 | language 1 96 | acm 2 97 | good 3 98 | spring 7 99 | acid 1 100 | div 2 101 | act 1 102 | kivy 1 103 | urllib 2 104 | multithreading 1 105 | radius 1 106 | smaug 1 107 | programming 2 108 | jvm 1 109 | ef 1 110 | zabbix 4 111 | cinder 1 112 | elk 4 113 | quorum 1 114 | restframework 1 115 | dynamodb 2 116 | druid 1 117 | adx 1 118 | shell 58 119 | freezer 1 120 | http 57 121 | scrum 5 122 | wps 1 123 | icpc 1 124 | deep 2 125 | technical 1 126 | golang 17 127 | nodejs 7 128 | getopenid 1 129 | related 3 130 | uml 2 131 | mongdb 2 132 | responsible 1 133 | vagrant 1 134 | pcl 1 135 | json 1 136 | company 2 137 | jetty 1 138 | oauth 3 139 | ge 1 140 | owner 2 141 | go 30 142 | mobile 2 143 | multiple 1 144 | grand 2 145 | home 2 146 | schedule 1 147 | environment 2 148 | micro 1 149 | service 1 150 | heritrix 1 151 | pyramid 1 152 | hc 1 153 | splash 1 154 | httpclient 2 155 | agg 1 156 | tomcat 10 157 | hr 2 158 | changing 1 159 | years 3 160 | elasticsearch 10 161 | slack 1 162 | model 1 163 | https 10 164 | 
stackstorm 2 165 | flow 3 166 | tasks 1 167 | travelflan 2 168 | reduce 1 169 | cookie 1 170 | io 4 171 | ip 21 172 | testing 2 173 | understand 1 174 | dns 5 175 | handle 1 176 | mongokit 2 177 | cgroup 1 178 | script 1 179 | angular 3 180 | grafana 2 181 | strongly 1 182 | kafka 8 183 | javascrip 1 184 | gui 2 185 | messagequeue 3 186 | snapshot 1 187 | presto 1 188 | server 4 189 | familiarity 1 190 | js 17 191 | deliver 2 192 | mac 5 193 | products 1 194 | convnet 2 195 | library 1 196 | navicat 1 197 | rails 1 198 | jenkins 7 199 | member 1 200 | ability 1 201 | libevent 1 202 | tengine 4 203 | map 1 204 | macos 2 205 | implementations 1 206 | proficient 2 207 | scipy 3 208 | erp 5 209 | creator 1 210 | product 1 211 | within 1 212 | rhel 1 213 | wxpython 1 214 | torch 2 215 | kv 1 216 | rabbitmq 20 217 | url 1 218 | pandas 13 219 | framework 5 220 | able 2 221 | cmake 1 222 | php 33 223 | italkier 2 224 | opentsdb 1 225 | beego 1 226 | etcd 2 227 | improvements 1 228 | numy 1 229 | coo 1 230 | strdepartment 1 231 | soa 3 232 | lr 1 233 | use 1 234 | hashing 1 235 | vue 6 236 | trafficserver 3 237 | high 1 238 | automation 1 239 | optimize 1 240 | scarpy 1 241 | macbook 1 242 | websocket 1 243 | fastdfs 5 244 | traveflan 1 245 | etl 3 246 | communication 1 247 | net 2 248 | new 5 249 | realtime 1 250 | linix 1 251 | nginx 34 252 | mq 1 253 | mr 1 254 | spa 2 255 | juggle 1 256 | webgui 1 257 | pressure 1 258 | basis 1 259 | isappinstalled 1 260 | plus 1 261 | amazon 1 262 | ioloop 1 263 | spm 4 264 | apis 1 265 | typing 1 266 | gitlab 3 267 | nfs 1 268 | angularjs 4 269 | lbs 1 270 | unix 35 271 | supervisor 2 272 | nfv 1 273 | dsp 1 274 | code 7 275 | pythonic 2 276 | passionate 1 277 | fasting 1 278 | demo 1 279 | jquery 14 280 | sql 51 281 | libvirt 2 282 | grunt 1 283 | effective 1 284 | oa 2 285 | postgresql 38 286 | and 1 287 | design 1 288 | ros 1 289 | pyqt 3 290 | working 1 291 | backend 1 292 | teamcity 1 293 | mongodb 97 294 | apachengnix 1 295 | crm 9 296 | oo 5 297 | qfusion 1 298 | or 1 299 | initiative 2 300 | nonsql 1 301 | rpc 3 302 | os 4 303 | bachelor 1 304 | ruby 8 305 | master 1 306 | pc 3 307 | conduct 1 308 | pg 2 309 | opensource 2 310 | socket 7 311 | scikit 1 312 | pl 1 313 | selenuim 1 314 | pyquery 1 315 | pm 1 316 | coming 1 317 | apsaradb 2 318 | css 51 319 | aop 1 320 | nigix 1 321 | csv 2 322 | ssh 2 323 | nhibernate 1 324 | py 1 325 | ajax 14 326 | constraints 1 327 | ssl 2 328 | htmlparser 1 329 | professional 1 330 | patch 3 331 | skills 1 332 | qa 2 333 | wireshark 2 334 | java 91 335 | speaking 1 336 | mvvm 4 337 | ctf 2 338 | devops 20 339 | okr 1 340 | english 2 341 | api 44 342 | state 1 343 | nix 2 344 | defined 1 345 | cto 1 346 | stackoverflow 5 347 | app 22 348 | based 1 349 | github 26 350 | cache 3 351 | qt 6 352 | openresty 1 353 | pgsql 1 354 | javascript 54 355 | nosql 52 356 | echoing 1 357 | quality 2 358 | node 1 359 | bsd 3 360 | groovy 2 361 | cassandra 5 362 | matplotlib 3 363 | rancher 1 364 | difference 1 365 | flask 118 366 | memcached 5 367 | ironic 1 368 | cmdb 6 369 | odoo 16 370 | document 1 371 | numpy 11 372 | hibernate 2 373 | two 1 374 | ansible 12 375 | matching 1 376 | xpath 10 377 | pregresql 1 378 | celery 18 379 | releasing 1 380 | desired 1 381 | djando 1 382 | thrift 1 383 | pytorch 1 384 | mit 3 385 | tcpip 1 386 | backbone 2 387 | webserver 5 388 | so 1 389 | dtrace 1 390 | postgres 3 391 | arp 1 392 | activemq 3 393 | gitflow 1 394 | storm 3 395 | necessary 1 396 | languages 1 397 | star 1 398 | one 1 399 | 
svn 7 400 | team 2 401 | services 2 402 | svm 2 403 | openerp 15 404 | opnfv 1 405 | tb 2 406 | rtp 2 407 | jdk 5 408 | pull 2 409 | nlp 2 410 | ppt 1 411 | bug 8 412 | sanic 1 413 | troubleshooting 2 414 | kubernetes 12 415 | bus 1 416 | ooc 1 417 | agent 1 418 | ood 1 419 | qemu 2 420 | express 1 421 | multiprocessing 1 422 | pairs 1 423 | restfulapi 1 424 | oop 3 425 | features 1 426 | apache 13 427 | ui 5 428 | sever 1 429 | things 1 430 | keras 2 431 | mysql 183 432 | keyword 1 433 | connextion 1 434 | yaml 1 435 | surprise 1 436 | oracle 22 437 | boosting 1 438 | sqlite 4 439 | marathon 1 440 | solutions 1 441 | query 1 442 | batch 1 443 | degree 2 444 | idc 1 445 | engineering 1 446 | sharing 1 447 | pro 1 448 | twistd 1 449 | technologies 3 450 | improving 1 451 | timelines 1 452 | angualrjs 1 453 | consul 1 454 | dbaas 1 455 | vs 1 456 | internet 1 457 | sublime 1 458 | full 1 459 | mongo 6 460 | android 2 461 | sklearn 1 462 | tcpdump 1 463 | learning 5 464 | autocad 2 465 | docker 40 466 | mesos 2 467 | sqlserver 4 468 | linux 260 469 | debugging 1 470 | hadoop 19 471 | boot 1 472 | samba 1 473 | icbu 2 474 | hbase 14 475 | netty 3 476 | delivery 1 477 | webpy 2 478 | green 1 479 | cases 1 480 | paced 1 481 | ansimble 1 482 | varnish 3 483 | ssdb 1 484 | reading 3 485 | orm 6 486 | technology 1 487 | windows 3 488 | extjs 1 489 | xhtml 1 490 | asyncio 3 491 | awk 1 492 | money 1 493 | learner 1 494 | nutch 1 495 | hdfs 2 496 | bash 3 497 | time 1 498 | aws 5 499 | base 1 500 | studio 1 501 | leader 1 502 | cqrs 1 503 | software 4 504 | osi 1 505 | frameworks 1 506 | tripleo 1 507 | react 6 508 | whole 1 509 | influxdb 1 510 | redis 107 511 | required 1 512 | proficiency 1 513 | problem 1 514 | review 9 515 | gevent 4 516 | pyside 1 517 | azure 2 518 | xen 2 519 | cloudstack 1 520 | openapi 1 521 | work 3 522 | players 1 523 | caffe 4 524 | emacs 6 525 | scrapy 8 526 | insight 1 527 | jquer 1 528 | zk 1 529 | saas 14 530 | comfortable 1 531 | fabric 2 532 | struts 1 533 | following 2 534 | openstack 15 535 | word 1 536 | centos 4 537 | hive 5 538 | internal 1 539 | study 1 540 | alembic 1 541 | boss 2 542 | widget 2 543 | falcon 2 544 | bootstrap 7 545 | requests 5 546 | rtmp 4 547 | tensorflow 11 548 | computer 3 549 | feature 1 550 | web 239 551 | writing 1 552 | event 1 553 | djangorestframework 1 554 | uioc 1 555 | saltstack 10 556 | architecture 1 557 | tornado 86 558 | twisted 3 559 | consistent 1 560 | jsoup 3 561 | reactjs 1 562 | nice 1 563 | excel 1 564 | vsphere 1 565 | gfs 5 566 | fellow 1 567 | organized 1 568 | webservice 2 569 | trello 1 570 | pythonweb 2 571 | quick 2 572 | data 1 573 | own 1 574 | used 1 575 | blog 5 576 | experience 3 577 | cloud 3 578 | protocol 1 579 | teammates 1 580 | overflow 1 581 | ldap 2 582 | distributing 1 583 | flume 1 584 | dau 1 585 | html 51 586 | memcache 3 587 | potential 1 588 | jira 1 589 | matlab 2 590 | daocloud 8 591 | laravel 1 592 | development 6 593 | fixing 1 594 | eda 1 595 | ubuntu 5 596 | maven 2 597 | runtime 1 598 | pytho 1 599 | qualification 2 600 | pycharm 5 601 | geeeeek 1 602 | daily 1 603 | zeromq 3 604 | squid 3 605 | pyspider 5 606 | udp 1 607 | request 3 608 | solving 1 609 | arcgis 2 610 | scala 1 611 | line 1 612 | mybatis 5 613 | italki 6 614 | pyspark 2 615 | platform 2 616 | database 3 617 | git 62 618 | cap 1 619 | servers 1 620 | asynchronous 1 621 | mvc 12 622 | tcp 26 623 | rest 6 624 | sap 1 625 | debug 2 626 | phantomjs 1 627 | implementation 1 628 | assistant 2 629 | chef 2 630 | soda 1 631 | 
command 1 632 | hls 4 633 | puppet 5 634 | mariadb 1 635 | django 166 636 | performance 2 637 | swarm 1 638 | tdd 3 639 | visio 1 640 | cbu 2 641 | namespace 1 642 | challenge 1 643 | attention 1 644 | lua 3 645 | cassendra 1 646 | ngnix 4 647 | slash 2 648 | growth 2 649 | travis 1 650 | opencv 2 651 | -------------------------------------------------------------------------------- /result/模块二_各大编程语言的工作能力成熟度分析/数据清洗阶段-2018-4-27/清洗过程.doc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/radishT/Job_Analysis/24a595ac551756279487c8a6d9f5e8bd03ce4465/result/模块二_各大编程语言的工作能力成熟度分析/数据清洗阶段-2018-4-27/清洗过程.doc -------------------------------------------------------------------------------- /result/源文件/CrawlerApp-0.0.1-SNAPSHOT-javadoc.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/radishT/Job_Analysis/24a595ac551756279487c8a6d9f5e8bd03ce4465/result/源文件/CrawlerApp-0.0.1-SNAPSHOT-javadoc.jar -------------------------------------------------------------------------------- /result/源文件/CrawlerApp-0.0.1-SNAPSHOT-sources.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/radishT/Job_Analysis/24a595ac551756279487c8a6d9f5e8bd03ce4465/result/源文件/CrawlerApp-0.0.1-SNAPSHOT-sources.jar -------------------------------------------------------------------------------- /result/源文件/CrawlerApp-0.0.1-SNAPSHOT.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/radishT/Job_Analysis/24a595ac551756279487c8a6d9f5e8bd03ce4465/result/源文件/CrawlerApp-0.0.1-SNAPSHOT.jar -------------------------------------------------------------------------------- /result/项目演讲.ppt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/radishT/Job_Analysis/24a595ac551756279487c8a6d9f5e8bd03ce4465/result/项目演讲.ppt -------------------------------------------------------------------------------- /sources/hadoop.dll: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/radishT/Job_Analysis/24a595ac551756279487c8a6d9f5e8bd03ce4465/sources/hadoop.dll -------------------------------------------------------------------------------- /sources/winutils.exe: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/radishT/Job_Analysis/24a595ac551756279487c8a6d9f5e8bd03ce4465/sources/winutils.exe -------------------------------------------------------------------------------- /src/main/java/com/edmund/crawler/JobCrawler.java: -------------------------------------------------------------------------------- 1 | package com.edmund.crawler; 2 | 3 | import java.io.FileInputStream; 4 | import java.io.IOException; 5 | import java.util.ArrayList; 6 | import java.util.List; 7 | import java.util.Map; 8 | import java.util.Properties; 9 | 10 | import org.openqa.selenium.By; 11 | import org.openqa.selenium.WebElement; 12 | import org.openqa.selenium.chrome.ChromeDriver; 13 | import org.openqa.selenium.support.ui.ExpectedConditions; 14 | import org.openqa.selenium.support.ui.WebDriverWait; 15 | 16 | import com.edmund.utils.DBUtils; 17 | import com.edmund.vo.Job; 18 | 19 | /** 20 | * 用于职位信息爬取的爬虫类 21 | * 22 | * @author Edmund 23 | * 24 | */ 25 | public class JobCrawler { 26 | // private static String[] keys = { "java", 
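/*
 * The static block below resolves the properties file through the relative
 * path "./src/main/java/com/edmund/properties", so the crawler only finds
 * it when the working directory is the project root. A minimal sketch of a
 * location-independent variant -- assuming the file were moved to
 * src/main/resources so that Maven puts it on the classpath:
 *
 *   Properties p = new Properties();
 *   try (InputStream in = JobCrawler.class
 *           .getResourceAsStream("/com/edmund/properties")) {
 *       p.load(in);
 *   }
 */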
"python", "c++", "android", 27 | // "php" }; 28 | private static String[] keys = { "web" }; 29 | private static Map> infos = null; 30 | 31 | private static List cities = null; 32 | private static List roots = null; 33 | private static String localdriver = null; // 本地浏览器驱动位置 34 | private static String localexport = null; // 本地输出路径 35 | 36 | private static final int THREAD_NUMBER = 5; 37 | 38 | /** 39 | * 读取配置文件 40 | */ 41 | static { 42 | Properties property = new Properties(); 43 | try { 44 | property.load(new FileInputStream( 45 | "./src/main/java/com/edmund/properties")); 46 | } catch (IOException e) { 47 | e.printStackTrace(); 48 | } 49 | localdriver = property.getProperty("LocalChromedriver"); 50 | localexport = property.getProperty("LocalExportPath"); 51 | 52 | } 53 | 54 | public static void main(String[] args) { 55 | for (String strkey : keys) { 56 | initLists(strkey); 57 | } 58 | 59 | for (int i = 0; i < THREAD_NUMBER; i++) { 60 | new JobCrawler().new crawThread().start(); 61 | } 62 | } 63 | 64 | /** 65 | * 爬取数据的线程类 66 | * @author Edmund 67 | * 68 | */ 69 | class crawThread extends Thread { 70 | ChromeDriver driver = initBrowser(); 71 | 72 | @Override 73 | public void run() { 74 | while (true) { 75 | String[] urls = getURL(); 76 | if (urls == null) { 77 | break; 78 | } 79 | String key = whichKey(urls[1]); 80 | 81 | List jobs = null; 82 | try { 83 | jobs = crawJobs(urls, key, driver); 84 | } catch (Exception e) { 85 | pushIntoLists(urls); 86 | } 87 | DBUtils.writeToFile(jobs, 88 | localexport + "/" + key + "/" + this.getName() + "/" 89 | + urls[0] + "-" + key + "-info.txt"); 90 | } 91 | } 92 | } 93 | 94 | /** 95 | * 线程同步取url和city信息 96 | * @return urls[0]保存city,urls[1]保存url 97 | */ 98 | private synchronized static String[] getURL() { 99 | if (cities == null || cities.isEmpty()) { 100 | return null; 101 | } 102 | if (roots == null || roots.isEmpty()) { 103 | return null; 104 | } 105 | String[] urls = { cities.get(0), roots.get(0) }; 106 | cities.remove(0); 107 | roots.remove(0); 108 | 109 | return urls; 110 | } 111 | 112 | /** 113 | * 静态初始化职位信息,将所有信息加载到内存中 114 | * @param strkey 关键字 115 | */ 116 | private static void initLists(String strkey) { 117 | try { 118 | infos = DBUtils 119 | .readFromFile("./result-sources/EdmundDXu/files/emp.txt"); 120 | } catch (IOException e) { 121 | } 122 | List newroot = new ArrayList(); 123 | cities = infos.get("cities"); 124 | 125 | for (String root : infos.get("roots")) { 126 | newroot.add(root.replace("#", strkey)); 127 | } 128 | roots = newroot; 129 | } 130 | 131 | /** 132 | * 初始化浏览器驱动 133 | * @return 浏览器驱动对象 134 | */ 135 | private static ChromeDriver initBrowser() { 136 | System.setProperty("webdriver.chrome.driver", localdriver); 137 | ChromeDriver driver = new ChromeDriver(); 138 | return driver; 139 | } 140 | 141 | /** 142 | * 如果出现异常情况导致没有被其他catch语句捕获,就将该url重新加入列表中处理 143 | * @param url 144 | */ 145 | private synchronized static void pushIntoLists(String[] urls) { 146 | cities.add(urls[0]); 147 | roots.add(urls[1]); 148 | } 149 | 150 | /** 151 | * 根据url判断该url属于哪个关键字 152 | * @param url 153 | * @return 关键字 154 | */ 155 | private static String whichKey(String url) { 156 | for (String key : keys) { 157 | if (url.contains(key)) { 158 | return key; 159 | } 160 | } 161 | return null; 162 | } 163 | 164 | /** 165 | * 从指定根站点,以指定关键字开始爬取职位信息,多线程方式将职位信息逐条写入文件中 58同城 166 | * 该方法暂时废弃 167 | */ 168 | 169 | /** 170 | * 从指定根站点,以指定关键字开始爬取职位信息 58同城 171 | * @param urls 保存url和city信息的数组,urls[0]保存city,urls[1]保存url 172 | * @param key 需要爬取的关键字 173 | * @param driver 浏览器驱动对象 174 
| * @return 包含职位信息的列表 175 | */ 176 | public static List crawJobs(String[] urls, String key, 177 | ChromeDriver driver) { 178 | 179 | if (pretreatment(urls[1], driver) == -1) { 180 | return null; 181 | } 182 | 183 | List jobs = new ArrayList(); 184 | while (true) { 185 | WebElement list = driver.findElementById("list_con"); 186 | List positions = list.findElements(By.tagName("li")); 187 | for (WebElement webElement : positions) { 188 | // 出现此条语句表示下面的结果与搜索关键字无关,故直接抛弃下面的职位 189 | if (webElement.getAttribute("class").contains("noData")) { 190 | break; 191 | } 192 | jobs.add(createJobVo(webElement, urls[0], key)); 193 | } 194 | if (nextPage(driver) == -1) { 195 | break; 196 | } 197 | } 198 | return jobs; 199 | 200 | } 201 | 202 | /** 203 | * 在爬取数据之前做的预处理工作 204 | * @param url 需要爬取的url 205 | * @param driver 浏览器驱动对象 206 | * @return 0表示预处理正常,-1表示预处理失败 207 | */ 208 | private static int pretreatment(String url, ChromeDriver driver) { 209 | driver.get(url); 210 | // 最大化窗口 211 | // driver.manage().window().maximize(); 212 | 213 | WebDriverWait wait = new WebDriverWait(driver, 10); 214 | 215 | // 等待职位列表和分页列表加载完毕 216 | try { 217 | wait.until(ExpectedConditions 218 | .presenceOfElementLocated(By.id("list_con"))); 219 | } catch (Exception e) { 220 | // 如果出现页面中没有list_con元素的情况,视为没有职位信息,直接退出本页面 221 | return -1; 222 | } 223 | // wait.until(ExpectedConditions.presenceOfElementLocated(By.className("next"))); 224 | 225 | return 0; 226 | } 227 | 228 | /** 229 | * 爬取完数据后的翻页操作 230 | * @param driver 浏览器驱动对象 231 | * @return 0表示翻页操作可以正常执行,-1表示翻页操作不能继续进行 232 | */ 233 | public static int nextPage(ChromeDriver driver) { 234 | // 使用findElements可以避免出现‘页面中没有next元素’而导致的异常 235 | List nextlist = driver.findElementsByClassName("next"); 236 | // 如果页面中没有next元素,则不点击next,直接退出本次循环 237 | if (nextlist == null || nextlist.isEmpty()) { 238 | return -1; 239 | } 240 | 241 | WebElement next = nextlist.get(0); 242 | 243 | // 一旦翻页按钮无法使用,表示到了最后一页,则退出循环 244 | if (next.getAttribute("class").contains("disabled")) { 245 | return -1; 246 | } 247 | next.click(); 248 | return 0; 249 | } 250 | 251 | /** 252 | * 创建职位信息的封装类 253 | * @param webElement 254 | * @param city 城市信息 255 | * @param key 关键字 256 | * @return 封装职位信息的Job对象 257 | */ 258 | private static Job createJobVo(WebElement webElement, String city, 259 | String key) { 260 | String title = webElement.findElement(By.className("job_name")) 261 | .getText(); 262 | String job_name = webElement.findElement(By.className("cate")) 263 | .getText(); 264 | String salary = webElement.findElement(By.className("job_salary")) 265 | .getText(); 266 | String company = webElement.findElement(By.className("comp_name")) 267 | .getText(); 268 | String education = webElement.findElement(By.className("xueli")) 269 | .getText(); 270 | String experience = webElement.findElement(By.className("jingyan")) 271 | .getText(); 272 | 273 | Job job = new Job(null, city, key, title, salary.split("元/月")[0], 274 | company.split(" ")[0], job_name, education, experience); 275 | return job; 276 | } 277 | } 278 | -------------------------------------------------------------------------------- /src/main/java/com/edmund/crawler/KeyMapMerger.java: -------------------------------------------------------------------------------- 1 | package com.edmund.crawler; 2 | 3 | import java.util.HashMap; 4 | import java.util.List; 5 | import java.util.Map; 6 | import java.util.Set; 7 | 8 | import com.edmund.utils.DataBaseConnection; 9 | import com.edmund.utils.LGDBUtils; 10 | import com.edmund.vo.KeyMap; 11 | 12 | /** 13 | * 关键字map的合并类 14 | * @author 
Edmund 15 | * 16 | */ 17 | public class KeyMapMerger { 18 | 19 | private DataBaseConnection dbc = new DataBaseConnection(); 20 | private LGDBUtils utils = new LGDBUtils(dbc); 21 | private static String[] keys = { "web", "java", "python", "c++", "c#", 22 | "android", "linux" }; 23 | 24 | public static void main(String[] args) { 25 | new KeyMapMerger().merge(); 26 | } 27 | 28 | /** 29 | * 合并数据库中所有条目的Map集合,整合为一个Map集合,并输出到本地文件系统中 30 | */ 31 | private void merge() { 32 | for (String keyword : keys) { 33 | Map kwMerge = new HashMap(); 34 | // List jobs = utils.getLGJob(keyword); 35 | // for (LGJob job : jobs) { 36 | List kms = utils.getKeyMap(keyword); 37 | for (KeyMap km : kms) { 38 | // Map kwMap = job.getKeywords(); 39 | Map kwMap = km.getKeywords(); 40 | Set keyset = kwMap.keySet(); 41 | for (String key : keyset) { 42 | if (kwMerge.containsKey(key)) { 43 | kwMerge.put(key, kwMerge.get(key) + kwMap.get(key)); 44 | } else { 45 | if (key.contains("/")) { 46 | String[] keys = key.split("/"); 47 | for (String inner_key : keys) { 48 | if (kwMerge.containsKey(inner_key)) { 49 | kwMerge.put(inner_key, 50 | kwMerge.get(inner_key) 51 | + kwMap.get(key)); 52 | } else { 53 | kwMerge.put(inner_key, kwMap.get(key)); 54 | } 55 | } 56 | } else { 57 | kwMerge.put(key, kwMap.get(key)); 58 | } 59 | 60 | } 61 | } 62 | } 63 | utils.writeKeyMapToMysql(kwMerge, keyword); 64 | } 65 | dbc.close(); 66 | } 67 | } 68 | -------------------------------------------------------------------------------- /src/main/java/com/edmund/crawler/LGJobCleaner.java: -------------------------------------------------------------------------------- 1 | package com.edmund.crawler; 2 | 3 | import com.edmund.utils.DataBaseConnection; 4 | import com.edmund.utils.LGCleanUtils; 5 | 6 | /** 7 | * 拉钩数据表的清洗类 8 | * @author Edmund 9 | * 10 | */ 11 | public class LGJobCleaner { 12 | private DataBaseConnection dbc = new DataBaseConnection(); 13 | private LGCleanUtils utils = new LGCleanUtils(dbc); 14 | 15 | private static final int LAGOU = 0; 16 | private static final int BOSS = 1; 17 | 18 | public static void main(String[] args) { 19 | new LGJobCleaner().clean(); 20 | } 21 | 22 | private void clean() { 23 | utils.JobClean(LAGOU); 24 | dbc.close(); 25 | } 26 | } 27 | -------------------------------------------------------------------------------- /src/main/java/com/edmund/crawler/LGJobCrawler.java: -------------------------------------------------------------------------------- 1 | package com.edmund.crawler; 2 | 3 | import java.io.FileInputStream; 4 | import java.io.IOException; 5 | import java.util.List; 6 | import java.util.Properties; 7 | 8 | import org.openqa.selenium.By; 9 | import org.openqa.selenium.Keys; 10 | import org.openqa.selenium.WebElement; 11 | import org.openqa.selenium.chrome.ChromeDriver; 12 | import org.openqa.selenium.support.ui.ExpectedConditions; 13 | import org.openqa.selenium.support.ui.WebDriverWait; 14 | 15 | import com.edmund.utils.DataBaseConnection; 16 | import com.edmund.utils.LGDBUtils; 17 | 18 | /** 19 | * 拉钩网爬虫类 20 | * 现在用于从city_url表中读取需要处理的所有url,然后将抓取到的所有href保存到ready_url表中 21 | * 爬虫处理阶段2 22 | * @author Edmund 23 | * 24 | */ 25 | public class LGJobCrawler { 26 | private static String[] keys = { "web", "java", "python", "c++", "c#", 27 | "android", "linux" }; 28 | 29 | private static String localdriver = null; // 本地浏览器驱动位置 30 | private DataBaseConnection dbc = new DataBaseConnection(); 31 | private LGDBUtils utils = new LGDBUtils(dbc); 32 | 33 | /** 34 | * 读取配置文件 35 | */ 36 | static { 37 | Properties property = new 
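/*
 * Pipeline recap, per the stage javadocs in this package: stage 1
 * (LGJobUrlGenerator) fills city_url from the keyword x city matrix,
 * stage 2 (this class) expands every city_url listing page into per-job
 * links in ready_url, and stage 3 (LGJobCrawlerThread) turns each
 * ready_url row into a lagou record. Every stage claims a row by flipping
 * its state column from 0 to 1, so the stages can run side by side.
 */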
Properties(); 38 | try { 39 | property.load(new FileInputStream( 40 | "./src/main/java/com/edmund/properties")); 41 | } catch (IOException e) { 42 | e.printStackTrace(); 43 | } 44 | localdriver = property.getProperty("LocalChromedriver"); 45 | 46 | } 47 | 48 | public static void main(String[] args) throws Exception { 49 | LGJobCrawler lgCrawler = new LGJobCrawler(); 50 | ChromeDriver driver = initBrowser(); 51 | 52 | // for (int i = 0; i < THREAD_NUMBER; i++) { 53 | // new LGJobCrawler().new LGJobCrawlerThread().start(); 54 | // } 55 | String url = null; 56 | 57 | while ((url = lgCrawler.read()) != null) { 58 | lgCrawler.crawJobs(url, driver); 59 | } 60 | } 61 | 62 | private String read() { 63 | return utils.readFromCityURL(); 64 | } 65 | 66 | /** 67 | * 初始化浏览器驱动 68 | * @return 浏览器驱动对象 69 | */ 70 | private static ChromeDriver initBrowser() { 71 | System.setProperty("webdriver.chrome.driver", localdriver); 72 | ChromeDriver driver = new ChromeDriver(); 73 | return driver; 74 | } 75 | 76 | /** 77 | * 在爬取数据之前做的预处理工作 78 | * @param url 需要爬取的url 79 | * @param driver 浏览器驱动对象 80 | * @return 0表示预处理正常,-1表示预处理失败 81 | */ 82 | private static int pretreatment(String url, ChromeDriver driver) { 83 | driver.get(url); 84 | // driver.manage().window().maximize(); 85 | 86 | WebDriverWait wait = new WebDriverWait(driver, 5); 87 | 88 | try { 89 | wait.until(ExpectedConditions 90 | .presenceOfElementLocated(By.id("s_position_list"))); 91 | } catch (Exception e) { 92 | return -1; 93 | } 94 | 95 | return 0; 96 | } 97 | 98 | /** 99 | * 从给定url爬取职位信息 100 | * @param url 网页路径 101 | * @param driver 浏览器驱动 102 | * @return 职位信息列表 103 | * @throws Exception 104 | */ 105 | public void crawJobs(String url, ChromeDriver driver) throws Exception { 106 | 107 | try { 108 | if (pretreatment(url, driver) == -1) { 109 | return; 110 | } 111 | 112 | while (true) { 113 | WebElement list = driver.findElementById("s_position_list"); 114 | WebElement list_ul = list.findElement(By.tagName("ul")); 115 | List positions = list_ul 116 | .findElements(By.tagName("li")); 117 | for (WebElement webElement : positions) { 118 | String href = webElement.findElement(By.tagName("a")) 119 | .getAttribute("href"); 120 | utils.writeIntoReadyURL(href, whichKey(url)); 121 | } 122 | 123 | if (nextPage(driver) == -1) { 124 | break; 125 | } 126 | } 127 | } catch (Exception e) { 128 | restart(url); 129 | e.printStackTrace(); 130 | } 131 | } 132 | 133 | /** 134 | * 处理url出现异常时,恢复该url在数据库中的状态,并且休息10秒钟 135 | * @param url 136 | */ 137 | private void restart(String url) { 138 | utils.restoreReadyURL(url); 139 | System.out.println("正在回滚数据"); 140 | try { 141 | Thread.sleep(10000); 142 | } catch (InterruptedException e) { 143 | e.printStackTrace(); 144 | } 145 | } 146 | 147 | /** 148 | * 爬取完数据后的翻页操作 149 | * @param driver 浏览器驱动对象 150 | * @return 0表示翻页操作可以正常执行,-1表示翻页操作不能继续进行 151 | * @throws InterruptedException 152 | */ 153 | private static int nextPage(ChromeDriver driver) 154 | throws InterruptedException { 155 | // 使用findElements可以避免出现‘页面中没有next元素’而导致的异常 156 | List nextlist = driver.findElements( 157 | By.cssSelector("#s_position_list span.pager_next")); 158 | 159 | // 如果页面中没有next元素,则不点击next,直接退出本次循环 160 | if (nextlist == null || nextlist.isEmpty()) { 161 | return -1; 162 | } 163 | 164 | WebElement next = nextlist.get(0); 165 | driver.getKeyboard().sendKeys(Keys.END); 166 | Thread.sleep(2000); 167 | // 一旦翻页按钮无法使用,表示到了最后一页,则退出循环 168 | if (next.getAttribute("class").contains("pager_next_disabled")) { 169 | return -1; 170 | } 171 | next.click(); 172 | Thread.sleep(2000); 
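/*
 * The fixed Thread.sleep(2000) calls here paper over page-load latency. A
 * minimal sketch of an explicit wait instead -- assuming the old result
 * list goes stale once the next page has rendered:
 *
 *   WebElement oldList = driver.findElementById("s_position_list");
 *   next.click();
 *   new WebDriverWait(driver, 10)
 *           .until(ExpectedConditions.stalenessOf(oldList));
 */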
173 | return 0; 174 | } 175 | 176 | /** 177 | * 根据url判断该url属于哪个关键字 178 | * @param url 179 | * @return 关键字 180 | */ 181 | private static String whichKey(String url) { 182 | for (String key : keys) { 183 | if (url.contains(key)) { 184 | return key; 185 | } 186 | } 187 | return null; 188 | } 189 | 190 | } 191 | -------------------------------------------------------------------------------- /src/main/java/com/edmund/crawler/LGJobCrawlerThread.java: -------------------------------------------------------------------------------- 1 | package com.edmund.crawler; 2 | 3 | import java.io.FileInputStream; 4 | import java.io.IOException; 5 | import java.util.HashMap; 6 | import java.util.Map; 7 | import java.util.Properties; 8 | 9 | import org.jsoup.Jsoup; 10 | import org.jsoup.nodes.Document; 11 | import org.openqa.selenium.By; 12 | import org.openqa.selenium.chrome.ChromeDriver; 13 | import org.openqa.selenium.support.ui.ExpectedConditions; 14 | import org.openqa.selenium.support.ui.WebDriverWait; 15 | 16 | import com.edmund.utils.DataBaseConnection; 17 | import com.edmund.utils.LGDBUtils; 18 | import com.edmund.vo.LGJob; 19 | 20 | import jeasy.analysis.MMAnalyzer; 21 | 22 | /** 23 | * 多线程静态爬取职位信息的线程类 24 | * 现在用于从ready_url表中读取出需要处理的url,然后将处理结果存入lagou表中 25 | * 爬虫处理阶段3 26 | * @author Edmund 27 | * 28 | */ 29 | class LGJobCrawlerThread extends Thread { 30 | 31 | private DataBaseConnection dbc = new DataBaseConnection(); 32 | private LGDBUtils utils = new LGDBUtils(dbc); 33 | 34 | private static String localdriver = null; // 本地浏览器驱动位置 35 | 36 | private static final String USER_AGENT = "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36"; 37 | 38 | /** 39 | * 读取配置文件 40 | */ 41 | static { 42 | Properties property = new Properties(); 43 | try { 44 | property.load(new FileInputStream( 45 | "./src/main/java/com/edmund/properties")); 46 | } catch (IOException e) { 47 | e.printStackTrace(); 48 | } 49 | localdriver = property.getProperty("LocalChromedriver"); 50 | } 51 | 52 | public static void main(String[] args) throws InterruptedException { 53 | // for (int i = 0; i < THREAD_NUMBER; i++) { 54 | // new LGJobCrawlerThread().start(); 55 | // Thread.sleep(5000); 56 | // } 57 | } 58 | 59 | @Override 60 | public void run() { 61 | ChromeDriver driver = initBrowser(); 62 | while (true) { 63 | try { 64 | String[] infos = null; 65 | if ((infos = utils.readFromReadyURL()) == null) { 66 | try { 67 | Thread.sleep(6000); 68 | } catch (InterruptedException e) { 69 | e.printStackTrace(); 70 | } 71 | } else { 72 | LGJob job = getJobDetails_Dynamic(infos, driver); 73 | utils.insertLGJob(job); 74 | } 75 | } catch (Exception e) { 76 | e.printStackTrace(); 77 | } 78 | } 79 | } 80 | 81 | /** 82 | * 初始化浏览器驱动 83 | * @return 浏览器驱动对象 84 | */ 85 | private static ChromeDriver initBrowser() { 86 | System.setProperty("webdriver.chrome.driver", localdriver); 87 | ChromeDriver driver = new ChromeDriver(); 88 | return driver; 89 | } 90 | 91 | /** 92 | * 在爬取数据之前做的预处理工作 93 | * @param url 需要爬取的url 94 | * @param driver 浏览器驱动对象 95 | * @return 0表示预处理正常,-1表示预处理失败 96 | */ 97 | private int pretreatment(String url, ChromeDriver driver) { 98 | driver.get(url); 99 | // driver.manage().window().maximize(); 100 | 101 | WebDriverWait wait = new WebDriverWait(driver, 5); 102 | 103 | try { 104 | wait.until(ExpectedConditions 105 | .presenceOfElementLocated(By.className("position-head"))); 106 | wait.until(ExpectedConditions 107 | .presenceOfElementLocated(By.id("job_detail"))); 108 | 109 | } catch 
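/*
 * When the explicit waits above time out (no "position-head" or
 * "job_detail" element ever appears), the handler below prints the stack
 * trace, calls restart(url) to flip the row's state back to 0 in
 * ready_url, and returns -1 so the caller skips this page; a later
 * readFromReadyURL() call can then claim the same url again.
 */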
(Exception e) { 110 | e.printStackTrace(); 111 | restart(url); 112 | return -1; 113 | } 114 | 115 | return 0; 116 | } 117 | 118 | private LGJob getJobDetails_Dynamic(String[] infos, ChromeDriver driver) { 119 | LGJob job = null; 120 | String url = infos[0]; 121 | // filter: only urls that actually carry job data get through 122 | if (url.matches(".*lagou\\.com/jobs/[0-9]+\\..?html")) { 123 | if (pretreatment(url, driver) == -1) { 124 | return null; 125 | } 126 | String key = infos[1]; 127 | String[] job_request = driver.findElementByClassName("job_request") 128 | .getText().split("/"); 129 | String salary = job_request[0].trim(); 130 | String city = job_request[1].trim(); 131 | String experience = job_request[2].trim(); 132 | String education = job_request[3].trim(); 133 | 134 | String company = driver.findElementByClassName("company").getText(); 135 | String keywords = driver.findElementByClassName("job_bt") 136 | .findElement(By.tagName("div")).getText(); 137 | 138 | job = new LGJob(null, key, null, salary, city, experience, 139 | education, company.substring(0, company.length() - 2), 140 | getKeywordsMap(keywords)); 141 | 142 | } else { 143 | return null; 144 | } 145 | 146 | return job; 147 | } 148 | 149 | /** 150 | * Fetches the job details from the infos array; infos[0] holds the url, infos[1] holds the keyword 151 | * @param infos array holding the url and the keyword 152 | * @return the populated job VO, or null if the page could not be fetched or parsed 153 | */ 154 | private LGJob getJobDetails(String[] infos) { 155 | Document doc = null; 156 | LGJob job = null; 157 | String url = infos[0]; 158 | try { 159 | // filter: only urls that actually carry job data get through 160 | if (url.matches(".*lagou\\.com/jobs/[0-9]+\\..?html")) { 161 | doc = Jsoup.connect(url).userAgent(USER_AGENT).get(); 162 | } else { 163 | return null; 164 | } 165 | } catch (IOException e) { 166 | e.printStackTrace(); 167 | return null; 168 | } 169 | String key = infos[1]; 170 | String[] job_request = null; 171 | try { 172 | job_request = doc.getElementsByClass("job_request").first().text() 173 | .split("/"); 174 | } catch (Exception e) { 175 | restart(url); e.printStackTrace(); 176 | return null; // job_request is still null here, so falling through would throw an NPE 177 | } 178 | String salary = job_request[0].trim(); 179 | String city = job_request[1].trim(); 180 | String experience = job_request[2].trim(); 181 | String education = job_request[3].trim(); 182 | 183 | String company = doc.getElementsByClass("company").first().text(); 184 | String keywords = doc.getElementsByClass("job_bt").first() 185 | .getElementsByTag("div").text(); 186 | 187 | job = new LGJob(null, key, null, salary, city, experience, education, 188 | company, getKeywordsMap(keywords)); 189 | 190 | return job; 191 | } 192 | 193 | /** 194 | * Invoked when handling a url fails: restores the url's state in the database and rests for 10 seconds 195 | * @param url 196 | */ 197 | private void restart(String url) { 198 | utils.restoreReadyURL(url); 199 | try { 200 | Thread.sleep(10000); 201 | } catch (InterruptedException e) { 202 | e.printStackTrace(); 203 | } 204 | } 205 | 206 | /** 207 | * Segments the given text, keeps the English tokens, and stores each token's occurrence count in a map 208 | * @param keywords the text to segment 209 | * @return the tokens and their occurrence counts 210 | */ 211 | private static Map<String, Integer> getKeywordsMap(String keywords) { 212 | Map<String, Integer> kwMap = new HashMap<String, Integer>(); 213 | MMAnalyzer mm = new MMAnalyzer(); 214 | MMAnalyzer.addWord("C#"); 215 | MMAnalyzer.addWord("c#"); 216 | try { 217 | String[] kwStrs = mm.segment(keywords, "|").split("\\|"); 218 | for (String kwStr : kwStrs) { 219 | if (!kwStr.matches("[a-zA-Z/#\\\\]+")) { 220 | continue; 221 | } 222 | if (kwMap.containsKey(kwStr)) { 223 | kwMap.put(kwStr, kwMap.get(kwStr) + 1); 224 | } else { 225 | kwMap.put(kwStr, 1); 226 | } 227 | } 228 | } catch (IOException e) { 229 | e.printStackTrace(); 230 | } 231 | return kwMap; 
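/*
 * getKeywordsMap above leans on je-analysis (MMAnalyzer) for segmentation
 * and keeps only tokens matching [a-zA-Z/#\\]+. A dependency-free sketch
 * of the same counting step -- assuming plain splitting on non-token
 * characters is acceptable instead of real segmentation:
 *
 *   Map<String, Integer> counts = new HashMap<String, Integer>();
 *   for (String t : keywords.split("[^A-Za-z#+/]+")) {
 *       if (t.isEmpty()) continue;
 *       Integer n = counts.get(t);
 *       counts.put(t, n == null ? 1 : n + 1);
 *   }
 */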
232 | } 233 | } -------------------------------------------------------------------------------- /src/main/java/com/edmund/crawler/LGJobUrlGenerator.java: -------------------------------------------------------------------------------- 1 | package com.edmund.crawler; 2 | 3 | import java.io.IOException; 4 | import java.util.List; 5 | 6 | import com.edmund.utils.DataBaseConnection; 7 | import com.edmund.utils.LGDBUtils; 8 | import com.edmund.vo.LGJob; 9 | 10 | /** 11 | * 用于根据关键字和城市来生成所有需要处理的url,并存入city_url表中 12 | * 爬虫处理阶段1 13 | * @author Edmund 14 | * 15 | */ 16 | public class LGJobUrlGenerator { 17 | 18 | private DataBaseConnection dbc = new DataBaseConnection(); 19 | private LGDBUtils utils = new LGDBUtils(dbc); 20 | private static String[] keys = { "web", "java", "python", "c++", "c#", 21 | "android", "linux" }; 22 | private static String root = "https://www.lagou.com/jobs/list_%KW%?px=default&city=%CT%#filterBox"; 23 | 24 | public static void main(String[] args) throws IOException { 25 | new LGJobUrlGenerator().initURLList(); 26 | // Test test = new Test(); 27 | // String line = null; 28 | // List jobs = test.read(); 29 | // for (LGJob lgJob : jobs) { 30 | // System.out.println(lgJob.getKeywords()); 31 | // } 32 | } 33 | 34 | private void initURLList() throws IOException { 35 | List cities = utils 36 | .readFromFile("C:/Users/admin/Desktop/files/lagou.txt"); 37 | for (String key : keys) { 38 | for (String city : cities) { 39 | String url = root.replace("%KW%", key).replace("%CT%", city); 40 | utils.writeIntoCityURL(url); 41 | } 42 | } 43 | } 44 | 45 | private List read() { 46 | return utils.getLGJob("web"); 47 | } 48 | } 49 | -------------------------------------------------------------------------------- /src/main/java/com/edmund/properties: -------------------------------------------------------------------------------- 1 | LocalChromedriver=D:/utils/chromedriver.exe 2 | LocalExportPath=C:/Users/admin/Desktop/export -------------------------------------------------------------------------------- /src/main/java/com/edmund/test/Test.java: -------------------------------------------------------------------------------- 1 | package com.edmund.test; 2 | 3 | import java.io.IOException; 4 | 5 | import com.edmund.utils.DBUtils; 6 | 7 | public class Test { 8 | public static void main(String[] args) throws IOException { 9 | DBUtils.readFromFile("emp.txt"); 10 | 11 | } 12 | } 13 | -------------------------------------------------------------------------------- /src/main/java/com/edmund/utils/DBUtils.java: -------------------------------------------------------------------------------- 1 | package com.edmund.utils; 2 | 3 | import java.io.BufferedReader; 4 | import java.io.File; 5 | import java.io.FileInputStream; 6 | import java.io.FileNotFoundException; 7 | import java.io.FileOutputStream; 8 | import java.io.IOException; 9 | import java.io.InputStreamReader; 10 | import java.io.PrintWriter; 11 | import java.util.ArrayList; 12 | import java.util.HashMap; 13 | import java.util.List; 14 | import java.util.Map; 15 | 16 | import com.edmund.vo.Job; 17 | 18 | /** 19 | * 用于支持职位信息爬虫的文件操作类 20 | * 21 | * @author Edmund 22 | * 23 | */ 24 | public class DBUtils { 25 | private static PrintWriter pw = null; 26 | private static int count = 1; 27 | 28 | /** 29 | * 从文件中读取网站根路径和城市 30 | * 31 | * @param filepath 32 | * 文件路径 33 | * @return 包含网站根路径列表和城市列表的map集合,可以通过get("cities")获得城市列表,get("roots")获得网站根路径列表,两个列表的索引一一对应 34 | * @throws IOException 35 | */ 36 | public static Map> readFromFile(String filepath) 37 | throws 
IOException { 38 | Map> infos = new HashMap>(); 39 | List cities = new ArrayList(); 40 | List roots = new ArrayList(); 41 | 42 | File file = new File(filepath); 43 | FileInputStream in = new FileInputStream(file); 44 | BufferedReader reader = new BufferedReader( 45 | new InputStreamReader(in, "UTF-8")); 46 | String line = null; 47 | 48 | while ((line = reader.readLine()) != null) { 49 | cities.add(line.split("\\t")[1]); 50 | roots.add(line.split("\\t")[2]); 51 | } 52 | infos.put("cities", cities); 53 | infos.put("roots", roots); 54 | reader.close(); 55 | return infos; 56 | } 57 | 58 | /** 59 | * 将职位信息写入到文件中 60 | * 61 | * @param job 62 | * 职位信息 63 | * @param filepath 64 | * 保存的文件路径 65 | * @throws FileNotFoundException 66 | */ 67 | public static void writeToFile(Job job, String filepath) 68 | throws FileNotFoundException { 69 | PrintWriter pw = new PrintWriter( 70 | new FileOutputStream(new File(filepath), true)); 71 | pw.print(job.getCity() + "\t"); 72 | pw.print(job.getKey() + "\t"); 73 | pw.print(job.getTitle() + "\t"); 74 | pw.print(job.getSalary() + "\t"); 75 | pw.print(job.getCompany() + "\t"); 76 | pw.print(job.getJob() + "\t"); 77 | pw.print(job.getEducation() + "\t"); 78 | pw.println(job.getExperience()); 79 | pw.flush(); 80 | pw.close(); 81 | } 82 | 83 | /** 84 | * 将职位信息列表中的职位信息写入到文件中 85 | * 86 | * @param jobs 87 | * 职位信息列表 88 | * @param filepath 89 | * 文件路径 90 | */ 91 | public static void writeToFile(List jobs, String filepath) { 92 | if (jobs == null || jobs.isEmpty()) { 93 | return; 94 | } 95 | try { 96 | initWriter(filepath); 97 | } catch (FileNotFoundException e) { 98 | e.printStackTrace(); 99 | } 100 | for (Job job : jobs) { 101 | System.out.println("正在处理: " + job + ",已处理: " + count++); 102 | pw.print(job.getCity() + "\t"); 103 | pw.print(job.getKey() + "\t"); 104 | pw.print(job.getTitle() + "\t"); 105 | pw.print(job.getSalary() + "\t"); 106 | pw.print(job.getCompany() + "\t"); 107 | pw.print(job.getJob() + "\t"); 108 | pw.print(job.getEducation() + "\t"); 109 | pw.println(job.getExperience()); 110 | } 111 | pw.flush(); 112 | closeAll(); 113 | 114 | } 115 | 116 | /** 117 | * 关闭writer 118 | */ 119 | public static void closeAll() { 120 | if (pw != null) { 121 | pw.close(); 122 | pw = null; 123 | } 124 | } 125 | 126 | /** 127 | * 开启writer 128 | * @param filepath 文件路径 129 | * @throws FileNotFoundException 130 | */ 131 | public static void initWriter(String filepath) 132 | throws FileNotFoundException { 133 | if (pw == null) { 134 | File file = new File(filepath); 135 | if (!file.getParentFile().exists()) { 136 | file.getParentFile().mkdirs(); 137 | } 138 | pw = new PrintWriter(new FileOutputStream(file, true)); 139 | } 140 | } 141 | } 142 | -------------------------------------------------------------------------------- /src/main/java/com/edmund/utils/DataBaseConnection.java: -------------------------------------------------------------------------------- 1 | package com.edmund.utils; 2 | 3 | import java.sql.Connection; 4 | import java.sql.DriverManager; 5 | import java.sql.SQLException; 6 | 7 | /** 8 | * 数据库连接管理类 9 | * 10 | * @author Edmund 11 | * 12 | */ 13 | public class DataBaseConnection { 14 | private static String DBDRIVER = "org.gjt.mm.mysql.Driver"; 15 | private static String DBURL = "jdbc:mysql://10.60.72.28:3306/test"; 16 | private static String DBUSER = "root"; 17 | private static String DBPASSWORD = "redhat"; 18 | 19 | private Connection conn = null; 20 | 21 | public DataBaseConnection() { 22 | super(); 23 | } 24 | 25 | /** 26 | * 返回一个数据库连接 27 | * 28 | * @return 29 | 
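| * the cached Connection, reopened first if it is null or already closed.
* Note: org.gjt.mm.mysql.Driver is the legacy compatibility alias for
* com.mysql.jdbc.Driver, and a Connection is not safe to share between
* threads -- which is why each crawler thread builds its own
* DataBaseConnection instead of passing one instance around.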
*/ 30 | public Connection getConn() { 31 | try { 32 | if (conn == null || conn.isClosed()) { 33 | Class.forName(DBDRIVER); 34 | conn = DriverManager.getConnection(DBURL, DBUSER, DBPASSWORD); 35 | } 36 | } catch (SQLException e) { 37 | e.printStackTrace(); 38 | } catch (ClassNotFoundException e) { 39 | e.printStackTrace(); 40 | } 41 | return conn; 42 | } 43 | 44 | /** 45 | * 关闭数据库连接 46 | */ 47 | public void close() { 48 | if (conn != null) { 49 | try { 50 | conn.close(); 51 | } catch (SQLException e) { 52 | // TODO Auto-generated catch block 53 | e.printStackTrace(); 54 | } 55 | } 56 | } 57 | 58 | } 59 | -------------------------------------------------------------------------------- /src/main/java/com/edmund/utils/LGCleanUtils.java: -------------------------------------------------------------------------------- 1 | package com.edmund.utils; 2 | 3 | import java.sql.PreparedStatement; 4 | import java.sql.ResultSet; 5 | import java.sql.SQLException; 6 | 7 | /** 8 | * 拉钩数据表清洗工具类 9 | * @author Edmund 10 | * 11 | */ 12 | public class LGCleanUtils { 13 | public DataBaseConnection dbc = null; 14 | 15 | public LGCleanUtils(DataBaseConnection dbc) { 16 | this.dbc = dbc; 17 | } 18 | 19 | /** 20 | * experience字段的清洗方法 21 | * @param experience 22 | * @return -1表示清洗失败, 否则返回大于0的值 23 | */ 24 | private int experienceClean(String experience) { 25 | String strClean = experience.substring(2); 26 | int expClean = -1; 27 | if (strClean.matches("[0-9]+-[0-9]+年")) { 28 | expClean = Integer.parseInt(strClean.split("-")[0]); 29 | } else if (strClean.contains("不限") || strClean.contains("应届毕业生") 30 | || strClean.matches("[0-9]+年以下")) { 31 | expClean = 0; 32 | } else if (strClean.matches("[0-9]+年以上")) { 33 | expClean = Integer.parseInt(strClean.split("年")[0]); 34 | } 35 | return expClean; 36 | } 37 | 38 | /** 39 | * education字段的清洗方法 40 | * @param education 41 | * @return -1 表示清洗失败,否则返回大于0的值 42 | */ 43 | private int educationClean(String education) { 44 | int edu = -1; 45 | if (education.matches("学历不限")) { 46 | edu = 0; 47 | } else if (education.matches("大专及以上")) { 48 | edu = 1; 49 | } else if (education.matches("本科及以上")) { 50 | edu = 2; 51 | } else if (education.matches("硕士及以上")) { 52 | edu = 3; 53 | } 54 | return edu; 55 | } 56 | 57 | /** 58 | * salary字段的清洗方法 59 | * @param salary 60 | * @return 三个均为0表示清洗失败,否则返回大于0的三个值数组 61 | */ 62 | private int[] salaryClean(String salary) { 63 | int[] cleanSal = new int[3]; 64 | int min_salary = 0; 65 | int max_salary = 0; 66 | int avg_salary = 0; 67 | if (salary.matches("[0-9]+[kK]-[0-9]+[kK]")) { 68 | String[] sals = salary.split("-"); 69 | min_salary = Integer 70 | .parseInt(sals[0].replace("k", "000").replace("K", "000")); 71 | max_salary = Integer 72 | .parseInt(sals[1].replace("k", "000").replace("K", "000")); 73 | } else if (salary.matches("[0-9]+[kK]以上")) { 74 | String[] sals = salary.split("以上"); 75 | min_salary = Integer 76 | .parseInt(sals[0].replace("k", "000").replace("K", "000")); 77 | max_salary = Integer.parseInt( 78 | sals[0].replace("k", "000").replace("K", "000")) + 5000; 79 | } 80 | avg_salary = (min_salary + max_salary) / 2; 81 | cleanSal[0] = min_salary; 82 | cleanSal[1] = max_salary; 83 | cleanSal[2] = avg_salary; 84 | return cleanSal; 85 | } 86 | 87 | /** 88 | * 拉钩数据表条目的清洗方法,用于清洗整个数据条目 89 | * @param data_from 0表示清洗拉勾网数据,1表示清洗BOSS网数据 90 | */ 91 | public void JobClean(int data_from) { 92 | String query_sql = "SELECT id,key_word,job,salary,province,city,experience,education,company FROM lagou"; 93 | String insert_sql = "INSERT INTO 
job_data_result(data_from,province,city,key_word,company_or_team,min_salary,max_salary,avg_salary,min_experience,min_education) VALUES(?,?,?,?,?,?,?,?,?,?)"; 94 | try { 95 | PreparedStatement pst = dbc.getConn().prepareStatement(query_sql); 96 | ResultSet rs = pst.executeQuery(); 97 | while (rs.next()) { 98 | int id = rs.getInt(1); 99 | String key_word = rs.getString(2); 100 | String job = rs.getString(3); 101 | String salary = rs.getString(4); 102 | String province = rs.getString(5); 103 | String city = rs.getString(6); 104 | String experience = rs.getString(7); 105 | String education = rs.getString(8); 106 | String company_or_team = rs.getString(9); 107 | 108 | int min_education = educationClean(education); 109 | int min_experience = experienceClean(experience); 110 | int[] cleanSal = salaryClean(salary); 111 | int min_salary = cleanSal[0]; 112 | int max_salary = cleanSal[1]; 113 | int avg_salary = cleanSal[2]; 114 | 115 | pst = dbc.getConn().prepareStatement(insert_sql); 116 | pst.setInt(1, data_from); 117 | pst.setString(2, province); 118 | pst.setString(3, city); 119 | pst.setString(4, key_word); 120 | pst.setString(5, company_or_team); 121 | pst.setInt(6, min_salary); 122 | pst.setInt(7, max_salary); 123 | pst.setInt(8, avg_salary); 124 | pst.setInt(9, min_experience); 125 | pst.setInt(10, min_education); 126 | 127 | pst.executeUpdate(); 128 | } 129 | 130 | rs.close(); 131 | pst.close(); 132 | 133 | } catch (SQLException e) { 134 | e.printStackTrace(); 135 | } 136 | } 137 | } 138 | -------------------------------------------------------------------------------- /src/main/java/com/edmund/utils/LGDBUtils.java: -------------------------------------------------------------------------------- 1 | package com.edmund.utils; 2 | 3 | import java.io.BufferedReader; 4 | import java.io.File; 5 | import java.io.FileInputStream; 6 | import java.io.FileNotFoundException; 7 | import java.io.FileOutputStream; 8 | import java.io.IOException; 9 | import java.io.InputStreamReader; 10 | import java.io.ObjectInputStream; 11 | import java.io.PrintWriter; 12 | import java.sql.Blob; 13 | import java.sql.PreparedStatement; 14 | import java.sql.ResultSet; 15 | import java.sql.SQLException; 16 | import java.util.ArrayList; 17 | import java.util.List; 18 | import java.util.Map; 19 | import java.util.Set; 20 | 21 | import com.edmund.vo.KeyMap; 22 | import com.edmund.vo.LGJob; 23 | 24 | /** 25 | * 拉勾网操作工具类 26 | * @author Edmund 27 | * 28 | */ 29 | public class LGDBUtils { 30 | private PrintWriter pw = null; 31 | 32 | public DataBaseConnection dbc = null; 33 | 34 | public LGDBUtils(DataBaseConnection dbc) { 35 | this.dbc = dbc; 36 | } 37 | 38 | /** 39 | * 向数据库中写入需要处理的url 40 | * @param url 41 | */ 42 | public void writeIntoReadyURL(String url, String keyword) { 43 | String sql = "INSERT INTO ready_url (url,state,keyword) VALUES (?,0,?)"; 44 | try { 45 | dbc.getConn().setAutoCommit(false); 46 | PreparedStatement pst = dbc.getConn().prepareStatement(sql); 47 | pst.setString(1, url); 48 | pst.setString(2, keyword); 49 | pst.executeUpdate(); 50 | dbc.getConn().commit(); 51 | pst.close(); 52 | 53 | } catch (SQLException e) { 54 | try { 55 | dbc.getConn().rollback(); 56 | } catch (SQLException e1) { 57 | e1.printStackTrace(); 58 | } 59 | e.printStackTrace(); 60 | } 61 | } 62 | 63 | /** 64 | * 从数据库中读取未被处理过得url,并将其的状态值改为1,由于需要记录职位对应的关键字,故将keyword也一并取出 65 | * @return 66 | * @throws SQLException 67 | */ 68 | public String[] readFromReadyURL() { 69 | try { 70 | dbc.getConn().setAutoCommit(false); 71 | } catch (SQLException e2) 
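/*
 * Claiming work is a SELECT-then-UPDATE pair inside one transaction
 * (autocommit is turned off above). With several threads polling the same
 * table, two workers can still read the same state=0 row before either
 * UPDATE commits. A sketch of a stricter claim -- assuming MySQL/InnoDB --
 * locks the row while selecting it:
 *
 *   String sql = "SELECT id,url,keyword FROM ready_url"
 *           + " WHERE state=0 LIMIT 1 FOR UPDATE";
 */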
{ 72 | e2.printStackTrace(); 73 | } 74 | String sql = "SELECT id,url,keyword FROM ready_url WHERE state=0 LIMIT 1"; 75 | String updateSql = "UPDATE ready_url SET state=1 WHERE id=?"; 76 | String[] infos = null; 77 | try { 78 | dbc.getConn().setAutoCommit(false); 79 | PreparedStatement pst = dbc.getConn().prepareStatement(sql); 80 | ResultSet rs = pst.executeQuery(); 81 | if (rs.next()) { 82 | String id = rs.getString(1); 83 | String url = rs.getString(2); 84 | String keyword = rs.getString(3); 85 | infos = new String[2]; 86 | infos[0] = url; 87 | infos[1] = keyword; 88 | pst = dbc.getConn().prepareStatement(updateSql); 89 | pst.setInt(1, Integer.parseInt(id)); 90 | pst.executeUpdate(); 91 | System.out.println("正在处理: " + url); 92 | } 93 | 94 | dbc.getConn().commit(); 95 | rs.close(); 96 | pst.close(); 97 | } catch (SQLException e) { 98 | try { 99 | dbc.getConn().rollback(); 100 | } catch (SQLException e1) { 101 | e1.printStackTrace(); 102 | } 103 | e.printStackTrace(); 104 | } 105 | return infos; 106 | 107 | } 108 | 109 | /** 110 | * 处理指定url出现异常后,将其处理状态修改为0 111 | * @param url 112 | */ 113 | public void restoreReadyURL(String url) { 114 | String sql = "UPDATE ready_url SET state=0 WHERE url=? AND state=1"; 115 | try { 116 | dbc.getConn().setAutoCommit(false); 117 | PreparedStatement pst = dbc.getConn().prepareStatement(sql); 118 | pst = dbc.getConn().prepareStatement(sql); 119 | pst.setString(1, url); 120 | pst.executeUpdate(); 121 | dbc.getConn().commit(); 122 | pst.close(); 123 | System.out.println("正在回滚: " + url); 124 | } catch (SQLException e) { 125 | try { 126 | dbc.getConn().rollback(); 127 | } catch (SQLException e1) { 128 | e1.printStackTrace(); 129 | } 130 | e.printStackTrace(); 131 | } 132 | } 133 | 134 | /** 135 | * 向数据库中写入需要处理的url 136 | * @param url 137 | */ 138 | public void writeIntoCityURL(String url) { 139 | String sql = "INSERT INTO city_url (url,state) VALUES (?,0)"; 140 | try { 141 | dbc.getConn().setAutoCommit(false); 142 | PreparedStatement pst = dbc.getConn().prepareStatement(sql); 143 | pst.setString(1, url); 144 | pst.executeUpdate(); 145 | dbc.getConn().commit(); 146 | pst.close(); 147 | } catch (SQLException e) { 148 | try { 149 | dbc.getConn().rollback(); 150 | } catch (SQLException e1) { 151 | e1.printStackTrace(); 152 | } 153 | e.printStackTrace(); 154 | } 155 | } 156 | 157 | /** 158 | * 从数据库中读取未被处理过得url,并将其的状态值改为1,由于需要记录职位对应的关键字,故将keyword也一并取出 159 | * @return 160 | */ 161 | public String readFromCityURL() { 162 | String sql = "SELECT id,url FROM city_url WHERE state=0 LIMIT 1"; 163 | String updateSql = "UPDATE city_url SET state=1 WHERE id=?"; 164 | String url = null; 165 | try { 166 | dbc.getConn().setAutoCommit(false); 167 | PreparedStatement pst = dbc.getConn().prepareStatement(sql); 168 | ResultSet rs = pst.executeQuery(); 169 | if (rs.next()) { 170 | String id = rs.getString(1); 171 | url = rs.getString(2); 172 | pst = dbc.getConn().prepareStatement(updateSql); 173 | pst.setInt(1, Integer.parseInt(id)); 174 | pst.executeUpdate(); 175 | } 176 | 177 | dbc.getConn().commit(); 178 | rs.close(); 179 | pst.close(); 180 | } catch (SQLException e) { 181 | try { 182 | dbc.getConn().rollback(); 183 | } catch (SQLException e1) { 184 | e1.printStackTrace(); 185 | } 186 | e.printStackTrace(); 187 | } 188 | return url; 189 | 190 | } 191 | 192 | /** 193 | * 处理指定url出现异常后,将其处理状态修改为0 194 | * @param url 195 | */ 196 | public void restoreCityURL(String url) { 197 | String sql = "UPDATE city_url SET state=0 WHERE url=? 
AND state=1"; 198 | try { 199 | dbc.getConn().setAutoCommit(false); 200 | PreparedStatement pst = dbc.getConn().prepareStatement(sql); 201 | pst = dbc.getConn().prepareStatement(sql); 202 | pst.setString(1, url); 203 | pst.executeUpdate(); 204 | dbc.getConn().commit(); 205 | pst.close(); 206 | } catch (SQLException e) { 207 | try { 208 | dbc.getConn().rollback(); 209 | } catch (SQLException e1) { 210 | e1.printStackTrace(); 211 | } 212 | e.printStackTrace(); 213 | } 214 | } 215 | 216 | /** 217 | * 向数据库中插入一条职位信息记录 218 | * @param job 职位信息对象LGJob 219 | */ 220 | public void insertLGJob(LGJob job) { 221 | String sql = "INSERT INTO lagou (key_word,job,salary,city,experience,education,company,key_words) VALUES (?,?,?,?,?,?,?,?)"; 222 | try { 223 | dbc.getConn().setAutoCommit(false); 224 | PreparedStatement pst = dbc.getConn().prepareStatement(sql); 225 | pst.setString(1, job.getKeyword()); 226 | pst.setString(2, null); 227 | pst.setString(3, job.getSalary()); 228 | pst.setString(4, job.getCity()); 229 | pst.setString(5, job.getExperience()); 230 | pst.setString(6, job.getEducation()); 231 | pst.setString(7, job.getCompany()); 232 | pst.setObject(8, job.getKeywords()); 233 | 234 | pst.executeUpdate(); 235 | dbc.getConn().commit(); 236 | pst.close(); 237 | System.out.println("正在写入: " + job); 238 | } catch (SQLException e) { 239 | try { 240 | dbc.getConn().rollback(); 241 | } catch (SQLException e1) { 242 | e1.printStackTrace(); 243 | } 244 | e.printStackTrace(); 245 | } 246 | 247 | } 248 | 249 | /** 250 | * 读取数据库中的所有职位信息记录,并封装为对象列表 251 | * @return 职位信息对象列表 252 | */ 253 | public List getLGJob(String keyword) { 254 | String sql = "SELECT key_word,job,salary,city,experience,education,company,key_words FROM lagou WHERE key_word=?"; 255 | List jobs = new ArrayList(); 256 | try { 257 | PreparedStatement pst = dbc.getConn().prepareStatement(sql); 258 | pst.setString(1, keyword); 259 | ResultSet rs = pst.executeQuery(); 260 | while (rs.next()) { 261 | Blob kwBlob = rs.getBlob(8); 262 | ObjectInputStream objIn = new ObjectInputStream( 263 | kwBlob.getBinaryStream()); 264 | Map keywords = (Map) objIn 265 | .readObject(); 266 | LGJob job = new LGJob(null, rs.getString(1), null, 267 | rs.getString(3), rs.getString(4), rs.getString(5), 268 | rs.getString(6), rs.getString(7), keywords); 269 | jobs.add(job); 270 | objIn.close(); 271 | } 272 | rs.close(); 273 | pst.close(); 274 | } catch (SQLException e) { 275 | e.printStackTrace(); 276 | } catch (IOException e) { 277 | e.printStackTrace(); 278 | } catch (ClassNotFoundException e) { 279 | e.printStackTrace(); 280 | } 281 | 282 | return jobs; 283 | } 284 | 285 | /** 286 | * 读取数据库中的所有关键字图,并封装成KeyMap对象,保存到KeyMap列表中 287 | * @return KeyMap列表 288 | */ 289 | public List getKeyMap(String keyword) { 290 | String sql = "SELECT id,key_word,key_words FROM lagou WHERE key_word=?"; 291 | List kmaps = new ArrayList(); 292 | try { 293 | PreparedStatement pst = dbc.getConn().prepareStatement(sql); 294 | pst.setString(1, keyword); 295 | ResultSet rs = pst.executeQuery(); 296 | while (rs.next()) { 297 | Blob kwBlob = rs.getBlob(3); 298 | ObjectInputStream objIn = new ObjectInputStream( 299 | kwBlob.getBinaryStream()); 300 | Map keywords = (Map) objIn 301 | .readObject(); 302 | KeyMap kmap = new KeyMap(rs.getInt(1), rs.getString(2), 303 | keywords); 304 | kmaps.add(kmap); 305 | objIn.close(); 306 | } 307 | rs.close(); 308 | pst.close(); 309 | } catch (SQLException e) { 310 | e.printStackTrace(); 311 | } catch (IOException e) { 312 | e.printStackTrace(); 313 | } catch 
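/*
 * The key_words column is a BLOB holding a Java-serialized map: the writer
 * (insertLGJob) hands the HashMap to PreparedStatement.setObject, and the
 * readers rebuild it via ObjectInputStream.readObject -- which is the call
 * that can raise the ClassNotFoundException handled here, if the
 * serialized class is missing from the reading side's classpath.
 */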
(ClassNotFoundException e) { 314 | e.printStackTrace(); 315 | } 316 | 317 | return kmaps; 318 | } 319 | 320 | /** 321 | * 将分析报告输出到mysql中 322 | * @param kwMap 经过merge后生成的map 323 | * @param key_word 该map对应的关键词 324 | */ 325 | public void writeKeyMapToMysql(Map kwMap, 326 | String key_word) { 327 | String sql = "INSERT INTO key_map_export(word,value,key_word) VALUES(?,?,?)"; 328 | try { 329 | Set keyset = kwMap.keySet(); 330 | for (String key : keyset) { 331 | PreparedStatement pst = dbc.getConn().prepareStatement(sql); 332 | pst.setString(1, key); 333 | pst.setInt(2, kwMap.get(key)); 334 | pst.setString(3, key_word); 335 | pst.executeUpdate(); 336 | pst.close(); 337 | } 338 | } catch (SQLException e) { 339 | e.printStackTrace(); 340 | } 341 | } 342 | 343 | /** 344 | * 将分析报告写入到文件中 345 | * @param kwMap 346 | * @param filepath 347 | * @throws FileNotFoundException 348 | */ 349 | public static void writeToFile(Map kwMap, 350 | String filepath) { 351 | int i = 1; 352 | try { 353 | PrintWriter pw = new PrintWriter( 354 | new FileOutputStream(new File(filepath), true)); 355 | Set keyset = kwMap.keySet(); 356 | for (String key : keyset) { 357 | pw.println(key + "\t" + kwMap.get(key)); 358 | System.out.println("已处理: " + i++); 359 | } 360 | pw.flush(); 361 | pw.close(); 362 | } catch (FileNotFoundException e) { 363 | e.printStackTrace(); 364 | } 365 | } 366 | 367 | /** 368 | * 从指定文件路径中读取文件 369 | * @param filepath 文件路径 370 | * @return 以行为单位保存的列表 371 | * @throws IOException 372 | */ 373 | public List readFromFile(String filepath) throws IOException { 374 | List cities = new ArrayList(); 375 | 376 | File file = new File(filepath); 377 | FileInputStream in = new FileInputStream(file); 378 | BufferedReader reader = new BufferedReader( 379 | new InputStreamReader(in, "UTF-8")); 380 | String line = null; 381 | 382 | while ((line = reader.readLine()) != null) { 383 | cities.add(line); 384 | } 385 | reader.close(); 386 | return cities; 387 | } 388 | 389 | /** 390 | * 关闭writer 391 | */ 392 | public void closeAll() { 393 | if (pw != null) { 394 | pw.close(); 395 | pw = null; 396 | } 397 | } 398 | 399 | /** 400 | * 开启writer 401 | * @param filepath 文件路径 402 | * @throws FileNotFoundException 403 | */ 404 | public void initWriter(String filepath) throws FileNotFoundException { 405 | if (pw == null) { 406 | File file = new File(filepath); 407 | if (!file.getParentFile().exists()) { 408 | file.getParentFile().mkdirs(); 409 | } 410 | pw = new PrintWriter(new FileOutputStream(file, true)); 411 | } 412 | } 413 | 414 | } 415 | -------------------------------------------------------------------------------- /src/main/java/com/edmund/vo/Job.java: -------------------------------------------------------------------------------- 1 | package com.edmund.vo; 2 | 3 | /** 4 | * 职位信息封装类 5 | * 6 | * @author Edmund 7 | * 8 | */ 9 | public class Job { 10 | private Integer jid; 11 | private String city; 12 | private String key; 13 | private String title; 14 | private String salary; 15 | private String company; 16 | private String job; 17 | private String education; 18 | private String experience; 19 | 20 | public Job() { 21 | super(); 22 | } 23 | 24 | public Job(Integer jid, String city, String key, String title, String salary, String company, String job, 25 | String education, String experience) { 26 | super(); 27 | this.jid = jid; 28 | this.city = city; 29 | this.key = key; 30 | this.title = title; 31 | this.salary = salary; 32 | this.company = company; 33 | this.job = job; 34 | this.education = education; 35 | this.experience = 
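/*
 * Field order matters downstream: DBUtils.writeToFile emits each Job as
 * one tab-separated line in the order
 * city, key, title, salary, company, job, education, experience,
 * so a field added to this VO also needs a matching column there.
 */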
experience; 36 | } 37 | 38 | public Integer getJid() { 39 | return jid; 40 | } 41 | 42 | public void setJid(Integer jid) { 43 | this.jid = jid; 44 | } 45 | 46 | public String getCity() { 47 | return city; 48 | } 49 | 50 | public void setCity(String city) { 51 | this.city = city; 52 | } 53 | 54 | public String getKey() { 55 | return key; 56 | } 57 | 58 | public void setKey(String key) { 59 | this.key = key; 60 | } 61 | 62 | public String getTitle() { 63 | return title; 64 | } 65 | 66 | public void setTitle(String title) { 67 | this.title = title; 68 | } 69 | 70 | public String getSalary() { 71 | return salary; 72 | } 73 | 74 | public void setSalary(String salary) { 75 | this.salary = salary; 76 | } 77 | 78 | public String getCompany() { 79 | return company; 80 | } 81 | 82 | public void setCompany(String company) { 83 | this.company = company; 84 | } 85 | 86 | public String getJob() { 87 | return job; 88 | } 89 | 90 | public void setJob(String job) { 91 | this.job = job; 92 | } 93 | 94 | public String getEducation() { 95 | return education; 96 | } 97 | 98 | public void setEducation(String education) { 99 | this.education = education; 100 | } 101 | 102 | public String getExperience() { 103 | return experience; 104 | } 105 | 106 | public void setExperience(String experience) { 107 | this.experience = experience; 108 | } 109 | 110 | @Override 111 | public String toString() { 112 | return "Job [jid=" + jid + ", city=" + city + ", key=" + key + ", title=" + title + ", salary=" + salary 113 | + ", company=" + company + ", job=" + job + ", education=" + education + ", experience=" + experience 114 | + "]"; 115 | } 116 | 117 | } 118 | -------------------------------------------------------------------------------- /src/main/java/com/edmund/vo/KeyMap.java: -------------------------------------------------------------------------------- 1 | package com.edmund.vo; 2 | 3 | import java.util.Map; 4 | 5 | public class KeyMap { 6 | private int id; 7 | private String keyword; 8 | private Map keywords; 9 | 10 | public KeyMap() { 11 | super(); 12 | } 13 | 14 | public KeyMap(int id, String keyword, Map keywords) { 15 | super(); 16 | this.id = id; 17 | this.keyword = keyword; 18 | this.keywords = keywords; 19 | } 20 | 21 | public int getId() { 22 | return id; 23 | } 24 | 25 | public void setId(int id) { 26 | this.id = id; 27 | } 28 | 29 | public String getKeyword() { 30 | return keyword; 31 | } 32 | 33 | public void setKeyword(String keyword) { 34 | this.keyword = keyword; 35 | } 36 | 37 | public Map getKeywords() { 38 | return keywords; 39 | } 40 | 41 | public void setKeywords(Map keywords) { 42 | this.keywords = keywords; 43 | } 44 | 45 | @Override 46 | public String toString() { 47 | return "KeyMap [id=" + id + ", keyword=" + keyword + ", keywords=" 48 | + keywords + "]"; 49 | } 50 | 51 | } 52 | -------------------------------------------------------------------------------- /src/main/java/com/edmund/vo/LGJob.java: -------------------------------------------------------------------------------- 1 | package com.edmund.vo; 2 | 3 | import java.util.Map; 4 | 5 | public class LGJob { 6 | private Integer id; 7 | private String keyword; 8 | private String job; 9 | private String salary; 10 | private String city; 11 | private String experience; 12 | private String education; 13 | private String company; 14 | private Map keywords; 15 | 16 | public LGJob() { 17 | super(); 18 | } 19 | 20 | public LGJob(Integer id, String keyword, String job, String salary, 21 | String city, String experience, String education, String 
company, 22 | Map keywords) { 23 | super(); 24 | this.id = id; 25 | this.keyword = keyword; 26 | this.job = job; 27 | this.salary = salary; 28 | this.city = city; 29 | this.experience = experience; 30 | this.education = education; 31 | this.company = company; 32 | this.keywords = keywords; 33 | } 34 | 35 | public Integer getId() { 36 | return id; 37 | } 38 | 39 | public void setId(Integer id) { 40 | this.id = id; 41 | } 42 | 43 | public String getKeyword() { 44 | return keyword; 45 | } 46 | 47 | public void setKeyword(String keyword) { 48 | this.keyword = keyword; 49 | } 50 | 51 | public String getJob() { 52 | return job; 53 | } 54 | 55 | public void setJob(String job) { 56 | this.job = job; 57 | } 58 | 59 | public String getSalary() { 60 | return salary; 61 | } 62 | 63 | public void setSalary(String salary) { 64 | this.salary = salary; 65 | } 66 | 67 | public String getCity() { 68 | return city; 69 | } 70 | 71 | public void setCity(String city) { 72 | this.city = city; 73 | } 74 | 75 | public String getExperience() { 76 | return experience; 77 | } 78 | 79 | public void setExperience(String experience) { 80 | this.experience = experience; 81 | } 82 | 83 | public String getEducation() { 84 | return education; 85 | } 86 | 87 | public void setEducation(String education) { 88 | this.education = education; 89 | } 90 | 91 | public String getCompany() { 92 | return company; 93 | } 94 | 95 | public void setCompany(String company) { 96 | this.company = company; 97 | } 98 | 99 | public Map getKeywords() { 100 | return keywords; 101 | } 102 | 103 | public void setKeywords(Map keywords) { 104 | this.keywords = keywords; 105 | } 106 | 107 | @Override 108 | public String toString() { 109 | return "LGJob [id=" + id + ", keyword=" + keyword + ", job=" + job 110 | + ", salary=" + salary + ", city=" + city + ", experience=" 111 | + experience + ", education=" + education + ", company=" 112 | + company + ", keywords=" + keywords + "]"; 113 | } 114 | 115 | } 116 | -------------------------------------------------------------------------------- /src/main/java/com/radish/HDFSUtil/HDFSTest.java: -------------------------------------------------------------------------------- 1 | package com.radish.HDFSUtil; 2 | 3 | import org.apache.hadoop.conf.Configuration; 4 | import org.apache.hadoop.fs.FSDataOutputStream; 5 | import org.apache.hadoop.fs.FileSystem; 6 | import org.apache.hadoop.fs.Path; 7 | 8 | public class HDFSTest { 9 | 10 | public static void main(String[] args) { 11 | // TODO Auto-generated method stub 12 | Configuration configuration = new Configuration(); 13 | Path path = new Path("hdfs://192.168.199.233:9000/input/H2.txt"); 14 | try { 15 | FileSystem fs = path.getFileSystem(configuration); 16 | FSDataOutputStream os = fs.create(path); 17 | os.writeUTF("Ni Hao ~"); 18 | os.close(); 19 | } catch (Exception e) { 20 | System.out.println("catch a exception"); 21 | } 22 | } 23 | } 24 | -------------------------------------------------------------------------------- /src/main/java/com/radish/analysis/DataAnalysiser.java: -------------------------------------------------------------------------------- 1 | package com.radish.analysis; 2 | 3 | import java.sql.Connection; 4 | import java.sql.DriverManager; 5 | import java.sql.PreparedStatement; 6 | import java.sql.ResultSet; 7 | import java.util.ArrayList; 8 | import java.util.List; 9 | 10 | /** 11 | * 用于得出最终要展示的数据 12 | * @author radish 13 | * 14 | */ 15 | public class DataAnalysiser { 16 | private static Connection conn; 17 | private static String[] 
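/*
 * main() below prints one JavaScript assignment per keyword and metric,
 * e.g. (illustrative values):
 *
 *   dataMap.javadata1=dataFormatter({2018:[1093,210,57,...]});
 *
 * with one entry per province in provinceArray order; the dataMap and
 * dataFormatter names suggest these lines are pasted into the ECharts
 * pages under result/模块一_各大省会招聘概况/数据可视化-2018-4-26/view.
 */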
provinceArray = new String[] { "北京", "天津", "河北", "山西", "内蒙古", "辽宁", "吉林", "黑龙江", "上海", "江苏", 18 | "浙江", "安徽", "福建", "江西", "山东", "河南", "湖北", "湖南", "广东", "广西", "海南", "重庆", "四川", "贵州", "云南", "西藏", "陕西", "甘肃", 19 | "青海", "宁夏", "新疆" }; 20 | private static String[] keyWordArray = new String[] { "java", "C#", "linux", "python", "web", "c++", "android" }; 21 | 22 | static { 23 | try { 24 | Class.forName("com.mysql.jdbc.Driver"); 25 | String url = "jdbc:mysql://localhost:3306/crawler_db?characterEncoding=utf-8"; 26 | String username = "root"; 27 | String password = "admin"; 28 | conn = DriverManager.getConnection(url, username, password); 29 | } catch (Exception e) { 30 | // TODO Auto-generated catch block 31 | e.printStackTrace(); 32 | } 33 | } 34 | 35 | /* 36 | * 生成数据 37 | */ 38 | public static void main(String[] args) throws Exception { 39 | 40 | for (int i = 0; i < keyWordArray.length; i++) { 41 | String key = keyWordArray[i]; 42 | for (int strNum = 1; strNum <= 6; strNum++) { 43 | System.out.print("dataMap."+keyWordArray[i] + "data" + strNum + "=dataFormatter({");// javadata1-- 44 | if (strNum == 1) { 45 | System.out.print("2018:["); 46 | List countList = countJobRequestNumber(key); 47 | for (int k = 0; k < countList.size() - 1; k++) { 48 | System.out.print(countList.get(k) + ","); 49 | } 50 | System.out.print(countList.get(countList.size() - 1)); 51 | System.out.print("]});"); 52 | System.out.println(); 53 | } 54 | // 第二行数据是平均薪资 55 | if (strNum == 2) { 56 | System.out.print("2018:["); 57 | List countList = countAvgSalary(key); 58 | for (int k = 0; k < countList.size() - 1; k++) { 59 | System.out.print(countList.get(k) + ","); 60 | } 61 | System.out.print(countList.get(countList.size() - 1)); 62 | System.out.print("]});"); 63 | System.out.println(); 64 | } 65 | // 第三行数据是平均最高工资 66 | if (strNum == 3) { 67 | System.out.print("2018:["); 68 | List countList = countMaxSalary(key); 69 | for (int k = 0; k < countList.size() - 1; k++) { 70 | System.out.print(countList.get(k) + ","); 71 | } 72 | System.out.print(countList.get(countList.size() - 1)); 73 | System.out.print("]});"); 74 | System.out.println(); 75 | } 76 | // 第四行数据是平均最低工资 77 | if (strNum == 4) { 78 | System.out.print("2018:["); 79 | List countList = countMinSalary(key); 80 | for (int k = 0; k < countList.size() - 1; k++) { 81 | System.out.print(countList.get(k) + ","); 82 | } 83 | System.out.print(countList.get(countList.size() - 1)); 84 | System.out.print("]});"); 85 | System.out.println(); 86 | } 87 | // 本科以及以上员工占岗位数的百分比 88 | if (strNum == 5) { 89 | System.out.print("2018:["); 90 | List countList = countEducationOver2Percent(key); 91 | for (int k = 0; k < countList.size() - 1; k++) { 92 | System.out.print(countList.get(k) + ","); 93 | } 94 | System.out.print(countList.get(countList.size() - 1)); 95 | System.out.print("]});"); 96 | System.out.println(); 97 | } 98 | // 工作经验不限的比例 99 | if (strNum == 6) { 100 | System.out.print("2018:["); 101 | List countList = countExperienceIn0(key); 102 | for (int k = 0; k < countList.size() - 1; k++) { 103 | System.out.print(countList.get(k) + ","); 104 | } 105 | System.out.print(countList.get(countList.size() - 1)); 106 | System.out.print("]});"); 107 | System.out.println(); 108 | } 109 | } 110 | } 111 | 112 | } 113 | 114 | /** 115 | * 116 | * @param key 搜索关键字 117 | * @param city 搜索所在省市 118 | * @return 搜索数据库中各大province的指定搜索关键词key的总条数 119 | */ 120 | public static List countJobRequestNumber(String key) throws Exception { 121 | List countList = new ArrayList(); 122 | String sql = null; 123 | PreparedStatement 
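/*
 * stmt and rs are reassigned on every pass of the loop below, so every
 * pair except the last is never closed. A sketch of the same per-province
 * COUNT with try-with-resources (Java 7+), closing as it goes:
 *
 *   try (PreparedStatement ps = conn.prepareStatement(sql)) {
 *       ps.setString(1, key);
 *       ps.setString(2, province);
 *       try (ResultSet r = ps.executeQuery()) {
 *           r.next();
 *           countList.add(r.getInt(1));
 *       }
 *   }
 */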
stmt = null; 124 | ResultSet rs = null; 125 | for (String province : provinceArray) { 126 | sql = "SELECT COUNT(id) FROM job_data_result where key_word=? and province=?"; 127 | stmt = conn.prepareStatement(sql); 128 | stmt.setString(1, key); 129 | stmt.setString(2, province); 130 | rs = stmt.executeQuery(); 131 | rs.next(); 132 | int jobRequestNumber = rs.getInt(1); 133 | countList.add(jobRequestNumber); 134 | } 135 | if (rs != null) { 136 | rs.close(); 137 | } 138 | if (stmt != null) { 139 | stmt.close(); 140 | } 141 | return countList; 142 | } 143 | 144 | /** 145 | * 计算平均薪资 146 | * @param key 147 | * @return 148 | * @throws Exception 149 | */ 150 | public static List countAvgSalary(String key) throws Exception { 151 | List countList = new ArrayList(); 152 | String sql = null; 153 | PreparedStatement stmt = null; 154 | ResultSet rs = null; 155 | for (String province : provinceArray) { 156 | sql = "SELECT AVG(avg_salary) FROM job_data_result where key_word=? and province=?"; 157 | stmt = conn.prepareStatement(sql); 158 | stmt.setString(1, key); 159 | stmt.setString(2, province); 160 | rs = stmt.executeQuery(); 161 | rs.next(); 162 | double jobRequestNumber = Double.parseDouble(String.format("%.2f", rs.getDouble(1))); 163 | if (!Double.isNaN(jobRequestNumber)) { 164 | countList.add(jobRequestNumber); 165 | } else { 166 | countList.add(0.0); 167 | } 168 | } 169 | if (rs != null) { 170 | rs.close(); 171 | } 172 | if (stmt != null) { 173 | stmt.close(); 174 | } 175 | return countList; 176 | } 177 | 178 | /** 179 | * 计算平均最大工资值 180 | * @param key 181 | * @return 搜索数据库中各大province的指定搜索关键词key的平均最大工资值 182 | * @throws Exception 183 | */ 184 | public static List countMaxSalary(String key) throws Exception { 185 | List countList = new ArrayList(); 186 | String sql = null; 187 | PreparedStatement stmt = null; 188 | ResultSet rs = null; 189 | for (String province : provinceArray) { 190 | sql = "SELECT AVG(max_salary) FROM job_data_result where key_word=? and province=?"; 191 | stmt = conn.prepareStatement(sql); 192 | stmt.setString(1, key); 193 | stmt.setString(2, province); 194 | rs = stmt.executeQuery(); 195 | rs.next(); 196 | double jobRequestNumber = Double.parseDouble(String.format("%.2f", rs.getDouble(1))); 197 | if (!Double.isNaN(jobRequestNumber)) { 198 | countList.add(jobRequestNumber); 199 | } else { 200 | countList.add(0.0); 201 | } 202 | } 203 | if (rs != null) { 204 | rs.close(); 205 | } 206 | if (stmt != null) { 207 | stmt.close(); 208 | } 209 | return countList; 210 | } 211 | 212 | /** 213 | * 计算平均最小工资值 214 | * @param key 215 | * @return 216 | * @throws Exception 217 | */ 218 | public static List countMinSalary(String key) throws Exception { 219 | List countList = new ArrayList(); 220 | String sql = null; 221 | PreparedStatement stmt = null; 222 | ResultSet rs = null; 223 | for (String province : provinceArray) { 224 | sql = "SELECT AVG(min_salary) FROM job_data_result where key_word=? 
and province=?"; 225 | stmt = conn.prepareStatement(sql); 226 | stmt.setString(1, key); 227 | stmt.setString(2, province); 228 | rs = stmt.executeQuery(); 229 | rs.next(); 230 | double jobRequestNumber = Double.parseDouble(String.format("%.2f", rs.getDouble(1))); 231 | if (!Double.isNaN(jobRequestNumber)) { 232 | countList.add(jobRequestNumber); 233 | } else { 234 | countList.add(0.0); 235 | } 236 | } 237 | if (rs != null) { 238 | rs.close(); 239 | } 240 | if (stmt != null) { 241 | stmt.close(); 242 | } 243 | return countList; 244 | } 245 | 246 | /** 247 | * 计算本科以及以上员工占岗位数的百分比 248 | * @param key 249 | * @return 250 | * @throws Exception 251 | */ 252 | public static List countEducationOver2Percent(String key) throws Exception { 253 | List countList = new ArrayList(); 254 | String sql = null; 255 | PreparedStatement stmt = null; 256 | ResultSet rs = null; 257 | for (String province : provinceArray) { 258 | // 先查询员工总数总数 259 | sql = "SELECT COUNT(id) FROM job_data_result WHERE key_word=? and province=?"; 260 | stmt = conn.prepareStatement(sql); 261 | stmt.setString(1, key); 262 | stmt.setString(2, province); 263 | rs = stmt.executeQuery(); 264 | rs.next(); 265 | int staffCount = rs.getInt(1); 266 | // 再查询本科以上的个数 267 | sql = "SELECT COUNT(id) FROM job_data_result WHERE key_word=? AND province=? AND " + "min_education>1 "; 268 | stmt = conn.prepareStatement(sql); 269 | stmt.setString(1, key); 270 | stmt.setString(2, province); 271 | rs = stmt.executeQuery(); 272 | rs.next(); 273 | int staffOver2 = rs.getInt(1); 274 | double persent = (staffOver2 * 1.0) / staffCount; 275 | double jobRequestNumber = Double.parseDouble(String.format("%.2f", persent)); 276 | if (!Double.isNaN(jobRequestNumber)) { 277 | countList.add(jobRequestNumber); 278 | } else { 279 | countList.add(0.0); 280 | } 281 | } 282 | if (rs != null) { 283 | rs.close(); 284 | } 285 | if (stmt != null) { 286 | stmt.close(); 287 | } 288 | return countList; 289 | } 290 | 291 | /** 292 | * 计算工作经验不限的比例 293 | * @param key 294 | * @return 295 | * @throws Exception 296 | */ 297 | public static List countExperienceIn0(String key) throws Exception { 298 | List countList = new ArrayList(); 299 | String sql = null; 300 | PreparedStatement stmt = null; 301 | ResultSet rs = null; 302 | for (String province : provinceArray) { 303 | // 先查询员工总数总数 304 | sql = "SELECT COUNT(id) FROM job_data_result WHERE key_word=? AND province=?"; 305 | stmt = conn.prepareStatement(sql); 306 | stmt.setString(1, key); 307 | stmt.setString(2, province); 308 | rs = stmt.executeQuery(); 309 | rs.next(); 310 | int staffCount = rs.getInt(1); 311 | // 再查询本科以上的个数 312 | sql = "SELECT COUNT(id) FROM job_data_result WHERE key_word=? AND province=? 
AND " + "min_experience=1 "; 313 | stmt = conn.prepareStatement(sql); 314 | stmt.setString(1, key); 315 | stmt.setString(2, province); 316 | rs = stmt.executeQuery(); 317 | rs.next(); 318 | int staffSelected = rs.getInt(1); 319 | double persent = (staffSelected * 1.0) / staffCount; 320 | double jobRequestNumber = Double.parseDouble(String.format("%.2f", persent)); 321 | if (!Double.isNaN(jobRequestNumber)) { 322 | countList.add(jobRequestNumber); 323 | } else { 324 | countList.add(0.0); 325 | } 326 | } 327 | if (rs != null) { 328 | rs.close(); 329 | } 330 | if (stmt != null) { 331 | stmt.close(); 332 | } 333 | return countList; 334 | } 335 | } 336 | -------------------------------------------------------------------------------- /src/main/java/com/radish/crawler/BOSSCrawlerManager.java: -------------------------------------------------------------------------------- 1 | package com.radish.crawler; 2 | 3 | import java.io.BufferedReader; 4 | import java.io.File; 5 | import java.io.FileInputStream; 6 | import java.io.InputStreamReader; 7 | import java.util.ArrayList; 8 | import java.util.List; 9 | import java.util.StringTokenizer; 10 | 11 | import org.openqa.selenium.By; 12 | import org.openqa.selenium.WebElement; 13 | import org.openqa.selenium.chrome.ChromeDriver; 14 | import org.openqa.selenium.support.ui.ExpectedConditions; 15 | import org.openqa.selenium.support.ui.WebDriverWait; 16 | 17 | import com.radish.vo.BOSSUrlVO; 18 | 19 | /** 20 | * 单例模式实现管理爬取队列 21 | * @author admin 22 | * 23 | */ 24 | public class BOSSCrawlerManager { 25 | // 初始化爬取队列,每个VO中的url都是可直接访问的 26 | private List urlList = new ArrayList(); 27 | // 要爬取的关键词 java python web linux 28 | private String[] keys = new String[] { "java", "python", "web", "linux" }; 29 | // 爬虫间争抢runningCrawler的同步对象 30 | private Object obj = new Object(); 31 | // 所有爬虫爬完后,通过obj通知main线程 32 | public Object mainThread = new Object(); 33 | // 当前仍然在工作的爬虫数 34 | private Integer runningCrawler = 0; 35 | 36 | private static BOSSCrawlerManager instance = new BOSSCrawlerManager(); 37 | 38 | // 不可构造,单例模式 39 | private BOSSCrawlerManager() { 40 | 41 | } 42 | 43 | public static BOSSCrawlerManager getInstance() { 44 | return instance; 45 | } 46 | 47 | /** 48 | * 根据文件绝对路径初始化待爬取队列 49 | * 初始化成功则返回true,否则false 50 | * @return 51 | */ 52 | public boolean init(String filePath) { 53 | boolean flag = false; 54 | try { 55 | // 如果文件不存在 56 | if (!new File(filePath).exists()) { 57 | return false; 58 | } 59 | BufferedReader reader = new BufferedReader( 60 | new InputStreamReader(new FileInputStream(new File(filePath)), "UTF-8")); 61 | // 逐行读 62 | String line = null; 63 | while ((line = reader.readLine()) != null) { 64 | StringTokenizer tokens = new StringTokenizer(line); 65 | String province = null; 66 | String city = null; 67 | String url = null; 68 | if (tokens.hasMoreTokens()) { 69 | province = tokens.nextToken(); 70 | } 71 | if (tokens.hasMoreTokens()) { 72 | city = tokens.nextToken(); 73 | } 74 | if (tokens.hasMoreTokens()) { 75 | url = tokens.nextToken(); 76 | } 77 | // 根据关键词数组进行初始化 78 | for (int i = 0; i < keys.length; i++) { 79 | urlList.add(new BOSSUrlVO(province, city, url, keys[i])); 80 | } 81 | } 82 | 83 | reader.close(); 84 | flag=true; 85 | } catch (Exception e) { 86 | // 如果出异常,返回false 87 | return false; 88 | } 89 | // 如果没返回true,则返回false 90 | return flag; 91 | } 92 | 93 | /** 94 | * 设置爬虫领取任务的同步方法 95 | * 如果任务没了,就返回null 96 | * @return 97 | */ 98 | public synchronized BOSSUrlVO getVO() { 99 | // 如果待爬取队列空了 100 | if (urlList.size() == 0) { 101 | return null; 102 | } 103 | 
BOSSUrlVO vo = urlList.get(0); 104 | urlList.remove(0); 105 | return vo; 106 | } 107 | 108 | public void buildWorker() { 109 | new WorkerThread().start(); 110 | } 111 | 112 | /** 113 | * Worker crawler thread 114 | * @author admin 115 | * 116 | */ 117 | class WorkerThread extends Thread { 118 | private BOSSUrlVO vo; 119 | private ChromeDriver driver; 120 | 121 | // Constructor 122 | public WorkerThread() { 123 | // Initialize the browser driver 124 | driver = new ChromeDriver(); 125 | } 126 | 127 | /** 128 | * Core loop: 129 | * keeps taking a vo from the pending queue and crawling that vo's url 130 | */ 131 | @Override 132 | public void run() { 133 | // Lock on obj, not on runningCrawler: the boxed Integer is replaced by every ++/--, so it cannot serve as a monitor 134 | synchronized (obj) { 135 | runningCrawler++; 136 | } 137 | // Loop until the queue is drained 138 | while (true) { 139 | synchronized (obj) { 140 | vo = getVO(); 141 | // Queue is empty 142 | if (vo == null) { 143 | // One fewer working crawler 144 | runningCrawler--; 145 | // If this is the last crawler to finish, wake the main thread 146 | synchronized (mainThread) { 147 | if (runningCrawler == 0) { 148 | System.out.println("last crawler going to sleep"); 149 | System.out.println("about to wake the main thread"); 150 | mainThread.notify(); 151 | } 152 | } 153 | // Exit run 154 | return; 155 | } 156 | } 157 | // A task was assigned 158 | try { 159 | work(); 160 | } catch (Exception e) { 161 | // One crawl iteration failed 162 | System.out.println(Thread.currentThread().getName()+":-------------crawler iteration failed---------------"); 163 | } 164 | } 165 | } 166 | 167 | /** 168 | * The thread's work method: 169 | * crawls the job listings behind the vo's url 170 | */ 171 | private void work(){ 172 | 173 | try { 174 | String province = vo.getProvince(); 175 | String city = vo.getCity(); 176 | String url = vo.getUrl(); 177 | System.out.println("url:" + url); 178 | WebDriverWait wait = new WebDriverWait(driver, 8); 179 | // Open the page 180 | driver.get(url); 181 | while (true) { 182 | // Wait for the page to load 183 | //wait.until(ExpectedConditions.presenceOfElementLocated(By.id("footer"))); 184 | wait.until(ExpectedConditions.presenceOfElementLocated(By.cssSelector("#wrap"))); 185 | // Scrape the page: 186 | // first grab every div.job-list div.job-primary 187 | List<WebElement> divList = driver.findElementsByCssSelector("div.job-list div.job-primary"); 188 | for (WebElement jobDiv : divList) { 189 | // Extract title salary city experience education company 190 | // Title 191 | String title = jobDiv.findElement(By.cssSelector("div.job-title")).getText(); 192 | // Salary 193 | String salary = jobDiv.findElement(By.cssSelector("span.red")).getText(); 194 | // Company 195 | String company = jobDiv.findElement(By.cssSelector("div.company-text h3")).getText(); 196 | // Experience and education 197 | String text = jobDiv.findElement(By.cssSelector("div.info-primary p")).getText(); 198 | String experience = text.substring(text.indexOf(" ")); 199 | String education = text.substring(text.length()-2); 200 | // Print one record as a smoke test 201 | System.out.printf("title:%s\t%s\t%s\t%s\t%s\t%s%n", title, city, salary, company, experience, 202 | education); 203 | } 204 | WebElement nextElement = null; 205 | // Click the next page if there is one, otherwise stop 206 | if((nextElement=driver.findElement(By.cssSelector("div.page a.next")))!=null){ 207 | if(nextElement.getAttribute("class").contains("disabled")){ 208 | return; 209 | }else{ 210 | nextElement.click(); 211 | } 212 | }else{// No next-page link: done.
213 | return; 214 | } 215 | } 216 | } catch(Exception e){ 217 | System.out.println("url:"+vo.getUrl()+" error: page was not fully crawled"); 218 | try { 219 | Thread.sleep(10*1000); 220 | } catch (InterruptedException e1) { 221 | System.out.println("sleep failed"); 222 | } 223 | System.out.println("enter the captcha as soon as possible"); 224 | } 225 | } 226 | } 227 | 228 | } 229 | -------------------------------------------------------------------------------- /src/main/java/com/radish/crawler/BOSSProvinceCrawler.java: -------------------------------------------------------------------------------- 1 | package com.radish.crawler; 2 | 3 | import java.io.File; 4 | import java.io.FileOutputStream; 5 | import java.io.FileWriter; 6 | import java.io.OutputStreamWriter; 7 | import java.io.PrintWriter; 8 | import java.util.List; 9 | 10 | import org.jsoup.Jsoup; 11 | import org.jsoup.nodes.Document; 12 | import org.jsoup.nodes.Element; 13 | import org.jsoup.select.Elements; 14 | import org.openqa.selenium.By; 15 | import org.openqa.selenium.WebElement; 16 | import org.openqa.selenium.chrome.ChromeDriver; 17 | import org.openqa.selenium.support.ui.ExpectedConditions; 18 | import org.openqa.selenium.support.ui.WebDriverWait; 19 | 20 | /** 21 | * Crawls https://www.zhipin.com/job_detail/?query=java&scity=101090100&industry=&position=100101 22 | * for BOSS Zhipin's province/city codes. 23 | * The query keyword is left as the placeholder #. 24 | * Output format: 25 | * province city url 26 | * @author admin 27 | * 28 | */ 29 | public class BOSSProvinceCrawler { 30 | 31 | public static void main(String[] args) throws Exception { 32 | // 33 | // work(); 34 | // 35 | jsoupWork(); 36 | } 37 | 38 | /** 39 | * Opens the target page with Selenium and prints the province/city codes; the jsoup variant below saves them to /result-sources/radish/BossUrl.txt 40 | */ 41 | public static void work() throws Exception { 42 | String url = "https://www.zhipin.com/job_detail/?query=java&scity=101090100&industry=&position=100101"; 43 | // Path to the Chrome driver binary 44 | System.setProperty("webdriver.chrome.driver", "D:/chrome_driver/chromedriver.exe"); 45 | ChromeDriver driver = new ChromeDriver(); 46 | 47 | WebDriverWait wait = new WebDriverWait(driver, 5); 48 | // Open the page 49 | driver.get(url); 50 | 51 | // Wait for the page to load 52 | wait.until(ExpectedConditions.presenceOfElementLocated(By.id("wrap"))); 53 | // System.out.println(bodyText); 54 | // Click the city dropdown 55 | driver.findElement(By.cssSelector("span.label-text")).click(); 56 | // Grab the province entries shown in the city-box 57 | List<WebElement> provinceList = driver.findElements(By.cssSelector("div.city-box ul.dorpdown-province li")); 58 | // Drop the leading "hot" row 59 | provinceList.remove(0); 60 | // Grab the city <ul> matching each province 61 | List<WebElement> cityList = driver.findElements(By.cssSelector("div.dorpdown-city ul")); 62 | // Drop the leading "hot" <ul> 63 | cityList.remove(0); 64 | // Index of the city <ul> for the current province; it must live outside the loop 65 | int i = 0; 66 | for (WebElement provinceEL : provinceList) { 67 | // Province name 68 | String provinceName = provinceEL.getText(); 69 | // Walk the city <ul> for this province 70 | WebElement ulEL = cityList.get(i); 71 | List<WebElement> liList = ulEL.findElements(By.tagName("li")); 72 | for (WebElement li : liList) { 73 | System.out.println(provinceName + "\t" + li.getText() + "\t" + li.getAttribute("data-val")); 74 | } 75 | i++; 76 | } 77 | 78 | } 79 | 80 | public static void jsoupWork() { 81 | try { 82 | // String url = 83 | // "https://www.zhipin.com/job_detail/?query=java&scity=101090100&industry=&position=100101"; 84 | // String 85 | // userAgent="Opera11.11–WindowsUser-Agent:Opera/9.80(WindowsNT6.1;U;en)Presto/2.8.131Version/11.11"; 86 | String url = "https://www.zhipin.com/job_detail/?query=java&scity=101090100&industry=&position=100101"; 87 | // Path to the Chrome driver binary 88 | System.setProperty("webdriver.chrome.driver", "D:/chrome_driver/chromedriver.exe"); 89 |
ChromeDriver driver = new ChromeDriver(); 90 | WebDriverWait wait = new WebDriverWait(driver, 5); 91 | // 打开网页 92 | driver.get(url); 93 | // 等待网页加载完毕 94 | wait.until(ExpectedConditions.presenceOfElementLocated(By.id("footer"))); 95 | 96 | Document document = Jsoup.parse(driver.getPageSource()); 97 | // 找到省 98 | Elements provinceList = document.select("div.city-box ul.dorpdown-province li"); 99 | // 去掉热门 100 | provinceList.remove(0); 101 | // 市ul列表 102 | Elements cityULList = document.select("div.dorpdown-city ul"); 103 | // 去掉热门 104 | cityULList.remove(0); 105 | StringBuilder builder = new StringBuilder(); 106 | for (int i = 0; i < provinceList.size(); i++) { 107 | // 得到省名称 108 | String provinceName = provinceList.get(i).text(); 109 | // 找到城市ul并得到其中的li 110 | Elements cityList = cityULList.get(i).select("li"); 111 | // 遍历li 112 | for (Element cityLi : cityList) { 113 | // 写入到/result-sources/BoosUrl.txt 114 | //provinceName + "\t" + cityLi.text() + "\t" + cityLi.attr("data-val") 115 | // url=https://www.zhipin.com/job_detail/?query=java&scity=101281900 116 | String line = provinceName + "\t" + cityLi.text() + "\t" 117 | +"https://www.zhipin.com/job_detail/?query=#&scity=" 118 | + cityLi.attr("data-val")+"\r\n"; 119 | System.out.println(">>"+line); 120 | builder.append(line); 121 | } 122 | } 123 | // 结果写入到文件 124 | File result = new File("D:/eclipse_2/git/Job_Analysis/result-sources/radish/BossUrl.txt"); 125 | result.createNewFile(); 126 | PrintWriter writer = new PrintWriter(new OutputStreamWriter(new FileOutputStream(result),"UTF-8")); 127 | writer.print(builder.toString()); 128 | writer.flush(); 129 | writer.close(); 130 | } catch (Exception e) { 131 | // TODO Auto-generated catch block 132 | e.printStackTrace(); 133 | } 134 | } 135 | } 136 | -------------------------------------------------------------------------------- /src/main/java/com/radish/crawler/Test.java: -------------------------------------------------------------------------------- 1 | package com.radish.crawler; 2 | 3 | public class Test { 4 | public static void main(String[] args) { 5 | System.setProperty("webdriver.chrome.driver", "D:/chrome_driver/chromedriver.exe"); 6 | // 获得单例句柄 7 | BOSSCrawlerManager instance = BOSSCrawlerManager.getInstance(); 8 | // 初始化单例对象 9 | if(!instance.init("C:/Users/admin/Desktop/BossUrl.txt")){ 10 | System.out.println("init 失败!程序退出"); 11 | System.exit(0); 12 | }; 13 | System.out.println("urlList初始化完毕!--------------------"); 14 | instance.buildWorker(); 15 | //instance.buildWorker(); 16 | //instance.buildWorker(); 17 | System.out.println("3只爬虫启动成功"); 18 | 19 | // 启动3个线程后,自己睡眠等待最后一只休眠的爬虫唤醒自己 20 | Object mainThread = instance.mainThread; 21 | synchronized (mainThread) { 22 | try { 23 | System.out.println("main线程睡眠"); 24 | mainThread.wait(); 25 | System.out.println("main线程被唤醒"); 26 | } catch (InterruptedException e) { 27 | e.printStackTrace(); 28 | } 29 | } 30 | // 被唤醒后 main方法负责 31 | } 32 | } 33 | -------------------------------------------------------------------------------- /src/main/java/com/radish/crawler/distributed/DistributedCrawler.java: -------------------------------------------------------------------------------- 1 | package com.radish.crawler.distributed; 2 | 3 | import java.sql.Connection; 4 | import java.sql.DriverManager; 5 | import java.sql.SQLException; 6 | 7 | import org.openqa.selenium.chrome.ChromeDriver; 8 | 9 | /** 10 | * 实现分布式爬虫 11 | * 12 | * 任务: 13 | * 1.从数据库表:url_list获取一条url, 14 | * 15 | * 2.将爬到的数据存入到表lagou 16 | * 爬虫类,每次领取一个任务,并将任务的状态值置为1 17 | * 
任务完成后,将结果存入lagou表,并将url_list中相对应任务的状态值置为2 18 | * @author admin 19 | * 20 | */ 21 | public class DistributedCrawler { 22 | private Connection conn; 23 | private ChromeDriver driver; 24 | 25 | // 空构造,初始化conn 26 | public DistributedCrawler() { 27 | 28 | System.setProperty("webdriver.chrome.driver", "D:/chrome_driver/chromedriver.exe"); 29 | driver = new ChromeDriver(); 30 | try { 31 | Class.forName("com.mysql.jdbc.Driver"); 32 | // 注意,IP可以填写分布式数据库所在的主机 33 | conn = DriverManager.getConnection("jdbc:mysql://localhost:3306/crawler_db?characterEncoding=utf-8", "root", 34 | "admin"); 35 | } catch (Exception e) { 36 | System.out.println("数据库连接初始化失败"); 37 | } 38 | } 39 | 40 | // 关闭内置的数据库连接 41 | public void closeConnection() throws Exception { 42 | conn.close(); 43 | } 44 | 45 | public Connection getConn() { 46 | return conn; 47 | } 48 | 49 | public void setConn(Connection conn) { 50 | this.conn = conn; 51 | } 52 | 53 | public ChromeDriver getDriver() { 54 | return driver; 55 | } 56 | 57 | public void setDriver(ChromeDriver driver) { 58 | this.driver = driver; 59 | } 60 | } 61 | -------------------------------------------------------------------------------- /src/main/java/com/radish/crawler/distributed/Test.java: -------------------------------------------------------------------------------- 1 | package com.radish.crawler.distributed; 2 | 3 | import java.sql.Connection; 4 | import java.sql.PreparedStatement; 5 | import java.sql.ResultSet; 6 | import java.sql.SQLException; 7 | import java.util.List; 8 | 9 | import org.openqa.selenium.By; 10 | import org.openqa.selenium.WebElement; 11 | import org.openqa.selenium.chrome.ChromeDriver; 12 | import org.openqa.selenium.support.ui.ExpectedConditions; 13 | import org.openqa.selenium.support.ui.WebDriverWait; 14 | 15 | import com.radish.vo.BOSSUrlVO; 16 | import com.radish.vo.JobDataVO; 17 | 18 | public class Test { 19 | private static Connection conn; 20 | private static ChromeDriver driver; 21 | private static Integer count = 1; 22 | 23 | static { 24 | DistributedCrawler crawler = new DistributedCrawler(); 25 | System.out.println("新建 crawler对象"); 26 | System.out.println("设置 system.property"); 27 | driver = crawler.getDriver(); 28 | conn = crawler.getConn(); 29 | try { 30 | conn.setAutoCommit(false); 31 | } catch (SQLException e) { 32 | System.out.println("设置不自动提交失败"); 33 | } 34 | System.out.println("初始化块执行完毕"); 35 | } 36 | 37 | public static void main(String[] args) { 38 | try { 39 | // 事务处理串行化 40 | conn.setAutoCommit(false); 41 | } catch (SQLException e2) { 42 | System.out.println("设置自动提交失败"); 43 | e2.printStackTrace(); 44 | } 45 | 46 | while (true) { 47 | try { 48 | // 查询一个任务的SELECT语句 49 | String sql = "SELECT id,province,city,url,key_word,status " + "FROM url_list " 50 | + "WHERE status=0 order by id limit 1 "; 51 | PreparedStatement stmt = conn.prepareStatement(sql); 52 | ResultSet rs = stmt.executeQuery(); 53 | //System.out.println("select 执行"); 54 | //System.out.println(rs.wasNull()); 55 | //System.out.println("next():" + rs.next()); 56 | // 查到东西 57 | if (rs.next()) { 58 | System.out.println("查到了一个对象"); 59 | // 取出到vo对象 60 | BOSSUrlVO vo = null; 61 | 62 | int id = rs.getInt(1); 63 | String province = rs.getString(2); 64 | String city = rs.getString(3); 65 | String url = rs.getString(4); 66 | System.out.println("url----" + url); 67 | String keyWord = rs.getString(5); 68 | int status = rs.getInt(6); 69 | vo = new BOSSUrlVO(id, province, city, url, keyWord, status); 70 | 71 | // 就修改status = 1 72 | sql = "UPDATE url_list set status =1 where id=" + 
vo.getId(); 73 | stmt = conn.prepareStatement(sql); 74 | stmt.executeUpdate(); 75 | System.out.println("update 执行"); 76 | conn.commit(); 77 | System.out.println("查询+update status事务提交完毕"); 78 | rs.close(); 79 | stmt.close(); 80 | 81 | // 此刻已经拿到vo 82 | driver.get(vo.getUrl()); 83 | Thread.sleep(1500); 84 | work(driver, vo); 85 | } else {// 如果没查到,程序退出 86 | rs.close(); 87 | stmt.close(); 88 | System.out.println("数据库中没任务了"); 89 | conn.close(); 90 | System.exit(0); 91 | } 92 | } catch (Exception e) { 93 | System.out.println("事务处理失败 rollback"); 94 | try { 95 | conn.rollback(); 96 | continue; 97 | } catch (Exception e1) { 98 | System.out.println("rollback 失败"); 99 | continue; 100 | } 101 | } 102 | } // while 103 | 104 | }// main 105 | 106 | public static void work(ChromeDriver driver, BOSSUrlVO urlVo) { 107 | WebDriverWait wait = new WebDriverWait(driver, 12); 108 | try { 109 | s: while (true) { 110 | // 等待加载 111 | // wait.until(ExpectedConditions.presenceOfElementLocated(By.id("footer"))); 112 | wait.until(ExpectedConditions.presenceOfElementLocated(By.cssSelector("#wrap"))); 113 | // 爬取内容 114 | // 先爬取所有的div.job-list div.job-primary 115 | List divList = driver.findElementsByCssSelector("div.job-list div.job-primary"); 116 | for (WebElement jobDiv : divList) { 117 | // 得到title salary city experience education company 118 | // 标题 119 | String title = jobDiv.findElement(By.cssSelector("div.job-title")).getText(); 120 | 121 | String requestUrl = jobDiv.findElement(By.cssSelector("h3.name a")).getAttribute("href"); 122 | // 收入 123 | String salary = jobDiv.findElement(By.cssSelector("span.red")).getText(); 124 | // 企业 125 | String company = jobDiv.findElement(By.cssSelector("div.company-text h3")).getText(); 126 | // 工作经验 学历 127 | String text = jobDiv.findElement(By.cssSelector("div.info-primary p")).getText(); 128 | String city = text.substring(0, text.indexOf(" ")); 129 | // 如果li的城市和搜索城市不相符,则下个url 130 | if (!city.equals(urlVo.getCity())) 131 | break s; 132 | 133 | String experience = text.substring(text.indexOf(" ")); 134 | String education = text.substring(text.length() - 2); 135 | JobDataVO jobData = new JobDataVO(urlVo.getId(), urlVo.getCity(), urlVo.getKey(), title, company, 136 | null, salary, experience, education, null, null, requestUrl); 137 | insertData(jobData); 138 | } 139 | WebElement nextElement = null; 140 | // 如果有下一页,则点击下一页,否则 141 | if ((nextElement = driver.findElement(By.cssSelector("div.page a.next"))) != null) { 142 | if (nextElement.getAttribute("class").contains("disabled")) { 143 | return; 144 | } else { 145 | nextElement.click(); 146 | Thread.sleep(1000); 147 | } 148 | } else {// 如果没找到就结束了. 
149 | return; 150 | } 151 | } 152 | } catch (Exception e) { 153 | // 如果这个while出错,比如被屏蔽需要输入验证码 154 | 155 | System.out.println("while 循环出错,可能需要输入验证码"); 156 | try { 157 | Thread.sleep(20 * 1000); 158 | } catch (InterruptedException e1) { 159 | System.out.println("sleep 失败"); 160 | } 161 | } 162 | }// work() 163 | 164 | public static void insertData(JobDataVO dataVO) { 165 | try { 166 | String sql = "INSERT INTO job_data(id,city,key_word,title,company,salary,experience,education,job_request_url) " 167 | + "VALUES(?,?,?,?,?,?,?,?,?)"; 168 | PreparedStatement stmt = conn.prepareStatement(sql); 169 | stmt.setInt(1, dataVO.getId()); 170 | stmt.setString(2, dataVO.getCity()); 171 | stmt.setString(3, dataVO.getKeyWord()); 172 | stmt.setString(4, dataVO.getTitle()); 173 | stmt.setString(5, dataVO.getCompany()); 174 | stmt.setString(6, dataVO.getSalary()); 175 | stmt.setString(7, dataVO.getExperience()); 176 | stmt.setString(8, dataVO.getEducation()); 177 | stmt.setString(9, dataVO.getJobRequestUrl()); 178 | stmt.executeUpdate(); 179 | conn.commit(); 180 | stmt.close(); 181 | // System.out.println("插入一条数据,总插入数据数为:" + count++); 182 | } catch (Exception e) { 183 | try { 184 | conn.rollback(); 185 | } catch (SQLException e1) { 186 | System.out.println("rollback 失败"); 187 | } 188 | System.out.println("插入结果数据失败"); 189 | } 190 | } 191 | } 192 | -------------------------------------------------------------------------------- /src/main/java/com/radish/dataclean/DataCleaner.java: -------------------------------------------------------------------------------- 1 | package com.radish.dataclean; 2 | 3 | import java.sql.Connection; 4 | import java.sql.DriverManager; 5 | import java.sql.PreparedStatement; 6 | import java.sql.ResultSet; 7 | import java.sql.SQLException; 8 | 9 | import com.edmund.test.Test; 10 | /** 11 | * 数据清洗的相关类 12 | * @author radish 13 | * 14 | */ 15 | public class DataCleaner { 16 | private Connection conn; 17 | 18 | /** 19 | * 构造器初始化数据库的连接 20 | */ 21 | public DataCleaner() throws Exception { 22 | Class.forName("com.mysql.jdbc.Driver"); 23 | String url = "jdbc:mysql://localhost:3306/crawler_db?characterEncoding=utf-8"; 24 | String username = "root"; 25 | String password = "admin"; 26 | conn = DriverManager.getConnection(url, username, password); 27 | } 28 | 29 | /** 30 | * 31 | * 根据url_list,为lagou表每一条记录根据city填上province 32 | * @throws Exception 33 | */ 34 | public void insertProvince() throws Exception { 35 | String sql = null; 36 | PreparedStatement stmt = null; 37 | ResultSet rs = null; 38 | while (true) { 39 | try { 40 | sql = "SELECT id,city FROM lagou WHERE province IS NULL LIMIT 1"; 41 | stmt = conn.prepareStatement(sql); 42 | rs = stmt.executeQuery(); 43 | // 如果查询到数据 44 | if (rs.next()) { 45 | int id = rs.getInt(1); 46 | String city = rs.getString(2); 47 | // 用city去job_data表查询province 48 | sql = "SELECT province FROM url_list WHERE city=? LIMIT 1"; 49 | stmt = conn.prepareStatement(sql); 50 | stmt.setString(1, city); 51 | rs = stmt.executeQuery(); 52 | // 如果有查到province 53 | if (rs.next()) { 54 | String province = rs.getString(1); 55 | sql = "UPDATE lagou SET province=? WHERE city = ?"; 56 | stmt = conn.prepareStatement(sql); 57 | stmt.setString(1, province); 58 | stmt.setString(2, city); 59 | stmt.executeUpdate(); 60 | } else { 61 | String province = "null"; 62 | sql = "UPDATE lagou SET province=? 
WHERE id = ?"; 63 | stmt = conn.prepareStatement(sql); 64 | stmt.setString(1, province); 65 | stmt.setInt(2, id); 66 | stmt.executeUpdate(); 67 | } 68 | } else { 69 | System.out.println("处理完毕"); 70 | if (stmt != null) { 71 | stmt.close(); 72 | } 73 | if (rs != null) { 74 | rs.close(); 75 | } 76 | System.exit(0); 77 | } 78 | } catch (Exception e) { 79 | if (stmt != null) { 80 | stmt.close(); 81 | } 82 | if (rs != null) { 83 | rs.close(); 84 | } 85 | e.printStackTrace(); 86 | System.exit(0); 87 | } 88 | } 89 | } 90 | 91 | /** 92 | * 根据url_list,为job_data表每一条记录根据city填上province 93 | * 94 | * @throws Exception 95 | */ 96 | public void insertProvinceToBoss() throws Exception { 97 | String sql = null; 98 | PreparedStatement stmt = null; 99 | ResultSet rs = null; 100 | while (true) { 101 | try { 102 | sql = "SELECT id,city FROM job_data WHERE province IS NULL LIMIT 1"; 103 | stmt = conn.prepareStatement(sql); 104 | rs = stmt.executeQuery(); 105 | // 如果查询到数据 106 | if (rs.next()) { 107 | int id = rs.getInt(1); 108 | String city = rs.getString(2); 109 | // 用city去job_data表查询province 110 | sql = "SELECT province FROM url_list WHERE city=? LIMIT 1"; 111 | stmt = conn.prepareStatement(sql); 112 | stmt.setString(1, city); 113 | rs = stmt.executeQuery(); 114 | // 如果有查到province 115 | if (rs.next()) { 116 | String province = rs.getString(1); 117 | sql = "UPDATE job_data SET province=? WHERE city = ?"; 118 | stmt = conn.prepareStatement(sql); 119 | stmt.setString(1, province); 120 | stmt.setString(2, city); 121 | stmt.executeUpdate(); 122 | } else { 123 | String province = "null"; 124 | sql = "UPDATE job_data SET province=? WHERE id = ?"; 125 | stmt = conn.prepareStatement(sql); 126 | stmt.setString(1, province); 127 | stmt.setInt(2, id); 128 | stmt.executeUpdate(); 129 | } 130 | } else { 131 | System.out.println("处理完毕"); 132 | if (stmt != null) { 133 | stmt.close(); 134 | } 135 | if (rs != null) { 136 | rs.close(); 137 | } 138 | System.exit(0); 139 | } 140 | } catch (Exception e) { 141 | if (stmt != null) { 142 | stmt.close(); 143 | } 144 | if (rs != null) { 145 | rs.close(); 146 | } 147 | e.printStackTrace(); 148 | System.exit(0); 149 | } 150 | } 151 | } 152 | 153 | /** 154 | * 将拉勾网的所有数据清洗后,填写到表job_data_result 155 | * 156 | */ 157 | public void moveLagouDataToResult() throws Exception { 158 | 159 | } 160 | 161 | /** 162 | * 把BOSS直聘的数据清洗后插入到表job_data_result 163 | * 164 | */ 165 | public void moveBossDataToResult() throws Exception { 166 | int count=0; 167 | String sql = null; 168 | PreparedStatement stmt = null; 169 | ResultSet rs = null; 170 | 171 | sql = "SELECT province,"// 1 172 | + "city,"// 2 173 | + "key_word,"// 3 174 | + "company,"// 4 175 | + "salary,"// 5 176 | + "experience,"// 6 177 | + "education" // 7 178 | + " FROM job_data"; 179 | stmt = conn.prepareStatement(sql); 180 | rs = stmt.executeQuery(); 181 | /* 182 | * 测试rs的长度,输出结果为46353,全部可以提取 rs.last(); System.out.println(rs.getRow()); 183 | */ 184 | // 查到46353条记录后 185 | // 插入到结果集的data_from 1代表BOSS直聘的数据 186 | int dataFrom = 1; 187 | String province = null; 188 | String city = null; 189 | String keyWord = null; 190 | String companyOrTeam = null; 191 | double minSalary = 0.0; 192 | double maxSalary = 0.0; 193 | double avgSalary = 0.0; 194 | int minExperience=0; 195 | int minEducation=0; 196 | // key_words_map留空 197 | while (rs.next()) { 198 | try { 199 | // 提取province-----> 200 | province = rs.getString(1); 201 | // 提取city 202 | city = rs.getString(2); 203 | // 提取关键词 204 | keyWord = rs.getString(3); 205 | // 提取公司_组织名 206 | companyOrTeam = 
rs.getString(4); 207 | /* 208 | * 提取salary,并做处理 209 | */ 210 | String salaryStr = rs.getString(5); 211 | try { 212 | String[] salaryArray = salaryStr.trim().split("-"); 213 | String minSalaryStr = salaryArray[0]; 214 | minSalary = Double.parseDouble(minSalaryStr.substring(0, minSalaryStr.indexOf("k"))); 215 | String maxSalaryStr = salaryArray[1]; 216 | maxSalary = Double.parseDouble(maxSalaryStr.substring(0, maxSalaryStr.indexOf("k"))); 217 | minSalary=minSalary*1000; 218 | maxSalary=maxSalary*1000; 219 | avgSalary = (minSalary + maxSalary) / 2; 220 | } catch (Exception e) { 221 | // 如果salary处理失败或者报错,任何薪水项=0 222 | maxSalary = 0; 223 | minSalary = 0; 224 | avgSalary = 0; 225 | } 226 | /* 227 | * 提取工作经验:experience 处理后--->min_experience 处理失败的都置为0 228 | */ 229 | String experienceStr = rs.getString(6); 230 | try { 231 | minExperience = Integer.parseInt(String.valueOf(experienceStr.trim().charAt(0))); 232 | } catch (Exception e) { 233 | // 如果最小工作经验转化失败,则置为-1 234 | minExperience = 0; 235 | } 236 | /* 237 | * 提取学历,处理出min_education 238 | * 如果处理失败,则置为-1 239 | * 最低学历: 240 | */ 241 | String educationStr=rs.getString(7); 242 | try { 243 | if(educationStr.contains("专")){ 244 | minEducation=1; 245 | }else if(educationStr.contains("本")){ 246 | minEducation=2; 247 | }else if(educationStr.contains("硕")){ 248 | minEducation=3; 249 | }else if(educationStr.contains("博")){ 250 | minEducation=4; 251 | }else { 252 | minEducation=0; 253 | } 254 | } catch (Exception e) { 255 | // 最小学历转化失败,则置为-1 256 | minEducation = -1; 257 | } 258 | } catch (Exception e) { 259 | System.out.println("getString失败"); 260 | e.printStackTrace(); 261 | } 262 | sql="INSERT INTO job_data_result(data_from,province,city,key_word" 263 | + ",company_or_team,min_salary,max_salary,avg_salary" 264 | + ",min_experience,min_education) " 265 | + "VALUES(?,?,?,?" 266 | + ",?,?,?,?" 267 | + ",?,?" 
268 | + ")"; 269 | stmt=conn.prepareStatement(sql); 270 | stmt.setInt(1, dataFrom); 271 | stmt.setString(2, province); 272 | stmt.setString(3, city); 273 | stmt.setString(4, keyWord); 274 | stmt.setString(5, companyOrTeam); 275 | stmt.setDouble(6, minSalary); 276 | stmt.setDouble(7, maxSalary); 277 | stmt.setDouble(8, avgSalary); 278 | stmt.setInt(9, minExperience); 279 | stmt.setInt(10, minEducation); 280 | stmt.executeUpdate(); 281 | count++; 282 | } 283 | System.out.println("插入条数:"+count); 284 | } 285 | 286 | public static void main(String[] args) { 287 | try { 288 | DataCleaner cleaner = new DataCleaner(); 289 | cleaner.moveBossDataToResult(); 290 | // cleaner.moveBossDataToResult(); 291 | // cleaner.moveLagouDataToResult(); 292 | // String string = " 3-5年本科"; 293 | // string = string.trim(); 294 | // System.out.println(string.charAt(0)); 295 | //test(); 296 | } catch (Exception e) { 297 | e.printStackTrace(); 298 | } 299 | } 300 | 301 | public static void test() { 302 | String string = "10k-20k"; 303 | string = string.trim(); 304 | for (String string2 : string.split("-")) { 305 | System.out.println(string2); 306 | } 307 | String experienceStr = ""; 308 | 309 | } 310 | } 311 | -------------------------------------------------------------------------------- /src/main/java/com/radish/util/MyUtil.java: -------------------------------------------------------------------------------- 1 | package com.radish.util; 2 | 3 | public class MyUtil { 4 | 5 | public static void main(String[] args) { 6 | // TODO Auto-generated method stub 7 | 8 | } 9 | 10 | } 11 | -------------------------------------------------------------------------------- /src/main/java/com/radish/util/UrlListIniter.java: -------------------------------------------------------------------------------- 1 | package com.radish.util; 2 | 3 | import java.io.BufferedReader; 4 | import java.io.File; 5 | import java.io.FileInputStream; 6 | import java.io.InputStreamReader; 7 | import java.sql.Connection; 8 | import java.sql.DriverManager; 9 | import java.sql.PreparedStatement; 10 | import java.util.ArrayList; 11 | import java.util.List; 12 | import java.util.StringTokenizer; 13 | 14 | import com.radish.vo.BOSSUrlVO; 15 | /** 16 | * 初始化表url_list 17 | * 表结构: 18 | * `id` int(11) NOT NULL AUTO_INCREMENT, 任务标号 19 | `province` varchar(50) NOT NULL, 20 | `city` varchar(100) NOT NULL, 21 | `url` varchar(500) NOT NULL, 22 | `key_word` varchar(50) NOT NULL, 23 | status int not null 0代表爬虫可领取,1代表爬虫已领取,2代表爬虫领取任务后成功提交 24 | * @author admin 25 | * 26 | */ 27 | public class UrlListIniter { 28 | // 初始化爬取队列,每个VO中的url都是可直接访问的 29 | private static List urlList = new ArrayList(); 30 | // 要爬取的关键词 java python web linux C# 31 | private static String[] keys = new String[] { "java", "python", "web", "linux","C%23" }; 32 | // 数据库连接 33 | private static Connection conn; 34 | private static String sql=""; 35 | static{ 36 | try { 37 | Class.forName("com.mysql.jdbc.Driver"); 38 | String username="root"; 39 | String password="admin"; 40 | String url="jdbc:mysql://localhost:3306/crawler_db?characterEncoding=utf8"; 41 | conn=DriverManager.getConnection(url, username, password); 42 | } catch (Exception e) { 43 | System.out.println("静态初始化块出错"); 44 | } 45 | } 46 | public static void main(String[] args) throws Exception { 47 | String filePath = "C:/Users/admin/Desktop/BossUrl.txt"; 48 | BufferedReader reader = new BufferedReader( 49 | new InputStreamReader(new FileInputStream(new File(filePath)), "UTF-8")); 50 | // 逐行读 51 | String line = null; 52 | while ((line = 
reader.readLine()) != null) { 53 | StringTokenizer tokens = new StringTokenizer(line); 54 | String province = null; 55 | String city = null; 56 | String url = null; 57 | if (tokens.hasMoreTokens()) { 58 | province = tokens.nextToken(); 59 | } 60 | if (tokens.hasMoreTokens()) { 61 | city = tokens.nextToken(); 62 | } 63 | if (tokens.hasMoreTokens()) { 64 | url = tokens.nextToken(); 65 | } 66 | // 根据关键词数组进行初始化 67 | for (int i = 0; i < keys.length; i++) { 68 | urlList.add(new BOSSUrlVO(province, city, url, keys[i])); 69 | } 70 | } 71 | reader.close(); 72 | // 如果list初始化成功 73 | if (urlList.size() != 0) { 74 | for (BOSSUrlVO vo : urlList) { 75 | insertVO(vo); 76 | } 77 | } else { 78 | System.out.println("list 初始化失败 程序退出"); 79 | System.exit(0); 80 | } 81 | conn.close(); 82 | }// main 83 | /** 84 | * 向数据库插入一条数据 85 | * @param vo 86 | */ 87 | public static void insertVO(BOSSUrlVO vo){ 88 | try { 89 | //conn.setTransactionIsolation(conn.TRANSACTION_SERIALIZABLE); 90 | //conn.setAutoCommit(false); 91 | sql="INSERT INTO url_list(province,city,url,key_word,status) VALUES(?,?,?,?,0)"; 92 | PreparedStatement stmt = conn.prepareStatement(sql); 93 | stmt.setString(1, vo.getProvince()); 94 | stmt.setString(2, vo.getCity()); 95 | stmt.setString(3, vo.getUrl()); 96 | stmt.setString(4, vo.getKey()); 97 | stmt.executeUpdate(); 98 | } catch (Exception e) { 99 | System.out.println("insertVO error!"); 100 | e.printStackTrace(); 101 | } 102 | 103 | } 104 | } 105 | -------------------------------------------------------------------------------- /src/main/java/com/radish/vo/BOSSUrlVO.java: -------------------------------------------------------------------------------- 1 | package com.radish.vo; 2 | 3 | /** 4 | * 对应 5 | * 北京 北京 https://www.zhipin.com/job_detail/?query=#&scity=101010100 6 | * @author admin 7 | * 8 | */ 9 | public class BOSSUrlVO { 10 | private Integer id; 11 | private String province; 12 | private String city; 13 | private String url; 14 | private String key; 15 | private Integer status; 16 | 17 | public BOSSUrlVO() { 18 | super(); 19 | } 20 | 21 | public BOSSUrlVO(Integer id, String province, String city, String url, String key, Integer status) { 22 | this.id = id; 23 | this.province = province; 24 | this.city = city; 25 | this.key = key; 26 | this.url = url.replace("#", this.key); 27 | this.status = status; 28 | } 29 | public BOSSUrlVO(String province, String city, String url, String key) { 30 | this.province = province; 31 | this.city = city; 32 | this.key = key; 33 | this.url = url.replace("#", this.key); 34 | } 35 | public Integer getId() { 36 | return id; 37 | } 38 | 39 | public void setId(Integer id) { 40 | this.id = id; 41 | } 42 | 43 | public String getProvince() { 44 | return province; 45 | } 46 | 47 | public void setProvince(String province) { 48 | this.province = province; 49 | } 50 | 51 | public String getCity() { 52 | return city; 53 | } 54 | 55 | public Integer getStatus() { 56 | return status; 57 | } 58 | 59 | public void setStatus(Integer status) { 60 | this.status = status; 61 | } 62 | 63 | public void setCity(String city) { 64 | this.city = city; 65 | } 66 | 67 | public String getUrl() { 68 | return url; 69 | } 70 | 71 | public void setUrl(String url) { 72 | this.url = url; 73 | } 74 | 75 | @Override 76 | public int hashCode() { 77 | final int prime = 31; 78 | int result = 1; 79 | result = prime * result + ((city == null) ? 0 : city.hashCode()); 80 | result = prime * result + ((province == null) ? 0 : province.hashCode()); 81 | result = prime * result + ((url == null) ? 
0 : url.hashCode()); 82 | return result; 83 | } 84 | 85 | @Override 86 | public boolean equals(Object obj) { 87 | if (this == obj) 88 | return true; 89 | if (obj == null) 90 | return false; 91 | if (getClass() != obj.getClass()) 92 | return false; 93 | BOSSUrlVO other = (BOSSUrlVO) obj; 94 | if (city == null) { 95 | if (other.city != null) 96 | return false; 97 | } else if (!city.equals(other.city)) 98 | return false; 99 | if (province == null) { 100 | if (other.province != null) 101 | return false; 102 | } else if (!province.equals(other.province)) 103 | return false; 104 | if (url == null) { 105 | if (other.url != null) 106 | return false; 107 | } else if (!url.equals(other.url)) 108 | return false; 109 | return true; 110 | } 111 | 112 | public String getKey() { 113 | return key; 114 | } 115 | 116 | public void setKey(String key) { 117 | this.key = key; 118 | } 119 | 120 | @Override 121 | public String toString() { 122 | return "BOSSUrlVO [province=" + province + ", city=" + city + ", url=" + url + "]"; 123 | } 124 | 125 | } 126 | -------------------------------------------------------------------------------- /src/main/java/com/radish/vo/JobDataVO.java: -------------------------------------------------------------------------------- 1 | package com.radish.vo; 2 | 3 | import java.util.Map; 4 | 5 | public class JobDataVO { 6 | // 任务ID 7 | private Integer id; 8 | // 城市 9 | private String city; 10 | // 关键词 11 | private String keyWord; 12 | // 标题 13 | private String title; 14 | // 公司名称 15 | private String company; 16 | // 职位名称 17 | private String job; 18 | // 薪水字符串 19 | private String salary; 20 | // 工作经验 21 | private String experience; 22 | // 学历 23 | private String education; 24 | // 职位要求 25 | private String jobRequestMessage; 26 | // 打开后可以获取招聘要求的url地址 27 | private String jobRequestUrl; 28 | // 关键词map 29 | private Map keyMap; 30 | 31 | // 空构造 32 | public JobDataVO() { 33 | super(); 34 | } 35 | 36 | public JobDataVO(Integer id, String city, String keyWord, String title, String company, String job, String salary, 37 | String experience, String education, String jobRequestMessage, Map keyMap,String jobRequestUrl) { 38 | this.id = id; 39 | this.city = city; 40 | this.keyWord = keyWord; 41 | this.title = title; 42 | this.company = company; 43 | this.job = job; 44 | this.salary = salary; 45 | this.experience = experience; 46 | this.education = education; 47 | this.jobRequestMessage = jobRequestMessage; 48 | this.keyMap = keyMap; 49 | this.jobRequestUrl=jobRequestUrl; 50 | } 51 | 52 | /* 53 | * 以下为get和set 54 | */ 55 | public Integer getId() { 56 | return id; 57 | } 58 | public void setId(Integer id) { 59 | this.id = id; 60 | } 61 | public String getCity() { 62 | return city; 63 | } 64 | public void setCity(String city) { 65 | this.city = city; 66 | } 67 | public String getKeyWord() { 68 | return keyWord; 69 | } 70 | public void setKeyWord(String keyWord) { 71 | this.keyWord = keyWord; 72 | } 73 | public String getTitle() { 74 | return title; 75 | } 76 | public void setTitle(String title) { 77 | this.title = title; 78 | } 79 | public String getCompany() { 80 | return company; 81 | } 82 | public void setCompany(String company) { 83 | this.company = company; 84 | } 85 | public String getJob() { 86 | return job; 87 | } 88 | public void setJob(String job) { 89 | this.job = job; 90 | } 91 | public String getSalary() { 92 | return salary; 93 | } 94 | public void setSalary(String salary) { 95 | this.salary = salary; 96 | } 97 | public String getExperience() { 98 | return experience; 99 | } 100 | public 
void setExperience(String experience) { 101 | this.experience = experience; 102 | } 103 | public String getEducation() { 104 | return education; 105 | } 106 | public void setEducation(String education) { 107 | this.education = education; 108 | } 109 | 110 | public String getJobRequestMessage() { 111 | return jobRequestMessage; 112 | } 113 | 114 | public void setJobRequestMessage(String jobRequestMessage) { 115 | this.jobRequestMessage = jobRequestMessage; 116 | } 117 | 118 | public String getJobRequestUrl() { 119 | return jobRequestUrl; 120 | } 121 | 122 | public void setJobRequestUrl(String jobRequestUrl) { 123 | this.jobRequestUrl = jobRequestUrl; 124 | } 125 | 126 | public Map getKeyMap() { 127 | return keyMap; 128 | } 129 | public void setKeyMap(Map keyMap) { 130 | this.keyMap = keyMap; 131 | } 132 | 133 | @Override 134 | public String toString() { 135 | return "JobDataVO [id=" + id + ", city=" + city + ", keyWord=" + keyWord + ", title=" + title + ", company=" 136 | + company + ", job=" + job + ", salary=" + salary + ", experience=" + experience + ", education=" 137 | + education + ", jobRequestMessage=" + jobRequestMessage + ", jobRequestUrl=" + jobRequestUrl 138 | + ", keyMap=" + keyMap + "]"; 139 | } 140 | 141 | 142 | 143 | 144 | } 145 | -------------------------------------------------------------------------------- /src/main/java/com/random/crawler/BOSSRequestMessageCrawler.java: -------------------------------------------------------------------------------- 1 | package com.random.crawler; 2 | 3 | import java.io.File; 4 | import java.io.FileInputStream; 5 | import java.io.IOException; 6 | import java.sql.Connection; 7 | import java.sql.DriverManager; 8 | import java.sql.PreparedStatement; 9 | import java.sql.ResultSet; 10 | import java.util.List; 11 | import java.util.Properties; 12 | 13 | import org.openqa.selenium.chrome.ChromeDriver; 14 | 15 | /** 16 | * 将job_message.txt文件 文件格式key_word job_request_url 17 | * 中的url取出,根据url爬取数据存入到数据库表job_message中 18 | * 19 | * @author admin 20 | * 21 | */ 22 | public class BOSSRequestMessageCrawler { 23 | private List fileList; 24 | private Connection conn; 25 | private static String localdriver = null; // 本地浏览器驱动位置 26 | private ChromeDriver driver; 27 | 28 | /** 29 | * 读取配置文件 30 | */ 31 | static { 32 | Properties property = new Properties(); 33 | try { 34 | property.load(new FileInputStream("./src/main/java/com/random/properties")); 35 | } catch (IOException e) { 36 | e.printStackTrace(); 37 | } 38 | localdriver = property.getProperty("LocalChromedriver"); 39 | } 40 | 41 | /* 42 | * 有参构造方法,得到一个fileList 43 | */ 44 | public BOSSRequestMessageCrawler(List fileList) { 45 | 46 | try { 47 | Class.forName("org.gjt.mm.mysql.Driver"); 48 | conn = DriverManager.getConnection("jdbc:mysql://localhost:3306/crawler_db?characterEncoding=utf-8", "root", 49 | "root"); 50 | } catch (Exception e) { 51 | System.out.println("数据库连接失败"); 52 | } 53 | this.fileList = fileList; 54 | } 55 | 56 | /* 57 | * 将job_message.txt文件中的key url写入job_message中,并将status的值置为0 58 | */ 59 | public void addAllLineToMySQL() { 60 | 61 | } 62 | 63 | public void crawlerMessage() { 64 | String sql = "SELECT url FROM job_message WHERE status=0 ORDER BY id LIMIT 1"; 65 | try { 66 | PreparedStatement stmt = conn.prepareStatement(sql); 67 | ResultSet rs = stmt.executeQuery(); 68 | // 如果rs有没有 69 | if (rs.wasNull()) { 70 | return; 71 | } else {// 如果有,指针下移 72 | rs.next(); 73 | String url = rs.getString(1); 74 | 75 | } 76 | } catch (Exception e) { 77 | 78 | } 79 | 80 | } 81 | } 82 | 
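One caveat on crawlerMessage() above: JDBC's ResultSet.wasNull() only reports whether the most recently read column was SQL NULL; it says nothing about whether the query returned any rows, so it cannot serve as an empty-result check. The standard idiom is rs.next(), which both tests for a row and advances the cursor. Below is a minimal sketch of how the claim step could look, assuming this class's conn field and the job_message columns used above; the method name fetchNextPendingUrl is invented for illustration:

    // Sketch only: claim one pending url, or return null when no status=0 rows remain.
    public String fetchNextPendingUrl() throws Exception {
        String sql = "SELECT url FROM job_message WHERE status=0 ORDER BY id LIMIT 1";
        PreparedStatement stmt = conn.prepareStatement(sql);
        ResultSet rs = stmt.executeQuery();
        try {
            if (!rs.next()) {
                return null; // queue drained
            }
            return rs.getString(1);
        } finally {
            rs.close();
            stmt.close();
        }
    }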
-------------------------------------------------------------------------------- /src/main/java/com/random/crawler/TaskManager.java: -------------------------------------------------------------------------------- 1 | package com.random.crawler; 2 | 3 | import java.io.BufferedReader; 4 | import java.io.File; 5 | import java.io.FileInputStream; 6 | import java.io.InputStreamReader; 7 | import java.io.ObjectInputStream; 8 | import java.sql.Blob; 9 | import java.sql.Connection; 10 | import java.sql.DriverManager; 11 | import java.sql.PreparedStatement; 12 | import java.sql.ResultSet; 13 | import java.sql.SQLException; 14 | import java.util.ArrayList; 15 | import java.util.HashMap; 16 | import java.util.List; 17 | import java.util.Map; 18 | import java.util.Set; 19 | 20 | import org.openqa.selenium.By; 21 | import org.openqa.selenium.chrome.ChromeDriver; 22 | 23 | import jeasy.analysis.MMAnalyzer; 24 | 25 | public class TaskManager { 26 | private Connection conn; 27 | private ChromeDriver driver; 28 | 29 | public TaskManager() { 30 | try { 31 | Class.forName("com.mysql.jdbc.Driver"); 32 | conn = DriverManager.getConnection("jdbc:mysql://localhost:3306/crawler_db?characterEncoding=utf-8", "root", 33 | "root"); 34 | 35 | } catch (Exception e) { 36 | System.out.println("数据库连接失败"); 37 | } 38 | System.setProperty("webdriver.chrome.driver", "D:/chrome_driver/chromedriver.exe"); 39 | driver = new ChromeDriver(); 40 | System.out.println("构造方法执行完毕"); 41 | } 42 | 43 | /** 44 | * 45 | * 将job_message.txt文件中的key url写入job_message中,并将status的值置为0 46 | * 47 | * @param fileList 48 | */ 49 | public void initData(List fileList) { 50 | for (File file : fileList) { 51 | try { 52 | BufferedReader reader = new BufferedReader(new InputStreamReader(new FileInputStream(file), "UTF-8")); 53 | String line = null; 54 | String sql = null; 55 | while ((line = reader.readLine()) != null) { 56 | String key = line.split("\t")[0]; 57 | String url = line.split("\t")[1]; 58 | sql = "INSERT INTO job_message(key_word,url,status) VALUES(?,?,0)"; 59 | PreparedStatement stmt = conn.prepareStatement(sql); 60 | stmt.setString(1, key); 61 | stmt.setString(2, url); 62 | stmt.executeUpdate(); 63 | stmt.close(); 64 | } 65 | reader.close(); 66 | } catch (Exception e) { 67 | System.out.println("文件读取失败"); 68 | } 69 | } 70 | } 71 | 72 | /** 73 | * 爬虫获取数据表中的url 爬取 message 取出url,status置为1 74 | * 75 | */ 76 | public void startCrawler() { 77 | try {// 设置事务不自动提交 78 | conn.setAutoCommit(false); 79 | System.out.println("设置自动提交为false"); 80 | } catch (SQLException e1) { 81 | System.out.println("设置自动提交为false,失败!!!!!!!-----------------"); 82 | } 83 | s: while (true) { 84 | String sql = "SELECT url_id,url FROM job_message WHERE status=0 ORDER BY url_id LIMIT 1"; 85 | try { 86 | PreparedStatement stmt = conn.prepareStatement(sql); 87 | ResultSet rs = stmt.executeQuery(); 88 | // 如果rs有没有 89 | if (rs.wasNull()) { 90 | conn.commit(); 91 | return; 92 | } else {// 如果有,指针下移 93 | // System.out.println("查到一条记录"); 94 | rs.next(); 95 | int urlId = rs.getInt(1); 96 | String url = rs.getString(2); 97 | // System.out.println("url_id:" + urlId + "url:" + url); 98 | sql = "UPDATE job_message SET status = 1 WHERE url_id = ?"; 99 | stmt = conn.prepareStatement(sql); 100 | stmt.setInt(1, urlId); 101 | stmt.executeUpdate(); 102 | conn.commit(); 103 | // url ---> message 此处是getBossMessage() 拉钩的方法为 104 | String message = null; 105 | // int reTry = 0; 106 | // 如果得message失败,就重试,直到重试2次 107 | // while ((message = getBossMessage(url)) == null && reTry <2) { 108 | // reTry++; 109 | // 
} 110 | // 如果重试2次还=null,放弃这个url 111 | if ((message = getBossMessage(url)) == null) { 112 | System.out.println("死活得不到这个message,不要了 urlID:" + urlId); 113 | continue s; 114 | } 115 | try { 116 | sql = "UPDATE job_message SET message=? WHERE url_id=?"; 117 | stmt = conn.prepareStatement(sql); 118 | stmt.setString(1, message); 119 | stmt.setInt(2, urlId); 120 | stmt.executeUpdate(); 121 | conn.commit(); 122 | } catch (Exception e) { 123 | System.out.println("一个状态值被置为1的记录,并没有写入message!!!! urlID:" + urlId); 124 | } 125 | } 126 | } catch (Exception e) { 127 | e.printStackTrace(); 128 | try { 129 | conn.rollback(); 130 | } catch (SQLException e1) { 131 | e1.printStackTrace(); 132 | } 133 | continue s; 134 | } 135 | } 136 | } 137 | 138 | /* 139 | * 通过url,动态爬取Boss直聘 message 140 | */ 141 | private String getBossMessage(String url) { 142 | String message = null; 143 | try { 144 | driver.get(url); 145 | Thread.sleep(2000); 146 | message = driver.findElement(By.cssSelector("div.detail-content .job-sec")).getText(); 147 | } catch (Exception e) { 148 | try { 149 | Thread.sleep(20 * 1000); 150 | } catch (InterruptedException e1) { 151 | e1.printStackTrace(); 152 | } 153 | System.out.println("根据url获取message失败,可能是需要输入验证码"); 154 | } 155 | return message; 156 | } 157 | 158 | /** 159 | * 方法的出口,条件是没有status=1 而且有message了 160 | */ 161 | public void pickMapFromMessage() { 162 | s: while (true) { 163 | try { 164 | conn.setAutoCommit(false); 165 | } catch (SQLException e1) { 166 | System.out.println("设置自动提交false失败"); 167 | } 168 | Map message_map = new HashMap(); 169 | String sql = "SELECT url_id,message FROM job_message WHERE status=1 ORDER BY url_id LIMIT 1"; 170 | try { 171 | PreparedStatement stmt = conn.prepareStatement(sql); 172 | ResultSet rs = stmt.executeQuery(); 173 | conn.commit(); 174 | if (rs.wasNull()) { 175 | // 方法的出口,条件是没有status=1的记录 176 | return; 177 | } else { 178 | rs.next(); 179 | int urlId = rs.getInt(1); 180 | // 如果查到了记录,就把相对应的记录的status设置为2 181 | try { 182 | sql = "update job_message set status=2 where url_id=" + urlId; 183 | stmt = conn.prepareStatement(sql); 184 | stmt.executeUpdate(); 185 | conn.commit(); 186 | } catch (Exception e1) { 187 | System.out.println("设置message处理完毕的status=2失败,url_id:" + urlId); 188 | try { 189 | conn.rollback(); 190 | } catch (Exception e) { 191 | e.printStackTrace(); 192 | } 193 | 194 | } 195 | String message = rs.getString(2); 196 | MMAnalyzer mm = new MMAnalyzer(); 197 | String[] keys = mm.segment(message, "|").split("\\|"); 198 | for (String key : keys) { 199 | if (key.matches("[a-zA-Z/#\\\\]+")) { 200 | // 如果符合英文,但是已有,则value+1 201 | if (message_map.containsKey(key)) { 202 | message_map.put(key, message_map.get(key) + 1); 203 | } else {// 如果不包含 204 | message_map.put(key, 1); 205 | } 206 | } 207 | } 208 | 209 | try { 210 | sql = "UPDATE job_message SET message_map = ? 
WHERE url_id=?"; 211 | stmt = conn.prepareStatement(sql); 212 | stmt.setObject(1, message_map); 213 | stmt.setInt(2, urlId); 214 | stmt.executeUpdate(); 215 | conn.commit(); 216 | } catch (Exception e) { 217 | conn.rollback(); 218 | System.out.println("message_map写入失败,url_id:" + urlId); 219 | } 220 | } 221 | } catch (Exception e) { 222 | try { 223 | conn.rollback(); 224 | } catch (SQLException e1) { 225 | e1.printStackTrace(); 226 | } 227 | e.printStackTrace(); 228 | continue s; 229 | } 230 | } 231 | } 232 | 233 | public Map readMap() { 234 | 235 | String sql = "SELECT message_map FROM job_message WHERE message IS NOT NULL ORDER BY url_id LIMIT 1"; 236 | try { 237 | PreparedStatement stmt = conn.prepareStatement(sql); 238 | ResultSet rs = stmt.executeQuery(); 239 | // System.out.println("rs.size" + rs.wasNull()); 240 | rs.next(); 241 | Blob message_map = rs.getBlob(1); 242 | ObjectInputStream objIs = new ObjectInputStream(message_map.getBinaryStream()); 243 | Map map = (Map) objIs.readObject(); 244 | // Set keySet = map.keySet(); 245 | // for (String key : keySet) { 246 | // System.out.println("key:" + key + " value:" + map.get(key)); 247 | // } 248 | 249 | // 一定要考虑map为空的情况 250 | 251 | } catch (Exception e) { 252 | e.printStackTrace(); 253 | } 254 | return null; 255 | } 256 | 257 | public List> combineMaps() { 258 | List> mapList = null; 259 | while (true) { 260 | String sql = "SELECT key_words FROM key_map WHERE key_word='java' ORDER BY id LIMIT 1"; 261 | try { 262 | PreparedStatement stmt = conn.prepareStatement(sql); 263 | ResultSet rs = stmt.executeQuery(); 264 | if (rs.wasNull()) { 265 | System.out.println("mapList完成"); 266 | return mapList; 267 | } 268 | rs.next(); 269 | Blob keyWords = rs.getBlob(1); 270 | ObjectInputStream objIs = new ObjectInputStream(keyWords.getBinaryStream()); 271 | Map map = (Map) objIs.readObject(); 272 | mapList = new ArrayList>(); 273 | mapList.add(map); 274 | } catch (Exception e) { 275 | e.printStackTrace(); 276 | } 277 | } 278 | } 279 | 280 | public Map combine(List> mapList) { 281 | Map keymap = new HashMap(); 282 | for (Map map : mapList) { 283 | if (map == null) { 284 | continue; 285 | } 286 | Set keySet = map.keySet(); 287 | for (String key : keySet) { 288 | if (keymap.containsKey(key)) { 289 | keymap.put(key, keymap.get(key) + 1); 290 | } else { 291 | keymap.put(key, 1); 292 | } 293 | } 294 | } 295 | return keymap; 296 | } 297 | } 298 | -------------------------------------------------------------------------------- /src/main/java/com/random/crawler/Test.java: -------------------------------------------------------------------------------- 1 | package com.random.crawler; 2 | 3 | public class Test { 4 | 5 | public static void main(String[] args) { 6 | 7 | // 创建核心类 8 | TaskManager manager = new TaskManager(); 9 | // File file = new File("C:/Users/admin/Desktop/job_data.txt"); 10 | // 初始化文件队列 11 | // List fileList = new ArrayList(); 12 | // fileList.add(file); 13 | // manager初始化数据库 14 | // manager.initData(fileList); 15 | // manager启动爬虫去根据url 爬取 message 16 | // manager.startCrawler(); 17 | /* 18 | * 爬虫根据url填写完message以后 根据massage填写一个message对应的关键字map 注意: 19 | * key是message中出现过的英文单词 value是key在message中出现过的次数 20 | * 将map存入到数据库的message_map(Blob)域中 21 | */ 22 | // manager.pickMapFromMessage(); 23 | // manager.readMap(); 24 | /* 25 | * 根据所有key---map 数据表job_message中的每一条记录代表一个就业要求对应的map能力集 整合各个key的所有map 1 26 | * key:n map ----->1key:1map 得出5条: key map 27 | */ 28 | System.out.println(manager.combine(manager.combineMaps())); 29 | 30 | } 31 | 32 | } 33 | 
--------------------------------------------------------------------------------
/src/main/java/com/random/properties:
--------------------------------------------------------------------------------
1 | LocalChromedriver=D:/chrome_driver/chromedriver.exe
2 | LocalExportPath=C:/Users/admin/Desktop/jobs
--------------------------------------------------------------------------------
/src/main/java/com/random/test/Demo.java:
--------------------------------------------------------------------------------
1 | package com.random.test;
2 | 
3 | import java.io.File;
4 | 
5 | import org.apache.commons.io.FileUtils;
6 | import org.openqa.selenium.By;
7 | import org.openqa.selenium.OutputType;
8 | import org.openqa.selenium.WebElement;
9 | import org.openqa.selenium.chrome.ChromeDriver;
10 | 
11 | public class Demo {
12 | 
13 | public static void main(String[] args) {
14 | // Point Selenium at the local chromedriver binary
15 | System.setProperty("webdriver.chrome.driver", "D:/chrome_driver/chromedriver.exe");
16 | ChromeDriver driver = new ChromeDriver();
17 | driver.get(
18 | "https://www.zhipin.com/job_detail/691ad23916cba6891n1409y4FFo~.html?ka=search_list_11_blank&lid=7-PfVqs0wE8c.search");
19 | // Extract the job description block, same selector as the crawler uses
20 | WebElement element = driver.findElement(By.cssSelector("div.detail-content .job-sec"));
21 | System.out.println(element.getText());
22 | try {
23 | // The screenshot is best-effort; failures are only logged
24 | File screenShot = driver.getScreenshotAs(OutputType.FILE);
25 | FileUtils.copyFile(screenShot, new File("D:/" + System.currentTimeMillis() + ".jpg"));
26 | } catch (Exception e) {
27 | e.printStackTrace();
28 | }
29 | }
30 | 
31 | }
32 | 
--------------------------------------------------------------------------------
/src/main/java/com/wcy/test/Test.java:
--------------------------------------------------------------------------------
1 | package com.wcy.test;
2 | 
3 | public class Test {
4 | 
5 | public static void main(String[] args) {
6 | // Empty placeholder; nothing implemented yet
7 | 
8 | }
9 | 
10 | }
11 | 
--------------------------------------------------------------------------------
/src/main/java/log4j.properties:
--------------------------------------------------------------------------------
1 | ### set log levels ###
2 | log4j.rootLogger = INFO , console , debug , error
3 | 
4 | ### console ###
5 | log4j.appender.console = org.apache.log4j.ConsoleAppender
6 | log4j.appender.console.Target = System.out
7 | log4j.appender.console.layout = org.apache.log4j.PatternLayout
8 | log4j.appender.console.layout.ConversionPattern = %-d{yyyy-MM-dd HH\:mm\:ss} [%p]-[%c] %m%n
9 | 
10 | ### log file ###
11 | log4j.appender.debug = org.apache.log4j.DailyRollingFileAppender
12 | log4j.appender.debug.File = ../logs/springmvc-demo.log
13 | log4j.appender.debug.Append = true
14 | log4j.appender.debug.Threshold = INFO
15 | log4j.appender.debug.layout = org.apache.log4j.PatternLayout
16 | log4j.appender.debug.layout.ConversionPattern = %-d{yyyy-MM-dd HH\:mm\:ss} [%p]-[%c] %m%n
17 | 
18 | ### exception ###
19 | log4j.appender.error = org.apache.log4j.DailyRollingFileAppender
20 | log4j.appender.error.File = ../logs/springmvc-demo_error.log
21 | log4j.appender.error.Append = true
22 | log4j.appender.error.Threshold = ERROR
23 | log4j.appender.error.layout = org.apache.log4j.PatternLayout
24 | log4j.appender.error.layout.ConversionPattern = %-d{yyyy-MM-dd HH\:mm\:ss} [%p]-[%c] %m%n
25 | 
26 | 
27 | ### The stdout appender must be declared first; only then can the druid sql loggers below emit output, otherwise log4j throws "error.key not found"
28 | log4j.appender.stdout=org.apache.log4j.ConsoleAppender
29 | log4j.appender.stdout.Target=System.out
30 | log4j.appender.stdout.layout=org.apache.log4j.PatternLayout
31 | log4j.appender.stdout.layout.ConversionPattern=%d{ISO8601} %l %c%n%p: %m%n
32 | 
33 | ### druid sql ###
34 | log4j.logger.druid.sql=warn,stdout
35 | log4j.logger.druid.sql.DataSource=warn,stdout
36 | log4j.logger.druid.sql.Connection=warn,stdout
37 | log4j.logger.druid.sql.Statement=warn,stdout
38 | log4j.logger.druid.sql.ResultSet=warn,stdout
--------------------------------------------------------------------------------
/概要设计.doc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/radishT/Job_Analysis/24a595ac551756279487c8a6d9f5e8bd03ce4465/概要设计.doc
--------------------------------------------------------------------------------
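One note on the druid.sql.* loggers configured in log4j.properties above: they only produce output if druid's log4j filter is enabled on the data source. A minimal sketch of wiring that up, assuming com.alibaba.druid is on the classpath; the JDBC URL and credentials are placeholders:

import com.alibaba.druid.pool.DruidDataSource;

public class DruidLog4jDemo {

    public static void main(String[] args) throws Exception {
        DruidDataSource ds = new DruidDataSource();
        ds.setUrl("jdbc:mysql://localhost:3306/jobs"); // placeholder database
        ds.setUsername("root");                        // placeholder credentials
        ds.setPassword("secret");
        // Enable the log4j filter so the druid.sql.* loggers above receive events
        ds.setFilters("log4j");
        ds.getConnection().close(); // any statement run through this pool is now logged
    }

}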