├── .gitattributes ├── README.md ├── analysis&machine-learning └── jobs_ml_reg.ipynb ├── analysis&visualizations └── jobs_analysis.ipynb ├── city_data ├── city.csv ├── city.ipynb └── city.py ├── pics ├── 1.png ├── 10.png ├── 11.png ├── 12.png ├── 13.png ├── 14.png ├── 2.png ├── 3.png ├── 4.png ├── 5.png ├── 6.png ├── 7.png ├── 8.png ├── 9.png ├── ML部分 │ ├── 为标签编码.png │ ├── 决策树训练集误差.png │ ├── 划分训练集与测试集.png │ ├── 变量重要性.png │ ├── 属性合并.png │ ├── 筛选岗位数量前150.png │ ├── 编码分类结果.png │ ├── 过滤后的分布.png │ ├── 过滤薪资.png │ ├── 重要性分析.png │ └── 随机森林训练集误差.png └── url.png └── spiders ├── citylist_spider.py └── jobs_spider.py /.gitattributes: -------------------------------------------------------------------------------- 1 | *.ipynb linguist-language=python 2 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # BOSS直聘的数据分析 2 | --- 3 | ## 库依赖 4 | - 基于Python3.6 5 | - Jupyter Notebook 6 | - pyecharts 7 | - pymongo 8 | - pandas 9 | - numpy 10 | --- 11 | ## 爬虫实现过程 12 | ### 分析URL 13 | ![url](pics/url.png) 14 | - c后的编号对应不同城市 15 | - page后的数字则对应页码 16 | 17 | --- 18 | 19 | ### 爬取所有省、市对应的code 20 | city.py实现地区与对应code的爬取 21 | 22 | --- 23 | ### 根据codelist爬取所有地区的职位 24 | - 爬取内容包含:signal、省、市、职位名称、薪资、公司名称、工作经验、学历要求、公司规模。 25 | 26 | - signal字段作用在于重复爬取时跳过已爬取的页面。 27 | - 存入MongoDB中 28 | 29 | 30 | --- 31 | ## 进行数据分析 32 | ### 读取数据 33 | 利用pandas读取数据库中数据 34 | 35 | 36 | --- 37 | ### 添加新列:salary 38 | 利用正则提取出[职位薪资] 39 | 40 | 41 | --- 42 | ### 数据清洗 43 | - 移除重复数据 44 | 45 | 46 | 47 | --- 48 | 49 | - 数据筛选过滤,去除过高和过低的薪资 50 | 51 | 52 | 53 | - 去除与“数据分析”无关的岗位信息 54 | 55 | --- 56 | ## 数据可视化 57 | 58 | 59 | --- 60 | 61 | 62 | 63 | --- 64 | 65 | 66 | 67 | --- 68 | 69 | 70 | 71 | --- 72 | 73 | 74 | 75 | --- 76 | 77 | ## 机器学习部分分析 78 | 79 | 80 | --- 81 | 82 | 83 | 84 | --- 85 | 86 | 87 | 88 | --- 89 | 90 | 91 | 92 | --- 93 | 94 | 95 | 96 | --- 97 | 98 | 99 | 100 | --- 101 | ### **变量重要性** 102 | 103 | 104 | 
105 | --- 106 | 107 | 108 | -------------------------------------------------------------------------------- /analysis&machine-learning/jobs_ml_reg.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# **机器学习**" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 262, 13 | "metadata": { 14 | "collapsed": true 15 | }, 16 | "outputs": [], 17 | "source": [ 18 | "import pymongo\n", 19 | "import pandas as pd\n", 20 | "from pandas import Series\n", 21 | "client = pymongo.MongoClient('localhost',27017)\n", 22 | "db = client['Graduation_project']\n", 23 | "table = db['jobs_info']\n", 24 | "data = pd.DataFrame(list(table.find()))\n", 25 | "del data['_id']\n", 26 | "del data['signal']" 27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "execution_count": 263, 32 | "metadata": {}, 33 | "outputs": [ 34 | { 35 | "data": { 36 | "text/plain": [ 37 | "(65059, 8)" 38 | ] 39 | }, 40 | "execution_count": 263, 41 | "metadata": {}, 42 | "output_type": "execute_result" 43 | } 44 | ], 45 | "source": [ 46 | "data.shape # 65059 rows × 8 columns" 47 | ] 48 | }, 49 | { 50 | "cell_type": "code", 51 | "execution_count": 264, 52 | "metadata": { 53 | "collapsed": true 54 | }, 55 | "outputs": [], 56 | "source": [ 57 | "import re\n", 58 | "# 均值函数\n", 59 | "def average(job_salary):\n", 60 | " # 取薪资均值----------------\n", 61 | " pattern = re.compile('\\d+')\n", 62 | " salary = job_salary\n", 63 | " try:\n", 64 | " res = re.findall(pattern, salary)\n", 65 | " avg_salary = 0\n", 66 | " sum = 0\n", 67 | " for i in res:\n", 68 | " a = int(i)\n", 69 | " sum = sum + a\n", 70 | " avg_salary = sum / 2\n", 71 | " except Exception:\n", 72 | " avg_salary = 0\n", 73 | " # 函数返回值\n", 74 | " return avg_salary" 75 | ] 76 | }, 77 | { 78 | "cell_type": "code", 79 | "execution_count": 265, 80 | "metadata": { 81 | "collapsed": true 82 | }, 83 | "outputs": [], 84 | 
"source": [ 85 | "salary_list = []\n", 86 | "for i in range(0,65059):\n", 87 | " avg_sal = average(data['职位薪资'][i])\n", 88 | " salary_list.append(avg_sal)" 89 | ] 90 | }, 91 | { 92 | "cell_type": "code", 93 | "execution_count": 266, 94 | "metadata": {}, 95 | "outputs": [], 96 | "source": [ 97 | "sal = Series(salary_list)" 98 | ] 99 | }, 100 | { 101 | "cell_type": "code", 102 | "execution_count": 267, 103 | "metadata": {}, 104 | "outputs": [], 105 | "source": [ 106 | "data.insert(8,'salary',sal)" 107 | ] 108 | }, 109 | { 110 | "cell_type": "code", 111 | "execution_count": 268, 112 | "metadata": {}, 113 | "outputs": [], 114 | "source": [ 115 | "#data" 116 | ] 117 | }, 118 | { 119 | "cell_type": "code", 120 | "execution_count": 269, 121 | "metadata": { 122 | "scrolled": false 123 | }, 124 | "outputs": [ 125 | { 126 | "data": { 127 | "text/html": [ 128 | "
\n", 129 | "\n", 142 | "\n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | "
城市职位名称工作经验公司规模学历要求salary
0北京数据分析经验不限10000人以上本科8.0
1北京数据分析师1-3年100-499人本科27.5
2北京数据分析师3-5年1000-9999人本科20.0
3北京数据分析经验不限10000人以上本科9.0
4北京数据分析师3-5年10000人以上本科12.5
\n", 202 | "
" 203 | ], 204 | "text/plain": [ 205 | " 城市 职位名称 工作经验 公司规模 学历要求 salary\n", 206 | "0 北京 数据分析 经验不限 10000人以上 本科 8.0\n", 207 | "1 北京 数据分析师 1-3年 100-499人 本科 27.5\n", 208 | "2 北京 数据分析师 3-5年 1000-9999人 本科 20.0\n", 209 | "3 北京 数据分析 经验不限 10000人以上 本科 9.0\n", 210 | "4 北京 数据分析师 3-5年 10000人以上 本科 12.5" 211 | ] 212 | }, 213 | "execution_count": 269, 214 | "metadata": {}, 215 | "output_type": "execute_result" 216 | } 217 | ], 218 | "source": [ 219 | "job_data = data[['城市','职位名称','工作经验','公司规模','学历要求','salary']]\n", 220 | "job_data.head()" 221 | ] 222 | }, 223 | { 224 | "cell_type": "code", 225 | "execution_count": 270, 226 | "metadata": {}, 227 | "outputs": [ 228 | { 229 | "data": { 230 | "text/plain": [ 231 | "(65059, 6)" 232 | ] 233 | }, 234 | "execution_count": 270, 235 | "metadata": {}, 236 | "output_type": "execute_result" 237 | } 238 | ], 239 | "source": [ 240 | "job_data.shape # 65059条数据" 241 | ] 242 | }, 243 | { 244 | "cell_type": "markdown", 245 | "metadata": {}, 246 | "source": [ 247 | "### 薪资分布" 248 | ] 249 | }, 250 | { 251 | "cell_type": "code", 252 | "execution_count": 271, 253 | "metadata": { 254 | "scrolled": false 255 | }, 256 | "outputs": [ 257 | { 258 | "data": { 259 | "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAfYAAAE/CAYAAAC0DOHAAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMS4wLCBo\ndHRwOi8vbWF0cGxvdGxpYi5vcmcvpW3flQAAHWBJREFUeJzt3X+QXfV53/H3J+JHHMvmh7F3VKFW\npFZSEzPBsAU6jtMFPCBwEpE27kCJEQ4ZJQ6kces0yMkkONi00DbxlIY4lYtqkdgW1LEHDcglDGHj\n8QxgwMYIjAkyVoxAhmBhjGwHV87TP+5XmWuxv3XF3j37fs3c2Xuf8z1nv4/O3v3cc+7R3VQVkiSp\nG35ovicgSZIGx2CXJKlDDHZJkjrEYJckqUMMdkmSOsRglySpQwx2SVNKUkleP9/zkDQzBrskSR1i\nsEs6KJIcMt9zkBYjg11aRJJcnuTJJC8keTTJmUlOSXJXkm8m2ZXkj5IcNsn6b0vyhSTfSvJEkvf1\nLVvZTttfkuRrwF8muTXJr++3jQeTnHdwO5UWL4NdWiSS/DhwGfDPq+pVwNnADuD7wL8HjgH+BXAm\n8GuTbObbwEXAkcDbgHdNENL/EnhD2/4m4Bf75vCTwHJg60CakvQSBru0eHwfOBw4PsmhVbWjqr5S\nVfdX1d1VtbeqdgD/k144v0RVjVfVtqr6+6p6EPj4BGPfV1XfrqrvAjcDq5KsasveAdxYVd87GA1K\nMtilRaOqtgPvBt4HPJNkc5J/lOTHktyS5OtJvgX8J3pH7y+R5NQkdyb52yTPA786wdgn+r7ni8BN\nwC8m+SHgAuBPB96cpH9gsEuLSFV9rKp+CvgnQAHXAB8CvgysqqpXA78NZJJNfAzYAqyoqiOAP5lg\n7P5/MnITcCG9U/zfqaq7BtGLpIkZ7NIikeTHk5yR5HDg74Dv0js9/yrgW8CeJP8MeNcUm3kVsLuq\n/i7JKcC/ne77tiD/e+AP8GhdOugMdmnxOBy4GngW+DrwOnpH579JL6BfAD4M3DjFNn4NuDLJC8Dv\n0TvNPhM3ACcAfzanmUuasVTtf9ZMkgYryUXAuvY2gKSDyCN2SQdVkh+hd6S/Yb7nIi0GBrukgybJ\n2cDfAk/Tu/BO0kHmqXhJkjrEI3ZJkjrEYJckqUMW7F9fOuaYY2rlypUD2963v/1tXvnKVw5se/PN\nfoab/Qw3+xlui7Wf+++//9mqeu104xZssK9cuZL77rtvYNsbHx9nbGxsYNubb/Yz3OxnuNnPcFus\n/ST5m5lsz1PxkiR1iMEuSVKHGOySJHWIwS5JUocY7JIkdYjBLklShxjskiR1iMEuSVKHGOySJHWI\nwS5JUodMG+xJfjjJ55J8McnDSX6/1T+S5KtJHmi3E1s9Sa5Nsj3Jg0lO6tvW2iSPtdvavvrJSba1\nda5NkoPRrCRJXTeTz4p/ETijqvYkORT4bJJPt2X/sao+sd/4c4BV7XYq8CHg1CRHA1cAo0AB9yfZ\nUlXPtTHrgLuBrcBq4NMMkZXrb51y+Y6r3/YyzUSSpMlNe8RePXvaw0PbraZYZQ1wQ1vvbuDIJMuA\ns4Hbq2p3C/PbgdVt2aur6q6qKuAG4LwD6EmSpEVrRu+xJ1mS5AHgGXrhfE9bdFU73f7BJIe32nLg\nib7Vd7baVPWdE9QlSdIspXeQPMPByZHAp4BfB74BfB04DNgAfKWqrkxyK/Cfq+qzbZ07gN8CzgAO\nr6oPtPrvAt8BPtPGv7XV3wL8VlX97ATffx29U/aMjIycvHnz5jk1PZE9e/awdOnSSZdve/L5Kdc/\nYfkRA5vLIEzXz0JjP8PNfoab/Qy3mfZz+umn319Vo9ONm9XfY6+qbyYZB1ZX1X9r5ReT/G/gN9vj\nncCKvtWOBZ5q9bH96uOtfuwE4yf6/hvovYhgdHS0Bvn3eKf7e
7gXT/ce+4WDm8sgLNa/V7xQ2M9w\ns5/hZj9Tm8lV8a9tR+okeQXwVuDL7b1x2hXs5wEPtVW2ABe1q+NPA56vql3AbcBZSY5KchRwFnBb\nW/ZCktPati4Cbh5Yh5IkLSIzOWJfBmxKsoTeC4GbquqWJH+Z5LVAgAeAX23jtwLnAtvpnWp/J0BV\n7U7yfuDeNu7Kqtrd7r8L+AjwCnpXww/VFfGSJC0U0wZ7VT0IvGmC+hmTjC/g0kmWbQQ2TlC/D3jj\ndHORJElT85PnJEnqEINdkqQOMdglSeoQg12SpA4x2CVJ6hCDXZKkDjHYJUnqEINdkqQOMdglSeoQ\ng12SpA4x2CVJ6hCDXZKkDjHYJUnqEINdkqQOMdglSeoQg12SpA4x2CVJ6hCDXZKkDjHYJUnqEINd\nkqQOMdglSeoQg12SpA4x2CVJ6hCDXZKkDjHYJUnqEINdkqQOMdglSeqQaYM9yQ8n+VySLyZ5OMnv\nt/pxSe5J8liSG5Mc1uqHt8fb2/KVfdt6b6s/muTsvvrqVtueZP3g25QkaXGYyRH7i8AZVfWTwInA\n6iSnAdcAH6yqVcBzwCVt/CXAc1X1euCDbRxJjgfOB34CWA38cZIlSZYA1wHnAMcDF7SxkiRplqYN\n9urZ0x4e2m4FnAF8otU3Aee1+2vaY9ryM5Ok1TdX1YtV9VVgO3BKu22vqser6nvA5jZWkiTN0oze\nY29H1g8AzwC3A18BvllVe9uQncDydn858ARAW/488Jr++n7rTFaXJEmzdMhMBlXV94ETkxwJfAp4\nw0TD2tdMsmyy+kQvLmqCGknWAesARkZGGB8fn3ris7Bnz54pt/eeE/ZOugwY6FwGYbp+Fhr7GW72\nM9zsZ7gNup8ZBfs+VfXNJOPAacCRSQ5pR+XHAk+1YTuBFcDOJIcARwC7++r79K8zWX3/778B2AAw\nOjpaY2Njs5n+lMbHx5lqexevv3XK9XdcOLi5DMJ0/Sw09jPc7Ge42c9wG3Q/M7kq/rXtSJ0krwDe\nCjwC3An8Qhu2Fri53d/SHtOW/2VVVauf366aPw5YBXwOuBdY1a6yP4zeBXZbBtGcJEmLzUyO2JcB\nm9rV6z8E3FRVtyT5ErA5yQeALwDXt/HXA3+aZDu9I/XzAarq4SQ3AV8C9gKXtlP8JLkMuA1YAmys\nqocH1qEkSYvItMFeVQ8Cb5qg/ji9K9r3r/8d8PZJtnUVcNUE9a3A1hnMV5IkTcFPnpMkqUMMdkmS\nOsRglySpQwx2SZI6xGCXJKlDDHZJkjrEYJckqUMMdkmSOsRglySpQwx2SZI6xGCXJKlDDHZJkjrE\nYJckqUMMdkmSOsRglySpQwx2SZI6xGCXJKlDDHZJkjrEYJckqUMMdkmSOsRglySpQwx2SZI6xGCX\nJKlDDHZJkjrEYJckqUMMdkmSOsRglySpQwx2SZI6ZNpgT7IiyZ1JHknycJLfaPX3JXkyyQPtdm7f\nOu9Nsj3Jo0nO7quvbrXtSdb31Y9Lck+Sx5LcmOSwQTcqSdJiMJMj9r3Ae6rqDcBpwKVJjm/LPlhV\nJ7bbVoC27HzgJ4DVwB8nWZJkCXAdcA5wPHBB33auadtaBTwHXDKg/iRJWlSmDfaq2lVVn2/3XwAe\nAZZPscoaYHNVvVhVXwW2A6e02/aqeryqvgdsBtYkCXAG8Im2/ibgvLk2JEnSYjar99iTrATeBNzT\nSpcleTDJxiRHtdpy4Im+1Xa22mT11wDfrKq9+9UlSdIspapmNjBZCvwVcFVVfTLJCPAsUMD7gWVV\n9UtJrgPuqqo/a+tdD2yl9yLi7Kr65VZ/B72j+Cvb+Ne3+gpga1WdMMEc1gHrAEZGRk7evHnz3Dvf\nz549e1i6dOmky7c9+fyU65+w/IiBzWUQputnobGf4WY/w81+httM+zn99NPvr6rR6cYdMpNvmuRQ\n4M+Bj1bVJwGq6um+5R8Gb
mkPdwIr+lY/Fniq3Z+o/ixwZJJD2lF7//gfUFUbgA0Ao6OjNTY2NpPp\nz8j4+DhTbe/i9bdOuf6OCwc3l0GYrp+Fxn6Gm/0MN/sZboPuZyZXxQe4Hnikqv6wr76sb9jPAw+1\n+1uA85McnuQ4YBXwOeBeYFW7Av4wehfYbaneKYM7gV9o668Fbj6wtiRJWpxmcsT+ZuAdwLYkD7Ta\nb9O7qv1EeqfidwC/AlBVDye5CfgSvSvqL62q7wMkuQy4DVgCbKyqh9v2Lgc2J/kA8AV6LyQkSdIs\nTRvsVfVZIBMs2jrFOlcBV01Q3zrRelX1OL332yVJ0gHwk+ckSeoQg12SpA4x2CVJ6hCDXZKkDjHY\nJUnqEINdkqQOMdglSeoQg12SpA4x2CVJ6hCDXZKkDjHYJUnqEINdkqQOMdglSeoQg12SpA4x2CVJ\n6hCDXZKkDjHYJUnqEINdkqQOMdglSeoQg12SpA4x2CVJ6hCDXZKkDjHYJUnqEINdkqQOMdglSeoQ\ng12SpA4x2CVJ6hCDXZKkDpk22JOsSHJnkkeSPJzkN1r96CS3J3msfT2q1ZPk2iTbkzyY5KS+ba1t\n4x9LsravfnKSbW2da5PkYDQrSVLXzeSIfS/wnqp6A3AacGmS44H1wB1VtQq4oz0GOAdY1W7rgA9B\n74UAcAVwKnAKcMW+FwNtzLq+9VYfeGuSJC0+0wZ7Ve2qqs+3+y8AjwDLgTXApjZsE3Beu78GuKF6\n7gaOTLIMOBu4vap2V9VzwO3A6rbs1VV1V1UVcEPftiRJ0iykl6UzHJysBD4DvBH4WlUd2bfsuao6\nKsktwNVV9dlWvwO4HBgDfriqPtDqvwt8Fxhv49/a6m8BLq+qn5ng+6+jd2TPyMjIyZs3b55lu5Pb\ns2cPS5cunXT5tiefn3L9E5YfMbC5DMJ0/Sw09jPc7Ge42c9wm2k/p59++v1VNTrduENm+o2TLAX+\nHHh3VX1rirfBJ1pQc6i/tFi1AdgAMDo6WmNjY9PMeubGx8eZansXr791yvV3XDi4uQzCdP0sNPYz\n3OxnuNnPcBt0PzO6Kj7JofRC/aNV9clWfrqdRqd9fabVdwIr+lY/FnhqmvqxE9QlSdIszeSq+ADX\nA49U1R/2LdoC7LuyfS1wc1/9onZ1/GnA81W1C7gNOCvJUe2iubOA29qyF5Kc1r7XRX3bkiRJszCT\nU/FvBt4BbEvyQKv9NnA1cFOSS4CvAW9vy7YC5wLbge8A7wSoqt1J3g/c28ZdWVW72/13AR8BXgF8\nut0kSdIsTRvs7SK4yd5QP3OC8QVcOsm2NgIbJ6jfR++CPEmSdAD85DlJkjrEYJckqUMMdkmSOsRg\nlySpQwx2SZI6xGCXJKlDDHZJkjrEYJckqUMMdkmSOsRglySpQwx2SZI6xGCXJKlDDHZJkjrEYJck\nqUMMdkmSOsRglySpQwx2SZI6xGCXJKlDDHZJkjrEYJckqUMMdkmSOsRglySpQwx2SZI6xGCXJKlD\nDHZJkjrEYJckqUMMdkmSOmTaYE+yMckzSR7qq70vyZNJHmi3c/uWvTfJ9iSPJjm7r7661bYnWd9X\nPy7JPUkeS3JjksMG2aAkSYvJTI7YPwKsnqD+wao6sd22AiQ5Hjgf+Im2zh8nWZJkCXAdcA5wPHBB\nGwtwTdvWKuA54JIDaUiSpMVs2mCvqs8Au2e4vTXA5qp6saq+CmwHTmm37VX1eFV9D9gMrEkS4Azg\nE239TcB5s+xBkiQ1B/Ie+2VJHmyn6o9qteXAE31jdrbaZPXXAN+sqr371SVJ0hykqqYflKwEbqmq\nN7bHI8CzQAHvB5ZV1S8luQ64q6r+rI27HthK7wXE2VX1y63+DnpH8Ve28a9v9RXA1qo6YZJ5rAPW\nAYyMjJy8efPmObb9Unv27GHp0qWTLt/25PNTrn/C8iMGNpdBmK6fhcZ+hpv9DDf7GW4z7ef
000+/\nv6pGpxt3yFwmUVVP77uf5MPALe3hTmBF39Bjgafa/YnqzwJHJjmkHbX3j5/o+24ANgCMjo7W2NjY\nXKY/ofHxcaba3sXrb51y/R0XDm4ugzBdPwuN/Qw3+xlu9jPcBt3PnE7FJ1nW9/DngX1XzG8Bzk9y\neJLjgFXA54B7gVXtCvjD6F1gt6V6pwvuBH6hrb8WuHkuc5IkSTM4Yk/ycWAMOCbJTuAKYCzJifRO\nxe8AfgWgqh5OchPwJWAvcGlVfb9t5zLgNmAJsLGqHm7f4nJgc5IPAF8Arh9Yd5IkLTLTBntVXTBB\nedLwraqrgKsmqG+l9377/vXH6b3fLkmSDpCfPCdJUocY7JIkdYjBLklShxjskiR1iMEuSVKHGOyS\nJHWIwS5JUocY7JIkdYjBLklShxjskiR1iMEuSVKHGOySJHWIwS5JUocY7JIkdYjBLklShxjskiR1\niMEuSVKHGOySJHWIwS5JUocY7JIkdYjBLklShxjskiR1iMEuSVKHGOySJHWIwS5JUocY7JIkdYjB\nLklShxjskiR1yLTBnmRjkmeSPNRXOzrJ7Ukea1+PavUkuTbJ9iQPJjmpb521bfxjSdb21U9Osq2t\nc22SDLpJSZIWi0NmMOYjwB8BN/TV1gN3VNXVSda3x5cD5wCr2u1U4EPAqUmOBq4ARoEC7k+ypaqe\na2PWAXcDW4HVwKcPvLWX18r1t065fMfVb3uZZiJJWsymPWKvqs8Au/crrwE2tfubgPP66jdUz93A\nkUmWAWcDt1fV7hbmtwOr27JXV9VdVVX0XjychyRJmpO5vsc+UlW7ANrX17X6cuCJvnE7W22q+s4J\n6pIkaQ5mcip+NiZ6f7zmUJ9448k6eqftGRkZYXx8fA5TnNiePXum3N57Tth7QNsf5FxnYrp+Fhr7\nGW72M9zsZ7gNup+5BvvTSZZV1a52Ov2ZVt8JrOgbdyzwVKuP7Vcfb/VjJxg/oaraAGwAGB0drbGx\nscmGztr4+DhTbe/iad5Dn86OCyff9sEwXT8Ljf0MN/sZbvYz3Abdz1xPxW8B9l3Zvha4ua9+Ubs6\n/jTg+Xaq/jbgrCRHtSvozwJua8teSHJauxr+or5tSZKkWZr2iD3Jx+kdbR+TZCe9q9uvBm5Kcgnw\nNeDtbfhW4FxgO/Ad4J0AVbU7yfuBe9u4K6tq3wV576J35f0r6F0Nv+CuiJckaVhMG+xVdcEki86c\nYGwBl06ynY3Axgnq9wFvnG4ekiRpen7ynCRJHWKwS5LUIQa7JEkdYrBLktQhBrskSR1isEuS1CEG\nuyRJHWKwS5LUIQa7JEkdYrBLktQhBrskSR1isEuS1CEGuyRJHWKwS5LUIdP+2dbFYtuTz3Px+lvn\nexqSJB0Qj9glSeoQg12SpA4x2CVJ6hCDXZKkDjHYJUnqEINdkqQOMdglSeoQg12SpA7xA2peJiun\n+fCbHVe/7WWaiSSpyzxilySpQwx2SZI6xGCXJKlDDijYk+xIsi3JA0nua7Wjk9ye5LH29ahWT5Jr\nk2xP8mCSk/q2s7aNfyzJ2gNrSZKkxWsQR+ynV9WJVTXaHq8H7qiqVcAd7THAOcCqdlsHfAh6LwSA\nK4BTgVOAK/a9GJAkSbNzME7FrwE2tfubgPP66jdUz93AkUmWAWcDt1fV7qp6DrgdWH0Q5iVJUucd\naLAX8BdJ7k+yrtVGqmoXQPv6ulZfDjzRt+7OVpusLkmSZulA/x/7m6vqqSSvA25P8uUpxmaCWk1R\nf+kGei8e1gGMjIwwPj4+y+lObuQV8J4T9g5se7M1yF4A9uzZM/Btzif7GW72M9zsZ7gNup8DCvaq\neqp9fSbJp+i9R/50kmVVtaudan+mDd8JrOhb/VjgqVYf268+Psn32wBsABgdHa2xsbGJhs3J//jo\nzfzBtvn7vJ4dF44NdHvj4+MM8t9nvtnPcLOf4WY/w23
Q/cz5VHySVyZ51b77wFnAQ8AWYN+V7WuB\nm9v9LcBF7er404Dn26n624CzkhzVLpo7q9UkSdIsHcgh6gjwqST7tvOxqvq/Se4FbkpyCfA14O1t\n/FbgXGA78B3gnQBVtTvJ+4F727grq2r3AcxLkqRFa87BXlWPAz85Qf0bwJkT1Au4dJJtbQQ2znUu\nkiSpx0+ekySpQwx2SZI6xGCXJKlDDHZJkjrEYJckqUMMdkmSOsRglySpQwx2SZI6xGCXJKlDDHZJ\nkjrEYJckqUMMdkmSOsRglySpQwx2SZI65ED+HrteZivX3zrl8h1Xv+1lmokkaVh5xC5JUocY7JIk\ndYjBLklShxjskiR1iMEuSVKHGOySJHWIwS5JUof4/9iHxHT/R12SpJnwiF2SpA4x2CVJ6hCDXZKk\nDvE99kXEz5qXpO7ziF2SpA4ZmiP2JKuB/w4sAf5XVV09z1NacPqPyN9zwl4u9kp7SVp0huKIPckS\n4DrgHOB44IIkx8/vrCRJWniG5Yj9FGB7VT0OkGQzsAb40rzOSj/A9+glafgNS7AvB57oe7wTOHWe\n5rJoHeiH5BzMD9nZ99aCLx4kaWrDEuyZoFYvGZSsA9a1h3uSPDrAORwDPDvA7c2rf9fRfnLNfM9k\nYDq1f7CfYWc/w22m/fyTmWxsWIJ9J7Ci7/GxwFP7D6qqDcCGgzGBJPdV1ejB2PZ8sJ/hZj/DzX6G\nm/1MbSgungPuBVYlOS7JYcD5wJZ5npMkSQvOUByxV9XeJJcBt9H7724bq+rheZ6WJEkLzlAEO0BV\nbQW2zuMUDsop/nlkP8PNfoab/Qw3+5lCql5yjZokSVqghuU9dkmSNACLPtiTrE7yaJLtSdbP93xm\nK8mKJHcmeSTJw0l+o9Xfl+TJJA+027nzPdfZSLIjybY29/ta7egktyd5rH09ar7nORNJfrxvPzyQ\n5FtJ3r2Q9lGSjUmeSfJQX23C/ZGea9tz6sEkJ83fzCc2ST//NcmX25w/leTIVl+Z5Lt9++lP5m/m\nE5ukn0l/vpK8t+2fR5OcPT+zntwk/dzY18uOJA+0+kLYP5P9nj44z6GqWrQ3ehfqfQX4UeAw4IvA\n8fM9r1n2sAw4qd1/FfDX9D6W933Ab873/A6grx3AMfvV/guwvt1fD1wz3/OcQ19LgK/T+/+oC2Yf\nAT8NnAQ8NN3+AM4FPk3v8ylOA+6Z7/nPsJ+zgEPa/Wv6+lnZP24Yb5P0M+HPV/v98EXgcOC49jtw\nyXz3MF0/+y3/A+D3FtD+mez39EF5Di32I/Z/+CjbqvoesO+jbBeMqtpVVZ9v918AHqH3SX5dtAbY\n1O5vAs6bx7nM1ZnAV6rqb+Z7IrNRVZ8Bdu9Xnmx/rAFuqJ67gSOTLHt5ZjozE/VTVX9RVXvbw7vp\nfZ7GgjDJ/pnMGmBzVb1YVV8FttP7XTg0puonSYB/A3z8ZZ3UAZji9/RBeQ4t9mCf6KNsF2woJlkJ\nvAm4p5Uua6dxNi6U09Z9CviLJPen94mDACNVtQt6TxTgdfM2u7k7nx/8hbSQ99Fk+6MLz6tfonfE\ntM9xSb6Q5K+SvGW+JjUHE/18LfT98xbg6ap6rK+2YPbPfr+nD8pzaLEH+4w+ynYhSLIU+HPg3VX1\nLeBDwD8FTgR20Tt1tZC8uapOovcX/y5N8tPzPaEDld6HL/0c8H9aaaHvo8ks6OdVkt8B9gIfbaVd\nwD+uqjcB/wH4WJJXz9f8ZmGyn68FvX+AC/jBF8cLZv9M8Ht60qET1Ga8jxZ7sM/oo2yHXZJD6f2w\nfLSqPglQVU9X1fer6u+BDzNkp9qmU1VPta/PAJ+iN/+n952Oal+fmb8Zzsk5wOer6mlY+PuIyffH\ngn1eJVkL/AxwYbU3O9sp62+0+/fTe0/6x+ZvljMzxc/XQt4/hwD/CrhxX22h7J+Jfk9zkJ5Diz3Y\nF/xH2bb3m64HHqm
qP+yr978f8/PAQ/uvO6ySvDLJq/bdp3dR00P09s3aNmwtcPP8zHDOfuBIYyHv\no2ay/bEFuKhd2Xsa8Py+043DLMlq4HLg56rqO3311yZZ0u7/KLAKeHx+ZjlzU/x8bQHOT3J4kuPo\n9fO5l3t+c/RW4MtVtXNfYSHsn8l+T3OwnkPzfbXgfN/oXX341/Re5f3OfM9nDvP/KXqnaB4EHmi3\nc4E/Bba1+hZg2XzPdRY9/Si9q3a/CDy8b78ArwHuAB5rX4+e77nOoqcfAb4BHNFXWzD7iN4Lkl3A\n/6N3NHHJZPuD3mnE69pzahswOt/zn2E/2+m9r7nvefQnbey/bj+HXwQ+D/zsfM9/hv1M+vMF/E7b\nP48C58z3/GfST6t/BPjV/cYuhP0z2e/pg/Ic8pPnJEnqkMV+Kl6SpE4x2CVJ6hCDXZKkDjHYJUnq\nEINdkqQOMdglSeoQg12SpA4x2CVJ6pD/D5Xm/PvE0XwEAAAAAElFTkSuQmCC\n", 260 | "text/plain": [ 261 | "" 262 | ] 263 | }, 264 | "metadata": {}, 265 | "output_type": "display_data" 266 | } 267 | ], 268 | "source": [ 269 | "%matplotlib inline\n", 270 | "import matplotlib.pyplot as plt\n", 271 | "job_data.hist(bins=50, figsize=(8,5))\n", 272 | "# save_fig(\"attribute_histogram_plots\")\n", 273 | "plt.show()" 274 | ] 275 | }, 276 | { 277 | "cell_type": "markdown", 278 | "metadata": {}, 279 | "source": [ 280 | "# 数据清洗" 281 | ] 282 | }, 283 | { 284 | "cell_type": "markdown", 285 | "metadata": {}, 286 | "source": [ 287 | "## 去重" 288 | ] 289 | }, 290 | { 291 | "cell_type": "code", 292 | "execution_count": 272, 293 | "metadata": {}, 294 | "outputs": [ 295 | { 296 | "data": { 297 | "text/html": [ 298 | "
\n", 299 | "\n", 312 | "\n", 313 | " \n", 314 | " \n", 315 | " \n", 316 | " \n", 317 | " \n", 318 | " \n", 319 | " \n", 320 | " \n", 321 | " \n", 322 | " \n", 323 | " \n", 324 | " \n", 325 | " \n", 326 | " \n", 327 | " \n", 328 | " \n", 329 | " \n", 330 | " \n", 331 | " \n", 332 | " \n", 333 | " \n", 334 | " \n", 335 | " \n", 336 | " \n", 337 | " \n", 338 | " \n", 339 | " \n", 340 | " \n", 341 | " \n", 342 | " \n", 343 | " \n", 344 | " \n", 345 | " \n", 346 | " \n", 347 | " \n", 348 | " \n", 349 | " \n", 350 | " \n", 351 | " \n", 352 | " \n", 353 | " \n", 354 | " \n", 355 | " \n", 356 | " \n", 357 | " \n", 358 | " \n", 359 | " \n", 360 | " \n", 361 | " \n", 362 | " \n", 363 | " \n", 364 | " \n", 365 | " \n", 366 | " \n", 367 | " \n", 368 | " \n", 369 | " \n", 370 | " \n", 371 | "
城市职位名称工作经验公司规模学历要求salary
0北京数据分析经验不限10000人以上本科8.0
1北京数据分析师1-3年100-499人本科27.5
2北京数据分析师3-5年1000-9999人本科20.0
3北京数据分析经验不限10000人以上本科9.0
4北京数据分析师3-5年10000人以上本科12.5
\n", 372 | "
" 373 | ], 374 | "text/plain": [ 375 | " 城市 职位名称 工作经验 公司规模 学历要求 salary\n", 376 | "0 北京 数据分析 经验不限 10000人以上 本科 8.0\n", 377 | "1 北京 数据分析师 1-3年 100-499人 本科 27.5\n", 378 | "2 北京 数据分析师 3-5年 1000-9999人 本科 20.0\n", 379 | "3 北京 数据分析 经验不限 10000人以上 本科 9.0\n", 380 | "4 北京 数据分析师 3-5年 10000人以上 本科 12.5" 381 | ] 382 | }, 383 | "execution_count": 272, 384 | "metadata": {}, 385 | "output_type": "execute_result" 386 | } 387 | ], 388 | "source": [ 389 | "jobs = job_data.drop_duplicates()\n", 390 | "jobs.head()" 391 | ] 392 | }, 393 | { 394 | "cell_type": "code", 395 | "execution_count": 273, 396 | "metadata": {}, 397 | "outputs": [ 398 | { 399 | "data": { 400 | "text/plain": [ 401 | "(59142, 6)" 402 | ] 403 | }, 404 | "execution_count": 273, 405 | "metadata": {}, 406 | "output_type": "execute_result" 407 | } 408 | ], 409 | "source": [ 410 | "jobs.shape # 61759条数据" 411 | ] 412 | }, 413 | { 414 | "cell_type": "markdown", 415 | "metadata": {}, 416 | "source": [ 417 | "## 过滤" 418 | ] 419 | }, 420 | { 421 | "cell_type": "code", 422 | "execution_count": 274, 423 | "metadata": {}, 424 | "outputs": [], 425 | "source": [ 426 | "jobs_copy = jobs.copy()" 427 | ] 428 | }, 429 | { 430 | "cell_type": "markdown", 431 | "metadata": {}, 432 | "source": [ 433 | "### 过滤出薪资在6-27k之间" 434 | ] 435 | }, 436 | { 437 | "cell_type": "code", 438 | "execution_count": 275, 439 | "metadata": {}, 440 | "outputs": [], 441 | "source": [ 442 | "jobs_copy = jobs_copy[jobs_copy['salary']<27]" 443 | ] 444 | }, 445 | { 446 | "cell_type": "code", 447 | "execution_count": 276, 448 | "metadata": {}, 449 | "outputs": [ 450 | { 451 | "data": { 452 | "text/html": [ 453 | "
\n", 454 | "\n", 467 | "\n", 468 | " \n", 469 | " \n", 470 | " \n", 471 | " \n", 472 | " \n", 473 | " \n", 474 | " \n", 475 | " \n", 476 | " \n", 477 | " \n", 478 | " \n", 479 | " \n", 480 | " \n", 481 | " \n", 482 | " \n", 483 | " \n", 484 | " \n", 485 | " \n", 486 | " \n", 487 | " \n", 488 | " \n", 489 | " \n", 490 | " \n", 491 | " \n", 492 | " \n", 493 | " \n", 494 | " \n", 495 | " \n", 496 | " \n", 497 | " \n", 498 | " \n", 499 | " \n", 500 | " \n", 501 | " \n", 502 | " \n", 503 | " \n", 504 | " \n", 505 | " \n", 506 | " \n", 507 | " \n", 508 | " \n", 509 | " \n", 510 | " \n", 511 | " \n", 512 | " \n", 513 | " \n", 514 | " \n", 515 | " \n", 516 | " \n", 517 | " \n", 518 | " \n", 519 | " \n", 520 | " \n", 521 | " \n", 522 | " \n", 523 | " \n", 524 | " \n", 525 | " \n", 526 | "
城市职位名称工作经验公司规模学历要求salary
0北京数据分析经验不限10000人以上本科8.0
2北京数据分析师3-5年1000-9999人本科20.0
3北京数据分析经验不限10000人以上本科9.0
4北京数据分析师3-5年10000人以上本科12.5
8北京数据分析师1-3年10000人以上本科25.5
\n", 527 | "
" 528 | ], 529 | "text/plain": [ 530 | " 城市 职位名称 工作经验 公司规模 学历要求 salary\n", 531 | "0 北京 数据分析 经验不限 10000人以上 本科 8.0\n", 532 | "2 北京 数据分析师 3-5年 1000-9999人 本科 20.0\n", 533 | "3 北京 数据分析 经验不限 10000人以上 本科 9.0\n", 534 | "4 北京 数据分析师 3-5年 10000人以上 本科 12.5\n", 535 | "8 北京 数据分析师 1-3年 10000人以上 本科 25.5" 536 | ] 537 | }, 538 | "execution_count": 276, 539 | "metadata": {}, 540 | "output_type": "execute_result" 541 | } 542 | ], 543 | "source": [ 544 | "jobs_copy = jobs_copy[jobs_copy['salary']>6]\n", 545 | "jobs_copy.head()" 546 | ] 547 | }, 548 | { 549 | "cell_type": "code", 550 | "execution_count": 277, 551 | "metadata": {}, 552 | "outputs": [ 553 | { 554 | "data": { 555 | "text/plain": [ 556 | "(27293, 6)" 557 | ] 558 | }, 559 | "execution_count": 277, 560 | "metadata": {}, 561 | "output_type": "execute_result" 562 | } 563 | ], 564 | "source": [ 565 | "jobs_copy.shape" 566 | ] 567 | }, 568 | { 569 | "cell_type": "code", 570 | "execution_count": 278, 571 | "metadata": { 572 | "scrolled": true 573 | }, 574 | "outputs": [ 575 | { 576 | "data": { 577 | "text/html": [ 578 | "
\n", 579 | "\n", 592 | "\n", 593 | " \n", 594 | " \n", 595 | " \n", 596 | " \n", 597 | " \n", 598 | " \n", 599 | " \n", 600 | " \n", 601 | " \n", 602 | " \n", 603 | " \n", 604 | " \n", 605 | " \n", 606 | " \n", 607 | " \n", 608 | " \n", 609 | " \n", 610 | " \n", 611 | " \n", 612 | " \n", 613 | " \n", 614 | " \n", 615 | " \n", 616 | " \n", 617 | " \n", 618 | " \n", 619 | " \n", 620 | " \n", 621 | " \n", 622 | " \n", 623 | " \n", 624 | " \n", 625 | " \n", 626 | " \n", 627 | " \n", 628 | " \n", 629 | " \n", 630 | " \n", 631 | " \n", 632 | " \n", 633 | "
salary
count27293.000000
mean10.915528
std4.546630
min6.500000
25%7.500000
50%9.000000
75%12.500000
max26.500000
\n", 634 | "
" 635 | ], 636 | "text/plain": [ 637 | " salary\n", 638 | "count 27293.000000\n", 639 | "mean 10.915528\n", 640 | "std 4.546630\n", 641 | "min 6.500000\n", 642 | "25% 7.500000\n", 643 | "50% 9.000000\n", 644 | "75% 12.500000\n", 645 | "max 26.500000" 646 | ] 647 | }, 648 | "execution_count": 278, 649 | "metadata": {}, 650 | "output_type": "execute_result" 651 | } 652 | ], 653 | "source": [ 654 | "jobs_copy.describe()" 655 | ] 656 | }, 657 | { 658 | "cell_type": "markdown", 659 | "metadata": {}, 660 | "source": [ 661 | "### 过滤后的薪资分布图" 662 | ] 663 | }, 664 | { 665 | "cell_type": "code", 666 | "execution_count": 279, 667 | "metadata": {}, 668 | "outputs": [ 669 | { 670 | "data": { 671 | "image/png": "iVBORw0KGgoAAAANSUhEUgAAAfgAAAE/CAYAAACqxdFzAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMS4wLCBo\ndHRwOi8vbWF0cGxvdGxpYi5vcmcvpW3flQAAH6JJREFUeJzt3X+U3XV95/HnW35JDSVBdJqGnIau\nqVs0R9TZQNdudwIWAvQU3FN7cKkmiie1hZ66m26J7bZSlTbuNnX7w9LGkjX+qAPb6pIDcWkWnfV4\nTlGIIgGRZcRUAjGpTQiOIN3ge/+4n5Gb4d65d2buzNz55Pk4556538/38/3ez/t+73de8/3Od74T\nmYkkSarLC+Z7AJIkqfcMeEmSKmTAS5JUIQNekqQKGfCSJFXIgJckqUIGvCQAIiIj4mXzPQ5JvWHA\nS5JUIQNe0oxExInzPQZJz2fASxWKiOsi4rGI+E5EPBQRF0bE6oj4+4h4IiL2R8SfRcTJbZa/LCK+\nHBFPRsSjEXF907wV5XT+1RHxTeAzEXF7RPzahHXcFxFXzG6lktox4KXKRMTLgWuBf5WZpwEXA3uB\nZ4H/AJwJ/BRwIfCrbVbzXeAtwGLgMuBXWoT1vwV+sqx/O/BLTWN4FbAM2NmToiRNmQEv1edZ4BTg\nnIg4KTP3ZubXM3N3Zt6VmUczcy/wlzRC+nkycyQz92Tm9zPzPuATLfpen5nfzcyngVuBlRGxssx7\nM3BzZv7zbBQoqTMDXqpMZo4C7wSuBw5GxHBE/GhE/ERE3BYR34qIJ4Hfp3E0/zwRcV5EfDYi/jEi\njgDvaNH30abXfAa4BfiliHgB8Cbgoz0vTlLXDHipQpn515n508CPAQm8H7gR+BqwMjN/GPgtINqs\n4q+BHcDyzDwd+IsWfSf+K8rtwFU0Tv0/lZl/34taJE2PAS9VJiJeHhEXRMQpwPeAp2mctj8NeBIY\ni4h/CfzKJKs5DTiUmd+LiNXAv+/0uiXQvw9swaN3ad4Z8FJ9TgE2A98GvgW8lMbR+m/QCOrvAB8C\nbp5kHb8KvCcivgP8Lo3T7934CLAK+Ni0Ri6pZyJz4lk2SZqeiHgLsKH8ekDSPPIIXlJPRMQP0Tjy\n3zrfY5FkwEvqgYi4GPhH4ACNC/QkzTNP0UuSVCGP4CVJqpABL0lShfr6v0CdeeaZuWLFivkexqS+\n+93v8qIXvWi+h9ETtdRSSx1gLf2qllpqqQPqqmX37t3fz
syXzHQ9fR3wK1as4J577pnvYUxqZGSE\noaGh+R5GT9RSSy11gLX0q1pqqaUOqKuWiPiHXqzHU/SSJFXIgJckqUIGvCRJFTLgJUmqkAEvSVKF\nDHhJkipkwEuSVCEDXpKkChnwkiRVyICXJKlCBrwkSRXq63vR96MVm24/ZnrjqqOsb2rbu/myuR6S\nJEnP4xG8JEkVMuAlSapQ1wEfESdExJcj4rYyfXZEfCEiHo6ImyPi5NJ+SpkeLfNXNK3jXaX9oYi4\nuNfFSJKkhqkcwf868GDT9PuBD2TmSuAwcHVpvxo4nJkvAz5Q+hER5wBXAq8A1gJ/HhEnzGz4kiSp\nla4CPiLOAi4D/qpMB3AB8Dely3bgivL88jJNmX9h6X85MJyZz2TmN4BRYHUvipAkScfq9gj+vwG/\nCXy/TL8YeCIzj5bpfcCy8nwZ8ChAmX+k9P9Be4tlJElSD3X8M7mI+DngYGbujoih8eYWXbPDvMmW\naX69DcAGgIGBAUZGRjoNcU5tXHX0mOmBU49t67fxTsXY2NiCHv+4WuoAa+lXtdRSSx1QVy290s3f\nwb8O+PmIuBR4IfDDNI7oF0fEieUo/Szg8dJ/H7Ac2BcRJwKnA4ea2sc1L/MDmbkV2AowODiYQ0ND\n0yhr9qxv8XfwW/Y89zbuvWpojkfUOyMjI/Tb+z0dtdQB1tKvaqmlljqgrlp6peMp+sx8V2aelZkr\naFwk95nMvAr4LPALpds64NbyfEeZpsz/TGZmab+yXGV/NrAS+GLPKpEkST8wkzvZXQcMR8T7gC8D\nN5X2m4CPRsQojSP3KwEy84GIuAX4KnAUuCYzn53B60uSpDamFPCZOQKMlOeP0OIq+Mz8HvDGNsvf\nANww1UFKkqSp8U52kiRVyICXJKlCBrwkSRUy4CVJqpABL0lShQx4SZIqZMBLklQhA16SpAoZ8JIk\nVciAlySpQga8JEkVMuAlSaqQAS9JUoUMeEmSKmTAS5JUIQNekqQKGfCSJFXIgJckqUIGvCRJFTLg\nJUmqkAEvSVKFDHhJkipkwEuSVKGOAR8RL4yIL0bEVyLigYj4vdL+4Yj4RkTcWx7nlvaIiD+JiNGI\nuC8iXtO0rnUR8XB5rJu9siRJOr6d2EWfZ4ALMnMsIk4CPh8Rny7z/lNm/s2E/pcAK8vjPOBG4LyI\nOAN4NzAIJLA7InZk5uFeFCJJkp7T8Qg+G8bK5EnlkZMscjnwkbLcXcDiiFgKXAzsysxDJdR3AWtn\nNnxJktRKZE6W1aVTxAnAbuBlwAcz87qI+DDwUzSO8O8ENmXmMxFxG7A5Mz9flr0TuA4YAl6Yme8r\n7b8DPJ2ZfzjhtTYAGwAGBgZeOzw83Is6e2bPY0eOmR44FQ48/dz0qmWnz/GIemdsbIxFixbN9zBm\nrJY6wFr6VS211FIH1FXLmjVrdmfm4EzX080pejLzWeDciFgMfCoiXgm8C/gWcDKwlUaIvweIVquY\npH3ia20t62NwcDCHhoa6GeKcWb/p9mOmN646ypY9z72Ne68amuMR9c7IyAj99n5PRy11gLX0q1pq\nqaUOqKuWXpnSVfSZ+QQwAqzNzP3lNPwzwH8HVpdu+4DlTYudBTw+SbskSeqxbq6if0k5ciciTgVe\nD3yt/F6diAjgCuD+ssgO4C3lavrzgSOZuR+4A7goIpZExBLgotImSZJ6rJtT9EuB7eX38C8AbsnM\n2yLiMxHxEhqn3u8F3lH67wQuBUaBp4C3AmTmoYh4L3B36feezDzUu1IkSdK4jgGfmfcBr27RfkGb\n/glc02beNmDbFMcoSZKmyDvZSZJUIQNekqQKGfCSJFXIgJckqUIGvCRJFTLgJUmqkAEvSVKFDHhJ\nkipkwEuSVCEDXpKkChnwkiRVyICXJKlCBrwkSRUy4CVJqpABL0lShQx4SZIqZMBLklQhA16SpAoZ\n8JIkVciAlySpQga8J
EkVMuAlSapQx4CPiBdGxBcj4isR8UBE/F5pPzsivhARD0fEzRFxcmk/pUyP\nlvkrmtb1rtL+UERcPFtFSZJ0vOvmCP4Z4ILMfBVwLrA2Is4H3g98IDNXAoeBq0v/q4HDmfky4AOl\nHxFxDnAl8ApgLfDnEXFCL4uRJEkNHQM+G8bK5EnlkcAFwN+U9u3AFeX55WWaMv/CiIjSPpyZz2Tm\nN4BRYHVPqpAkScfo6nfwEXFCRNwLHAR2AV8HnsjMo6XLPmBZeb4MeBSgzD8CvLi5vcUykiSph07s\nplNmPgucGxGLgU8BP9mqW/kabea1az9GRGwANgAMDAwwMjLSzRDnzMZVR4+ZHjj12LZ+G+9UjI2N\nLejxj6ulDrCWflVLLbXUAXXV0itdBfy4zHwiIkaA84HFEXFiOUo/C3i8dNsHLAf2RcSJwOnAoab2\ncc3LNL/GVmArwODgYA4NDU1liLNu/abbj5neuOooW/Y89zbuvWpojkfUOyMjI/Tb+z0dtdQB1tKv\naqmlljqgrlp6pZur6F9SjtyJiFOB1wMPAp8FfqF0WwfcWp7vKNOU+Z/JzCztV5ar7M8GVgJf7FUh\nkiTpOd0cwS8Ftpcr3l8A3JKZt0XEV4HhiHgf8GXgptL/JuCjETFK48j9SoDMfCAibgG+ChwFrimn\n/iVJUo91DPjMvA94dYv2R2hxFXxmfg94Y5t13QDcMPVhSpKkqfBOdpIkVciAlySpQga8JEkVMuAl\nSarQlP4OvgYrJvwd+0R7N182RyORJGn2eAQvSVKFDHhJkipkwEuSVCEDXpKkChnwkiRVyICXJKlC\nBrwkSRUy4CVJqpABL0lShQx4SZIqZMBLklQhA16SpAoZ8JIkVciAlySpQga8JEkVMuAlSaqQAS9J\nUoUMeEmSKtQx4CNieUR8NiIejIgHIuLXS/v1EfFYRNxbHpc2LfOuiBiNiIci4uKm9rWlbTQiNs1O\nSZIk6cQu+hwFNmbmlyLiNGB3ROwq8z6QmX/Y3DkizgGuBF4B/CjwvyPiJ8rsDwI/C+wD7o6IHZn5\n1V4UcrxYsen2jn32br5sDkYiSepnHQM+M/cD+8vz70TEg8CySRa5HBjOzGeAb0TEKLC6zBvNzEcA\nImK49DXgJUnqscjM7jtHrAA+B7wS+I/AeuBJ4B4aR/mHI+LPgLsy82NlmZuAT5dVrM3Mt5f2NwPn\nZea1E15jA7ABYGBg4LXDw8PTra2lPY8dmXT+qmWnT2n5gVPhwNPdLz9TncY/kzGMjY2xaNGiaS3b\nT2qpA6ylX9VSSy11QF21rFmzZndmDs50Pd2cogcgIhYBfwu8MzOfjIgbgfcCWb5uAd4GRIvFk9a/\n73/eTxeZuRXYCjA4OJhDQ0PdDrEr6zuc4t571eSvN3H5jauOsmXPc29jp+VnqtP4ZzKGkZERev1+\nz4da6gBr6Ve11FJLHVBXLb3SVcBHxEk0wv3jmflJgMw80DT/Q8BtZXIfsLxp8bOAx8vzdu2SJKmH\nurmKPoCbgAcz84+a2pc2dXsDcH95vgO4MiJOiYizgZXAF4G7gZURcXZEnEzjQrwdvSlDkiQ16+YI\n/nXAm4E9EXFvafst4E0RcS6N0+x7gV8GyMwHIuIWGhfPHQWuycxnASLiWuAO4ARgW2Y+0MNaJElS\n0c1V9J+n9e/Vd06yzA3ADS3ad062nCRJ6g3vZCdJUoUMeEmSKmTAS5JUIQNekqQKGfCSJFXIgJck\nqUIGvCRJFTLgJUmqkAEvSVKFDHhJkipkwEuSVCEDXpKkChnwkiRVyICXJKlCBrwkSRXq+P/gJc29\nFZtuB2DjqqOsL8+b7d182VwPSdIC4xG8JEkVMuAlSaqQAS9JUoUMeEmSKmTAS5JUIQNekqQKdfwz\nuYhYDnwE+BHg+8DWzPzjiDgDuBlYAewFfjEzD0dEAH8MXAo8BazPzC+Vda0D/nNZ9fs
yc3tvy5m5\nFS3+JEmSpIWmmyP4o8DGzPxJ4Hzgmog4B9gE3JmZK4E7yzTAJcDK8tgA3AhQfiB4N3AesBp4d0Qs\n6WEtkiSp6Bjwmbl//Ag8M78DPAgsAy4Hxo/AtwNXlOeXAx/JhruAxRGxFLgY2JWZhzLzMLALWNvT\naiRJEjDF38FHxArg1cAXgIHM3A+NHwKAl5Zuy4BHmxbbV9ratUuSpB6LzOyuY8Qi4P8AN2TmJyPi\nicxc3DT/cGYuiYjbgT/IzM+X9juB3wQuAE7JzPeV9t8BnsrMLRNeZwONU/sMDAy8dnh4eMZFNtvz\n2JGerm/gVDjw9HPTq5ad3tP1T9TN+Kc7hrGxMRYtWjStZftJDXWMb+eJn69xs/05mw01bJdxtdRS\nSx1QVy1r1qzZnZmDM11PV/eij4iTgL8FPp6ZnyzNByJiaWbuL6fgD5b2fcDypsXPAh4v7UMT2kcm\nvlZmbgW2AgwODubQ0NDELjPS6r7eM7Fx1VG27Hnubdx71VBP1z9RN+Of7hhGRkbo9fs9H2qoY33T\nveibP1/jZvtzNhtq2C7jaqmlljqgrlp6peMp+nJV/E3Ag5n5R02zdgDryvN1wK1N7W+JhvOBI+UU\n/h3ARRGxpFxcd1FpkyRJPdbNEfzrgDcDeyLi3tL2W8Bm4JaIuBr4JvDGMm8njT+RG6XxZ3JvBcjM\nQxHxXuDu0u89mXmoJ1VIkqRjdAz48rv0aDP7whb9E7imzbq2AdumMkBJkjR13slOkqQKGfCSJFXI\ngJckqUIGvCRJFTLgJUmqkAEvSVKFDHhJkipkwEuSVCEDXpKkChnwkiRVqKv/Jqfurejw3972br5s\njkYiSTqeeQQvSVKFDHhJkipkwEuSVCEDXpKkChnwkiRVyICXJKlCBrwkSRUy4CVJqpABL0lShQx4\nSZIqZMBLklQhA16SpAp1DPiI2BYRByPi/qa26yPisYi4tzwubZr3rogYjYiHIuLipva1pW00Ijb1\nvhRJkjSumyP4DwNrW7R/IDPPLY+dABFxDnAl8IqyzJ9HxAkRcQLwQeAS4BzgTaWvJEmaBR3/XWxm\nfi4iVnS5vsuB4cx8BvhGRIwCq8u80cx8BCAihkvfr055xJIkqaPIzM6dGgF/W2a+skxfD6wHngTu\nATZm5uGI+DPgrsz8WOl3E/Dpspq1mfn20v5m4LzMvLbFa20ANgAMDAy8dnh4eAblPd+ex470dH0D\np8KBp7vvv2rZ6TN6vW7GP93XGBsbY9GiRdNatp/UUMf4dm73+Zrp52g+1LBdxtVSSy11QF21rFmz\nZndmDs50PR2P4Nu4EXgvkOXrFuBtQLTom7T+VUDLnywycyuwFWBwcDCHhoamOcTW1m+6vafr27jq\nKFv2dP827r1qaEav1834p/saIyMj9Pr9ng811DG+ndt9vmb6OZoPNWyXcbXUUksdUFctvTKtgM/M\nA+PPI+JDwG1lch+wvKnrWcDj5Xm7dkmS1GPT+jO5iFjaNPkGYPwK+x3AlRFxSkScDawEvgjcDayM\niLMj4mQaF+LtmP6wJUnSZDoewUfEJ4Ah4MyI2Ae8GxiKiHNpnGbfC/wyQGY+EBG30Lh47ihwTWY+\nW9ZzLXAHcAKwLTMf6Hk1kiQJ6O4q+je1aL5pkv43ADe0aN8J7JzS6CRJ0rR4JztJkipkwEuSVCED\nXpKkChnwkiRVyICXJKlCBrwkSRUy4CVJqtB070Wv49iKDvfD37v5sjkaiSSpHY/gJUmqkAEvSVKF\nDHhJkirk7+A15/wdviTNPgNe1dnz2BHW+0OEpOOcp+glSaqQAS9JUoUMeEmSKmTAS5JUIQNekqQK\nGfCSJFXIgJckqUIGvCRJFfJGN8ehdneS27jqKOs33e5NYCSpAh2P4CNiW0QcjIj7m9rOiIhdEfFw\n+bqktEdE/ElEjEbEfRHxmqZl1pX+D0fEutkpR5I
kQXen6D8MrJ3Qtgm4MzNXAneWaYBLgJXlsQG4\nERo/EADvBs4DVgPvHv+hQJIk9V7HU/SZ+bmIWDGh+XJgqDzfDowA15X2j2RmAndFxOKIWFr67srM\nQwARsYvGDw2fmHEFkqRp8f821G26F9kNZOZ+gPL1paV9GfBoU799pa1duyRJmgXRONju0KlxBH9b\nZr6yTD+RmYub5h/OzCURcTvwB5n5+dJ+J/CbwAXAKZn5vtL+O8BTmbmlxWttoHF6n4GBgdcODw/P\nrMIJ9jx2pKfrGzgVDjzdff9Vy06f0et1M/5Or9FuHeO1THf5mb5+t8t3cvDQkY7bZKavMdvG36N2\nn69+H38rY2NjLFq0aL6H0RO11FLDvjKulm0CsGbNmt2ZOTjT9Uz3KvoDEbE0M/eXU/AHS/s+YHlT\nv7OAx0v70IT2kVYrzsytwFaAwcHBHBoaatVt2jqdjpqqjauOsmVP92/j3quGZvR63Yy/02u0W8d4\nLdNdfqav/wN7vjv5+jucMvzTj9/acZvMdDvMtvH3qN3nq9/H38rIyAi93p/nSy211LCvjKtlm/TS\ndE/R7wDGr4RfB9za1P6WcjX9+cCRcgr/DuCiiFhSLq67qLRJkqRZ0PHQMyI+QePo+8yI2EfjavjN\nwC0RcTXwTeCNpftO4FJgFHgKeCtAZh6KiPcCd5d+7xm/4E6SJPVeN1fRv6nNrAtb9E3gmjbr2QZs\nm9LoJEnStHirWkmSKmTAS5JUIQNekqQKGfCSJFXIgJckqUIGvCRJFTLgJUmqkAEvSVKFDHhJkipk\nwEuSVKHp/jc5zZIVPf5vd5Kk45NH8JIkVciAlySpQga8JEkVMuAlSaqQAS9JUoUMeEmSKmTAS5JU\nIQNekqQKeaMbSWqj3Y2nNq46yvpNt7N382VzPCKpex7BS5JUIQNekqQKGfCSJFVoRgEfEXsjYk9E\n3BsR95S2MyJiV0Q8XL4uKe0REX8SEaMRcV9EvKYXBUiSpOfrxUV2azLz203Tm4A7M3NzRGwq09cB\nlwAry+M84MbyVT3mf6STJM3GKfrLge3l+Xbgiqb2j2TDXcDiiFg6C68vSdJxLzJz+gtHfAM4DCTw\nl5m5NSKeyMzFTX0OZ+aSiLgN2JyZny/tdwLXZeY9E9a5AdgAMDAw8Nrh4eFpj6+VPY8d6en6Bk6F\nA09333/VstMnnd/r8U3FeC0zHeNs19hp/QcPHem4TTqtY76Nv0ftPl/9Pv5WxsbGWLRo0XwPY0ra\nfVa73Vf6XQ37yriF+PlqZ82aNbszc3Cm65npKfrXZebjEfFSYFdEfG2SvtGi7Xk/XWTmVmArwODg\nYA4NDc1wiMda3+PT1xtXHWXLnu7fxr1XDU06v9fjm4rxWmY6xtmusdP6//Tjt3bcJp3WMd/G36N2\nn69+H38rIyMj9Hp/nm3tPqvd7iv9roZ9ZdxC/HzNthmdos/Mx8vXg8CngNXAgfFT7+XrwdJ9H7C8\nafGzgMdn8vqSJKm1aQd8RLwoIk4bfw5cBNwP7ADWlW7rgFvL8x3AW8rV9OcDRzJz/7RHLkmS2prJ\nKfoB4FMRMb6ev87M/xURdwO3RMTVwDeBN5b+O4FLgVHgKeCtM3htSZI0iWkHfGY+AryqRfs/ARe2\naE/gmum+niRJ6p53spMkqUIGvCRJFTLgJUmqkP8PXtKsmHjL5PH/od4t/9e6NDMewUuSVCEDXpKk\nCnmKXj3nf7OTpPnnEbwkSRXyCF6ahk5nKeb7ArFuzqLM9xglzS4DXpK0YK1o+s+Lrf5K43j+QdZT\n9JIkVcgjeKkFLxSUtNB5BC9JUoUMeEmSKuQpeh2XPAUvqXYewUuSVCEDXpKkChnwkiRVyICXJKlC\nBrwkSRXyKnppHiyEq/j7/X77kiZnwEvSPPGHKM2mOQ/4iFgL/DFwAvBXmbl5rscgqX4L4SzJ8cAf\nYubPnAZ8RJw
AfBD4WWAfcHdE7MjMr87lOKSFzvDyPZgrCz2gF/r4Z2Kuj+BXA6OZ+QhARAwDlwMG\nvLTAGLCz73gOp7lS83s81wG/DHi0aXofcN4cj0GS5oQ/BM3cfL+H3bx+v/4QEJk5dy8W8Ubg4sx8\ne5l+M7A6M3+tqc8GYEOZfDnw0JwNcHrOBL4934PokVpqqaUOsJZ+VUsttdQBddXy8sw8baYrmesj\n+H3A8qbps4DHmztk5lZg61wOaiYi4p7MHJzvcfRCLbXUUgdYS7+qpZZa6oD6aunFeub6Rjd3Aysj\n4uyIOBm4Etgxx2OQJKl6c3oEn5lHI+Ja4A4afya3LTMfmMsxSJJ0PJjzv4PPzJ3Azrl+3Vm0YH6d\n0IVaaqmlDrCWflVLLbXUAdbyPHN6kZ0kSZob/rMZSZIqZMB3ISJeHhH3Nj2ejIh3TugzFBFHmvr8\n7nyNd6KI2BYRByPi/qa2MyJiV0Q8XL4uabPsutLn4YhYN3ejbjmWVnX814j4WkTcFxGfiojFbZbd\nGxF7yrbpyRWqM9Gmlusj4rGmz9ClbZZdGxEPRcRoRGyau1G31qaWm5vq2BsR97ZZtm+2S0Qsj4jP\nRsSDEfFARPx6aV+I+0q7Whbc/jJJLQtqf5mkjtnbVzLTxxQeNC4O/BbwYxPah4Db5nt8bcb8M8Br\ngPub2v4LsKk83wS8v8VyZwCPlK9LyvMlfVbHRcCJ5fn7W9VR5u0FzpzvbdGhluuB3+iw3AnA14Ef\nB04GvgKc02+1TJi/Bfjdft8uwFLgNeX5acD/Bc5ZoPtKu1oW3P4ySS0Lan9pV8eEPj3dVzyCn7oL\nga9n5j/M90C6lZmfAw5NaL4c2F6ebweuaLHoxcCuzDyUmYeBXcDaWRtoB63qyMy/y8yjZfIuGvdW\n6Htttkk3fnC758z8Z2D8ds/zZrJaIiKAXwQ+MaeDmobM3J+ZXyrPvwM8SOPumwtxX2lZy0LcXybZ\nLt3om/2lUx2zsa8Y8FN3Je03wE9FxFci4tMR8Yq5HNQ0DGTmfmh88ICXtujT6tbC3e5Y8+FtwKfb\nzEvg7yJidzTultivri2nT7e1ORW80LbJvwEOZObDbeb35XaJiBXAq4EvsMD3lQm1NFtw+0uLWhbk\n/tJmm/R8XzHgpyAaN+f5eeB/tJj9JRqn7V8F/CnwP+dybLMkWrT15Z9dRMRvA0eBj7fp8rrMfA1w\nCXBNRPzMnA2uezcC/wI4F9hP43TdRAtmmxRvYvIjkr7bLhGxCPhb4J2Z+WS3i7Vom/ft0q6Whbi/\ntKhlQe4vk3y+er6vGPBTcwnwpcw8MHFGZj6ZmWPl+U7gpIg4c64HOAUHImIpQPl6sEWfjrcW7gfl\ngqafA67K8suqiTLz8fL1IPApGqfu+kpmHsjMZzPz+8CHaD3GBbFNACLiRODfATe369Nv2yUiTqLx\nzffjmfnJ0rwg95U2tSzI/aVVLQtxf5lkm8zKvmLAT03bn7Ai4kfK71CIiNU03tt/msOxTdUOYPxK\n33XArS363AFcFBFLyumvi0pb34iItcB1wM9n5lNt+rwoIk4bf06jjvtb9Z1P4yFSvIHWY1xIt3t+\nPfC1zNzXama/bZey/94EPJiZf9Q0a8HtK+1qWYj7yyS1LKj9ZZLPF8zWvjIfVxMuxAfwQzQC+/Sm\ntncA7yjPrwUeoHGV5l3Av57vMTeN8xM0TmH9Pxo/0V4NvBi4E3i4fD2j9B0E/qpp2bcBo+Xx1j6s\nY5TG79juLY+/KH1/FNhZnv942S5fKdvot/t0m3wU2APcR+Ob0NKJtZTpS2lcgfv1fq2ltH94fP9o\n6tu32wX4aRqnb+9r+jxdukD3lXa1LLj9ZZJaFtT+0q6OMm9W9hXvZCdJUoU8RS9JUoUMeEmSKmTA\nS5JUIQNekqQKGfCSJFXIgJckqUIGvCRJFTLgJUmq0P8HRHFgURsDZQUAAAAAS
UVORK5CYII=\n", 672 | "text/plain": [ 673 | "" 674 | ] 675 | }, 676 | "metadata": {}, 677 | "output_type": "display_data" 678 | } 679 | ], 680 | "source": [ 681 | "%matplotlib inline\n", 682 | "import matplotlib.pyplot as plt\n", 683 | "jobs_copy.hist(bins=50, figsize=(8,5))\n", 684 | "plt.show()" 685 | ] 686 | }, 687 | { 688 | "cell_type": "markdown", 689 | "metadata": {}, 690 | "source": [ 691 | "## 不相关职位筛选" 692 | ] 693 | }, 694 | { 695 | "cell_type": "markdown", 696 | "metadata": {}, 697 | "source": [ 698 | "### 筛选出职位名称包含:商业|数据分析|挖掘|分析|BI|BA|数据" 699 | ] 700 | }, 701 | { 702 | "cell_type": "code", 703 | "execution_count": 280, 704 | "metadata": {}, 705 | "outputs": [ 706 | { 707 | "data": { 708 | "text/plain": [ 709 | "数据分析师 5114\n", 710 | "电商运营 1144\n", 711 | "数据分析 1049\n", 712 | "产品经理 881\n", 713 | "网络推广 609\n", 714 | "Name: 职位名称, dtype: int64" 715 | ] 716 | }, 717 | "execution_count": 280, 718 | "metadata": {}, 719 | "output_type": "execute_result" 720 | } 721 | ], 722 | "source": [ 723 | "jobs_copy['职位名称'].value_counts().head()" 724 | ] 725 | }, 726 | { 727 | "cell_type": "code", 728 | "execution_count": 281, 729 | "metadata": { 730 | "collapsed": true 731 | }, 732 | "outputs": [], 733 | "source": [ 734 | "da_test = jobs_copy.astype(str)\n", 735 | "\n", 736 | "temp_0 = da_test[\n", 737 | " da_test['职位名称'].str.contains('商业|数据分析|挖掘|分析|BI|BA|数据')\n", 738 | " ]\n", 739 | "\n", 740 | "#data_analysis = jobs_result['职位名称']\n", 741 | "temp = list(temp_0['职位名称']) # 将含有关键字的列表表示为test1\n", 742 | "jobs_data = da_test[da_test['职位名称'].isin(temp)]" 743 | ] 744 | }, 745 | { 746 | "cell_type": "code", 747 | "execution_count": 282, 748 | "metadata": {}, 749 | "outputs": [], 750 | "source": [ 751 | "#jobs_data['职位名称'].value_counts()" 752 | ] 753 | }, 754 | { 755 | "cell_type": "code", 756 | "execution_count": 283, 757 | "metadata": {}, 758 | "outputs": [ 759 | { 760 | "data": { 761 | "text/plain": [ 762 | "(13716, 6)" 763 | ] 764 | }, 765 | "execution_count": 283, 766 | 
"metadata": {}, 767 | "output_type": "execute_result" 768 | } 769 | ], 770 | "source": [ 771 | "jobs_data.shape" 772 | ] 773 | }, 774 | { 775 | "cell_type": "code", 776 | "execution_count": 284, 777 | "metadata": {}, 778 | "outputs": [ 779 | { 780 | "data": { 781 | "text/html": [ 782 | "
\n", 783 | "\n", 796 | "\n", 797 | " \n", 798 | " \n", 799 | " \n", 800 | " \n", 801 | " \n", 802 | " \n", 803 | " \n", 804 | " \n", 805 | " \n", 806 | " \n", 807 | " \n", 808 | " \n", 809 | " \n", 810 | " \n", 811 | " \n", 812 | " \n", 813 | " \n", 814 | " \n", 815 | " \n", 816 | " \n", 817 | " \n", 818 | " \n", 819 | " \n", 820 | " \n", 821 | " \n", 822 | " \n", 823 | " \n", 824 | " \n", 825 | " \n", 826 | " \n", 827 | " \n", 828 | " \n", 829 | " \n", 830 | " \n", 831 | " \n", 832 | " \n", 833 | " \n", 834 | " \n", 835 | " \n", 836 | " \n", 837 | " \n", 838 | " \n", 839 | " \n", 840 | " \n", 841 | " \n", 842 | " \n", 843 | " \n", 844 | " \n", 845 | " \n", 846 | " \n", 847 | " \n", 848 | " \n", 849 | " \n", 850 | " \n", 851 | " \n", 852 | " \n", 853 | " \n", 854 | " \n", 855 | "
城市职位名称工作经验公司规模学历要求salary
0北京数据分析经验不限10000人以上本科8.0
2北京数据分析师3-5年1000-9999人本科20.0
3北京数据分析经验不限10000人以上本科9.0
4北京数据分析师3-5年10000人以上本科12.5
8北京数据分析师1-3年10000人以上本科25.5
\n", 856 | "
" 857 | ], 858 | "text/plain": [ 859 | " 城市 职位名称 工作经验 公司规模 学历要求 salary\n", 860 | "0 北京 数据分析 经验不限 10000人以上 本科 8.0\n", 861 | "2 北京 数据分析师 3-5年 1000-9999人 本科 20.0\n", 862 | "3 北京 数据分析 经验不限 10000人以上 本科 9.0\n", 863 | "4 北京 数据分析师 3-5年 10000人以上 本科 12.5\n", 864 | "8 北京 数据分析师 1-3年 10000人以上 本科 25.5" 865 | ] 866 | }, 867 | "execution_count": 284, 868 | "metadata": {}, 869 | "output_type": "execute_result" 870 | } 871 | ], 872 | "source": [ 873 | "dt_test = jobs_data.astype(str)\n", 874 | "y = dt_test[\n", 875 | " dt_test['职位名称'].str.contains('转行')\n", 876 | " ]\n", 877 | "\n", 878 | "test1 = list(y['职位名称']) # 将含有关键字的列表表示为test1\n", 879 | "test2 = list(dt_test['职位名称']) # 将全部表示为test2\n", 880 | "ret = list(set(test2) ^ set(test1)) # 列表求差集的方法将含有关键字的行除去\n", 881 | "\n", 882 | "jobs_data = dt_test[dt_test['职位名称'].isin(ret)]\n", 883 | "jobs_data.head()" 884 | ] 885 | }, 886 | { 887 | "cell_type": "code", 888 | "execution_count": 285, 889 | "metadata": {}, 890 | "outputs": [ 891 | { 892 | "name": "stdout", 893 | "output_type": "stream", 894 | "text": [ 895 | "\n", 896 | "Int64Index: 13665 entries, 0 to 65053\n", 897 | "Data columns (total 6 columns):\n", 898 | "城市 13665 non-null object\n", 899 | "职位名称 13665 non-null object\n", 900 | "工作经验 13665 non-null object\n", 901 | "公司规模 13665 non-null object\n", 902 | "学历要求 13665 non-null object\n", 903 | "salary 13665 non-null float64\n", 904 | "dtypes: float64(1), object(5)\n", 905 | "memory usage: 747.3+ KB\n" 906 | ] 907 | } 908 | ], 909 | "source": [ 910 | "jobs_data_copy = jobs_data.copy()\n", 911 | "jobs_data_copy['salary'] = pd.to_numeric(jobs_data_copy['salary'])\n", 912 | "jobs_data_copy.info()" 913 | ] 914 | }, 915 | { 916 | "cell_type": "code", 917 | "execution_count": 286, 918 | "metadata": {}, 919 | "outputs": [ 920 | { 921 | "data": { 922 | "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAfgAAAE/CAYAAACqxdFzAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMS4wLCBo\ndHRwOi8vbWF0cGxvdGxpYi5vcmcvpW3flQAAHGNJREFUeJzt3X+Q3Hd93/HnOxZ2DDKWjMvVkTSR\nE1QnTlQSczUmpPSEEv9ksNuJW7sOlsAdDcFQCGKwCDOYIUMrN3UokJRUYA1y6lgm/Kg1WBQ0hivD\nDHZBBCwbQyQ7in2ykADZMsImVPDuH/u5sD7t3u7d7u3tffx8zNzc7uf72e/3897vfu913+9+97uR\nmUiSpLr83HwPQJIk9Z8BL0lShQx4SZIqZMBLklQhA16SpAoZ8JIkVciAlwRARGREvGi+xyGpPwx4\nSZIqZMBL6klELJrvMUg6kQEvVSgiboiIAxHxg4j4dkSsjYjzI+LLEfFERByMiD+LiJPbPP6yiPib\niHgyIh6NiHc3TVtZDudfFxGPAJ+PiLsi4k1T5nFfRFwxt5VKaseAlyoTEecAbwT+RWaeBlwE7Ad+\nAvwhcCbwMmAt8IY2s/khcC2wBLgM+IMWYf2vgF8t898G/H7TGF4MLAN29qUoSTNmwEv1+QlwCnBu\nRDwnM/dn5kOZuTsz78nM45m5H/gfNEL6BJk5npl7MvOnmXkfcHuLvu/OzB9m5tPAncCqiFhVpr0G\nuCMzfzwXBUrqzICXKpOZ+4C3AO8GDkfE9oj4hYj4ZxHx6Yj4TkQ8CfwnGnvzJ4iIl0bEFyLiuxFx\nFHh9i76PNi3zH4CPAb8fET8HXA38Zd+Lk9Q1A16qUGb+VWb+NvCLQAI3AR8CvgWsysznA38ERJtZ\n/BWwA1iRmacDf9Gi79SvotwGXEPj0P9TmfnlftQiaXYMeKkyEXFORLwyIk4BfgQ8TeOw/WnAk8Cx\niPgV4A+mmc1pwJHM/FFEnA/8+07LLYH+U+Bm3HuX5p0BL9XnFGAz8D3gO8ALaeytv41GUP8A+DBw\nxzTzeAPwnoj4AfAuGoffu3ErsBr4n7MauaS+icypR9kkaXYi4lpgQ3l7QNI8cg9eUl9ExHNp7Plv\nme+xSDLgJfVBRFwEfBc4ROMEPUnzzEP0kiRVyD14SZIqZMBLklShof4WqDPPPDNXrlw538OY1g9/\n+EOe97znzfcw+qKWWmqpA6xlWNVSSy11QF217N69+3uZ+U96nc9QB/zKlSv56le/Ot/DmNb4+Dhj\nY2PzPYy+qKWWWuoAaxlWtdRSSx1QVy0R8ff9mI+H6CVJqlDHgI+IrRFxOCLubzHtbeV7oc8s9yMi\nPhAR+8p3QZ/X1HddROwtP+v6W4YkSWrWzR78R4GLpzZGxArgd4FHmpovAVaVnw00vtyCiDgDuBF4\nKXA+cGNELO1l4JIkqb2OAZ+ZXwSOtJj0PuDtPPMbpS4Hbs2Ge4AlEXEWcBGwKzOPZObjwC5a/NMg\nSZL6Y1bvwUfEq4EDmfmNKZOW0fQd0cBEaWvXLkmS5sCMz6Iv15t+J3Bhq8kt2nKa9lbz30Dj8D4j\nIyOMj4/PdIgDdezYsaEfY7dqqaWWOsBahlUttdRSB9RVS7/M5mNyvwycDXwjIgCWA18r3xk9Aaxo\n6rsceKy0j01pH28188zcQvmyitHR0Rz2jz3U9NGMWmqppQ6wlmFVSy211AF11dIvMz5En5l7MvOF\nmbkyM1fSCO/zMvM7wA7g2nI2/QXA0cw8CHwWuDAilpaT6y4sbZIkaQ508zG524EvA+dExEREXDdN\n953Aw8A+4MM0vjqSzDwC/DHwlfLzntImSZLmQMdD9Jl5dYfpK5tuJ3B9m35bga0zHJ8kSZoFr2Qn\nSVKFhvpa9NKz1cpNdwGwcfVx1pfbzfZvvmzQQ5K0wLgHL0lSh
Qx4SZIqZMBLklQhA16SpAoZ8JIk\nVciAlySpQga8JEkVMuAlSaqQAS9JUoUMeEmSKmTAS5JUIQNekqQKGfCSJFXIgJckqUIGvCRJFTLg\nJUmqkAEvSVKFDHhJkipkwEuSVCEDXpKkChnwkiRVyICXJKlCBrwkSRUy4CVJqpABL0lShQx4SZIq\nZMBLklShjgEfEVsj4nBE3N/U9icR8a2IuC8iPhURS5qmvSMi9kXEtyPioqb2i0vbvojY1P9SJEnS\npG724D8KXDylbRfw65n5z4G/Bd4BEBHnAlcBv1Ye898j4qSIOAn4c+AS4Fzg6tJXkiTNgY4Bn5lf\nBI5MaftcZh4vd+8BlpfblwPbM/MfMvPvgH3A+eVnX2Y+nJk/BraXvpIkaQ704z341wGfKbeXAY82\nTZsobe3aJUnSHFjUy4Mj4p3AceC2yaYW3ZLW/0hkm3luADYAjIyMMD4+3ssQ59yxY8eGfozdqqWW\nGurYuLpxgGzk1J/dbrYQ66thvUyqpZZa6oC6aumXWQd8RKwDXgWszczJsJ4AVjR1Ww48Vm63a3+G\nzNwCbAEYHR3NsbGx2Q5xIMbHxxn2MXarllpqqGP9pruARrjfvOfEzXT/NWMDHlHvalgvk2qppZY6\noK5a+mVWh+gj4mLgBuDVmflU06QdwFURcUpEnA2sAv4v8BVgVUScHREn0zgRb0dvQ5ckSe103IOP\niNuBMeDMiJgAbqRx1vwpwK6IALgnM1+fmQ9ExMeAb9I4dH99Zv6kzOeNwGeBk4CtmfnAHNQjSZLo\nIuAz8+oWzbdM0/+9wHtbtO8Eds5odJIkaVa8kp0kSRUy4CVJqpABL0lShQx4SZIqZMBLklQhA16S\npAoZ8JIkVciAlySpQga8JEkVMuAlSaqQAS9JUoUMeEmSKmTAS5JUIQNekqQKGfCSJFXIgJckqUIG\nvCRJFTLgJUmqkAEvSVKFDHhJkipkwEuSVCEDXpKkChnwkiRVyICXJKlCBrwkSRUy4CVJqpABL0lS\nhQx4SZIqZMBLklShjgEfEVsj4nBE3N/UdkZE7IqIveX30tIeEfGBiNgXEfdFxHlNj1lX+u+NiHVz\nU44kSQJY1EWfjwJ/Btza1LYJuDszN0fEpnL/BuASYFX5eSnwIeClEXEGcCMwCiSwOyJ2ZObj/Spk\nUFZuuusZ9zeuPs76prb9my8b9JAkSTpBxz34zPwicGRK8+XAtnJ7G3BFU/ut2XAPsCQizgIuAnZl\n5pES6ruAi/tRgCRJOtFs34MfycyDAOX3C0v7MuDRpn4Tpa1duyRJmgPdHKKfiWjRltO0nziDiA3A\nBoCRkRHGx8f7Nrh+2Lj6+DPuj5z6zLZhG+9MHDt2bEGPf1INdUy+pqa+viYtxPpqWC+Taqmlljqg\nrlr6ZbYBfygizsrMg+UQ/OHSPgGsaOq3HHistI9NaR9vNePM3AJsARgdHc2xsbFW3ebN+hbvwd+8\n52dP4/5rxgY8ov4ZHx9n2J7v2aihjsnX2dTX16SF+DqrYb1MqqWWWuqAumrpl9keot8BTJ4Jvw64\ns6n92nI2/QXA0XII/7PAhRGxtJxxf2FpkyRJc6DjHnxE3E5j7/vMiJigcTb8ZuBjEXEd8AhwZem+\nE7gU2Ac8BbwWIDOPRMQfA18p/d6TmVNP3JMkSX3SMeAz8+o2k9a26JvA9W3msxXYOqPRSZKkWfFK\ndpIkVciAlySpQga8JEkVMuAlSaqQAS9JUoUMeEmSKmTAS5JUoX5fi37oTf2616n8uldJUg3cg5ck\nqUIGvCRJFTLgJUmqkAEvSVKFDHhJkipkwEuSVKFn3cfkOun0MTpJqsWeA0dZ70eHq+UevCRJFXIP\nvs+8kI4kaRi4By9JUoXcg19gujlHwKMEkiT34CVJqpABL0lShQx4SZIqZMBLklQhA16SpAoZ8JIk\nVciAlySpQga8JEkVMuAlS
aqQV7LTwHm9fkmaez3twUfEH0bEAxFxf0TcHhE/HxFnR8S9EbE3Iu6I\niJNL31PK/X1l+sp+FCBJkk4064CPiGXAfwRGM/PXgZOAq4CbgPdl5irgceC68pDrgMcz80XA+0o/\nSZI0B3p9D34RcGpELAKeCxwEXgl8vEzfBlxRbl9e7lOmr42I6HH5kiSphVkHfGYeAP4r8AiNYD8K\n7AaeyMzjpdsEsKzcXgY8Wh57vPR/wWyXL0mS2ovMnN0DI5YCnwD+HfAE8Nfl/o3lMDwRsQLYmZmr\nI+IB4KLMnCjTHgLOz8zvT5nvBmADwMjIyEu2b98+q/G1s+fA0b7Ob+RUOPR09/1XLzu9p+V1M/7Z\nLuPYsWMsXrx4Vo+diU419PocDaqOuTT5HLV7ffX6HM2HGtbLpFpqOXzkaMe/XwvltVbLOgFYs2bN\n7swc7XU+vZxF/zvA32XmdwEi4pPAbwFLImJR2UtfDjxW+k8AK4CJckj/dODI1Jlm5hZgC8Do6GiO\njY31MMQTre/i+9RnYuPq49y8p/uncf81Yz0tr5vxz3YZ4+Pj9Pv5bqVTDb0+R4OqYy5NPkftXl+9\nPkfzoYb1MqmWWj54250d/34tlNdaLeukn3p5D/4R4IKIeG55L30t8E3gC8DvlT7rgDvL7R3lPmX6\n53O2hw8kSdK0enkP/l4aJ8t9DdhT5rUFuAF4a0Tso/Ee+y3lIbcALyjtbwU29TBuSZI0jZ4udJOZ\nNwI3Tml+GDi/Rd8fAVf2sjxJktQdL1UrSVKFDHhJkipkwEuSVCG/bEbV2XPgaOeP4vmFNpIq5x68\nJEkVMuAlSaqQAS9JUoUMeEmSKmTAS5JUIQNekqQKGfCSJFXIgJckqUIGvCRJFTLgJUmqkAEvSVKF\nDHhJkipkwEuSVCEDXpKkChnwkiRVyICXJKlCBrwkSRVaNN8DkKZauemuaafv33zZgEYiSQuXe/CS\nJFXIgJckqUIGvCRJFTLgJUmqkAEvSVKFDHhJkipkwEuSVKGeAj4ilkTExyPiWxHxYES8LCLOiIhd\nEbG3/F5a+kZEfCAi9kXEfRFxXn9KkCRJU/W6B/9+4H9n5q8ALwYeBDYBd2fmKuDuch/gEmBV+dkA\nfKjHZUuSpDZmHfAR8XzgFcAtAJn548x8Argc2Fa6bQOuKLcvB27NhnuAJRFx1qxHLkmS2orMnN0D\nI34D2AJ8k8be+27gzcCBzFzS1O/xzFwaEZ8GNmfml0r73cANmfnVKfPdQGMPn5GRkZds3759VuNr\nZ8+Bo32d38ipcOjp7vuvXnZ6T8vrZvydltFuHpO19DrGTnpdB53Gd/jI0Y7rZK5r7NXkc9Tu9TXs\n42/l2LFjLF68eL6H0Re11FLDtjKplnUCsGbNmt2ZOdrrfHq5Fv0i4DzgTZl5b0S8n58djm8lWrSd\n8N9FZm6h8Y8Do6OjOTY21sMQT7S+w3XOZ2rj6uPcvKf7p3H/NWM9La+b8XdaRrt5TNbS6xg76XUd\ndBrfB2+7s+M6mesaezX5HLV7fQ37+FsZHx+n39vzfKmllhq2lUm1rJN+6uU9+AlgIjPvLfc/TiPw\nD00eei+/Dzf1X9H0+OXAYz0sX5IktTHrgM/M7wCPRsQ5pWktjcP1O4B1pW0dcGe5vQO4tpxNfwFw\nNDMPznb5kiSpvV6/LvZNwG0RcTLwMPBaGv80fCwirgMeAa4sfXcClwL7gKdKX0mSNAd6CvjM/DrQ\n6kSAtS36JnB9L8uTJEnd8Up2kiRVyICXJKlCBrwkSRUy4CVJqlCvZ9FLUrVWTnNRqPWb7mL/5ssG\nPCKpe+7BS5JUIQNekqQKGfCSJFXIgJckqUIGvCRJFTLgJUmqkAEvSVKFDHhJkipkwEuSVCEDXpKk\nChnwkiRVyICXJKlCBrwkSRUy4CVJqpABL0lShQx4SZIqZMBLklQhA16SpAoZ8JIkVciAlyS
pQga8\nJEkVMuAlSaqQAS9JUoV6DviIOCki/iYiPl3unx0R90bE3oi4IyJOLu2nlPv7yvSVvS5bkiS11o89\n+DcDDzbdvwl4X2auAh4Hrivt1wGPZ+aLgPeVfpIkaQ70FPARsRy4DPhIuR/AK4GPly7bgCvK7cvL\nfcr0taW/JEnqs1734P8b8Hbgp+X+C4AnMvN4uT8BLCu3lwGPApTpR0t/SZLUZ5GZs3tgxKuASzPz\nDRExBrwNeC3w5XIYnohYAezMzNUR8QBwUWZOlGkPAedn5venzHcDsAFgZGTkJdu3b59dZW3sOXC0\nr/MbORUOPd19/9XLTu9ped2Mv9My2s1jspZex9hJr+ug0/gOHznacZ3MdY29mnyO2r2+hn38rRw7\ndozFixfP9zBmZL63lblWw7YyaSG+vtpZs2bN7swc7XU+i3p47MuBV0fEpcDPA8+nsUe/JCIWlb30\n5cBjpf8EsAKYiIhFwOnAkakzzcwtwBaA0dHRHBsb62GIJ1q/6a6+zm/j6uPcvKf7p3H/NWM9La+b\n8XdaRrt5TNbS6fErO4xh/+bLZrX8bnUa3wdvu7PjOul1Pcy1yeeo3etr2Mffyvj4OP3enudar9vK\nsKthW5m0EF9fc23Wh+gz8x2ZuTwzVwJXAZ/PzGuALwC/V7qtA+4st3eU+5Tpn8/ZHj6QJEnTmovP\nwd8AvDUi9tF4j/2W0n4L8ILS/lZg0xwsW5Ik0dsh+n+UmePAeLn9MHB+iz4/Aq7sx/IkSdL0vJKd\nJEkVMuAlSaqQAS9JUoUMeEmSKtSXk+zUP50+Yy5JUjcMeEnzotcLJkmanofoJUmqkAEvSVKFDHhJ\nkipkwEuSVCEDXpKkChnwkiRVyICXJKlCBrwkSRUy4CVJqpBXspM0J6ZeqW7j6uOs91LM0sC4By9J\nUoUMeEmSKmTAS5JUIQNekqQKGfCSJFXIgJckqUJ+TE6q0NSPqLWyf/NlAxiJpPliwFeomz/ukqS6\neYhekqQKGfCSJFXIQ/Tqu2fDWwSdavT9bUnzzT14SZIqZMBLklShWQd8RKyIiC9ExIMR8UBEvLm0\nnxERuyJib/m9tLRHRHwgIvZFxH0RcV6/ipAkSc/Uyx78cWBjZv4qcAFwfUScC2wC7s7MVcDd5T7A\nJcCq8rMB+FAPy5YkSdOY9Ul2mXkQOFhu/yAiHgSWAZcDY6XbNmAcuKG035qZCdwTEUsi4qwyH0mS\nZmzyhNeNq4+zvsXJr8/mE1778h58RKwEfhO4FxiZDO3y+4Wl2zLg0aaHTZQ2SZLUZ9HYoe5hBhGL\ngf8DvDczPxkRT2Tmkqbpj2fm0oi4C/jPmfml0n438PbM3D1lfhtoHMJnZGTkJdu3b+9pfFPtOXC0\nr/MbORUOPd19/9XLTp92er/HNxOTtQzzGKHz+A4fOdpxnXSaRyednoN+zb/d66sf66jXMXYydQz9\n3lYGod3z2O22MuwGsa3MtV63lWG0Zs2a3Zk52ut8evocfEQ8B/gEcFtmfrI0H5o89B4RZwGHS/sE\nsKLp4cuBx6bOMzO3AFsARkdHc2xsrJchnqDVIZxebFx9nJv3dP807r9mbNrp/R7fTEzWMsxjhM7P\n4Qdvu7PjOuk0j046PQf9mn+711c/1lGvY+xk6hj6va0MQrvnsdttZdgNYluZa71uKzXr5Sz6AG4B\nHszMP22atANYV26vA+5sar+2nE1/AXDU998lSZobvezBvxx4DbAnIr5e2v4I2Ax8LCKuAx4BrizT\ndgKXAvuAp4DX9rBsSZI0jV7Oov8SEG0mr23RP4HrZ7s8SZLUPa9kJ0lShQx4SZIqZMBLklQhA16S\npAoZ8JIkVciAlySpQj1dyU5aqFZ2uhLds/gLKiTVwYCXWuj0D4AkDTsP0UuSVCEDXpKkChnwkiRV\nyICXJKlCBrwkSRUy4CVJqpABL0lShQx4SZIqZMBLklQ
hA16SpAoZ8JIkVciAlySpQga8JEkV8tvk\npHngt9XNPZ/j4eBXM88fA15SS/5hlhY2D9FLklQh9+AlLUg1HIL3KInmknvwkiRVyD14SVJbHmVY\nuNyDlySpQu7BS5qVGt4Dl2o+QjHwgI+Ii4H3AycBH8nMzYMegyTp2eHZ/I/oQAM+Ik4C/hz4XWAC\n+EpE7MjMbw5yHJK0ENS8d6m5N+g9+POBfZn5MEBEbAcuBwx4aQaezXsl0iB1s60N6z9agw74ZcCj\nTfcngJcOeAySNBD+I6b5FJk5uIVFXAlclJn/odx/DXB+Zr6pqc8GYEO5ew7w7YENcHbOBL4334Po\nk1pqqaUOsJZhVUsttdQBddVyTmae1utMBr0HPwGsaLq/HHisuUNmbgG2DHJQvYiIr2bm6HyPox9q\nqaWWOsBahlUttdRSB9RXSz/mM+jPwX8FWBURZ0fEycBVwI4Bj0GSpOoNdA8+M49HxBuBz9L4mNzW\nzHxgkGOQJOnZYOCfg8/MncDOQS93Di2YtxO6UEsttdQB1jKsaqmlljrAWk4w0JPsJEnSYHgtekmS\nKmTAdyEizomIrzf9PBkRb5nSZywijjb1edd8jXeqiNgaEYcj4v6mtjMiYldE7C2/l7Z57LrSZ29E\nrBvcqFuOpVUdfxIR34qI+yLiUxGxpM1j90fEnrJu+nKGai/a1PLuiDjQ9Bq6tM1jL46Ib0fEvojY\nNLhRt9amljua6tgfEV9v89ihWS8RsSIivhARD0bEAxHx5tK+ELeVdrUsuO1lmloW1PYyTR1zt61k\npj8z+KFxcuB3gF+c0j4GfHq+x9dmzK8AzgPub2r7L8CmcnsTcFOLx50BPFx+Ly23lw5ZHRcCi8rt\nm1rVUabtB86c73XRoZZ3A2/r8LiTgIeAXwJOBr4BnDtstUyZfjPwrmFfL8BZwHnl9mnA3wLnLtBt\npV0tC257maaWBbW9tKtjSp++bivuwc/cWuChzPz7+R5ItzLzi8CRKc2XA9vK7W3AFS0eehGwKzOP\nZObjwC7g4jkbaAet6sjMz2Xm8XL3HhrXVhh6bdZJN/7xcs+Z+WNg8nLP82a6WiIigH8L3D7QQc1C\nZh7MzK+V2z8AHqRx9c2FuK20rGUhbi/TrJduDM320qmOudhWDPiZu4r2K+BlEfGNiPhMRPzaIAc1\nCyOZeRAaLzzghS36tLq0cLcb1nx4HfCZNtMS+FxE7I7G1RKH1RvL4dOtbQ4FL7R18i+BQ5m5t830\noVwvEbES+E3gXhb4tjKllmYLbntpUcuC3F7arJO+bysG/AxE4+I8rwb+usXkr9E4bP9i4IPA/xrk\n2OZItGgbyo9dRMQ7gePAbW26vDwzzwMuAa6PiFcMbHDd+xDwy8BvAAdpHK6basGsk+Jqpt8jGbr1\nEhGLgU8Ab8nMJ7t9WIu2eV8v7WpZiNtLi1oW5PYyzeur79uKAT8zlwBfy8xDUydk5pOZeazc3gk8\nJyLOHPQAZ+BQRJwFUH4fbtGn46WFh0E5oelVwDVZ3qyaKjMfK78PA5+icehuqGTmocz8SWb+FPgw\nrce4INYJQEQsAv4NcEe7PsO2XiLiOTT++N6WmZ8szQtyW2lTy4LcXlrVshC3l2nWyZxsKwb8zLT9\nDysi/ml5D4WIOJ/Gc/v9AY5tpnYAk2f6rgPubNHns8CFEbG0HP66sLQNjYi4GLgBeHVmPtWmz/Mi\n4rTJ2zTquL9V3/k0GSLFv6b1GBfS5Z5/B/hWZk60mjhs66Vsv7cAD2bmnzZNWnDbSrtaFuL2Mk0t\nC2p7meb1BXO1rczH2YQL8Qd4Lo3APr2p7fXA68vtNwIP0DhL8x7gt+Z7zE3jvJ3GIaz/R+M/2uuA\nFwB3A3vL7zNK31HgI02PfR2wr/y8dgjr2EfjPbavl5+/KH1/AdhZbv9SWS/fKOvonUO6Tv4S2APc\nR+OP0FlTayn3L6V
xBu5Dw1pLaf/o5PbR1Hdo1wvw2zQO397X9Hq6dIFuK+1qWXDbyzS1LKjtpV0d\nZdqcbCteyU6SpAp5iF6SpAoZ8JIkVciAlySpQga8JEkVMuAlSaqQAS9JUoUMeEmSKmTAS5JUof8P\nLIwC2ZVVELMAAAAASUVORK5CYII=\n", 923 | "text/plain": [ 924 | "" 925 | ] 926 | }, 927 | "metadata": {}, 928 | "output_type": "display_data" 929 | } 930 | ], 931 | "source": [ 932 | "%matplotlib inline\n", 933 | "import matplotlib.pyplot as plt\n", 934 | "jobs_data_copy.hist(bins=50, figsize=(8,5))\n", 935 | "# save_fig(\"attribute_histogram_plots\")\n", 936 | "plt.show()" 937 | ] 938 | }, 939 | { 940 | "cell_type": "markdown", 941 | "metadata": {}, 942 | "source": [ 943 | "## 筛选岗位数量前150的城市" 944 | ] 945 | }, 946 | { 947 | "cell_type": "code", 948 | "execution_count": 287, 949 | "metadata": {}, 950 | "outputs": [], 951 | "source": [ 952 | "cities = jobs_data_copy['城市'].value_counts()[:150].index" 953 | ] 954 | }, 955 | { 956 | "cell_type": "code", 957 | "execution_count": 288, 958 | "metadata": {}, 959 | "outputs": [], 960 | "source": [ 961 | "temp_list = []\n", 962 | "for item in jobs_data_copy['城市'].values:\n", 963 | " temp_list.append(item in cities)" 964 | ] 965 | }, 966 | { 967 | "cell_type": "code", 968 | "execution_count": 289, 969 | "metadata": {}, 970 | "outputs": [], 971 | "source": [ 972 | "#temp_list" 973 | ] 974 | }, 975 | { 976 | "cell_type": "code", 977 | "execution_count": 290, 978 | "metadata": {}, 979 | "outputs": [], 980 | "source": [ 981 | "#temp_list\n", 982 | "jobs_data_copy = jobs_data_copy[temp_list]" 983 | ] 984 | }, 985 | { 986 | "cell_type": "code", 987 | "execution_count": 291, 988 | "metadata": {}, 989 | "outputs": [ 990 | { 991 | "data": { 992 | "text/html": [ 993 | "
\n", 994 | "\n", 1007 | "\n", 1008 | " \n", 1009 | " \n", 1010 | " \n", 1011 | " \n", 1012 | " \n", 1013 | " \n", 1014 | " \n", 1015 | " \n", 1016 | " \n", 1017 | " \n", 1018 | " \n", 1019 | " \n", 1020 | " \n", 1021 | " \n", 1022 | " \n", 1023 | " \n", 1024 | " \n", 1025 | " \n", 1026 | " \n", 1027 | " \n", 1028 | " \n", 1029 | " \n", 1030 | " \n", 1031 | " \n", 1032 | " \n", 1033 | " \n", 1034 | " \n", 1035 | " \n", 1036 | " \n", 1037 | " \n", 1038 | " \n", 1039 | " \n", 1040 | " \n", 1041 | " \n", 1042 | " \n", 1043 | " \n", 1044 | " \n", 1045 | " \n", 1046 | " \n", 1047 | " \n", 1048 | " \n", 1049 | " \n", 1050 | " \n", 1051 | " \n", 1052 | " \n", 1053 | " \n", 1054 | " \n", 1055 | " \n", 1056 | " \n", 1057 | " \n", 1058 | " \n", 1059 | " \n", 1060 | " \n", 1061 | " \n", 1062 | " \n", 1063 | " \n", 1064 | " \n", 1065 | " \n", 1066 | "
城市职位名称工作经验公司规模学历要求salary
0北京数据分析经验不限10000人以上本科8.0
2北京数据分析师3-5年1000-9999人本科20.0
3北京数据分析经验不限10000人以上本科9.0
4北京数据分析师3-5年10000人以上本科12.5
8北京数据分析师1-3年10000人以上本科25.5
\n", 1067 | "
" 1068 | ], 1069 | "text/plain": [ 1070 | " 城市 职位名称 工作经验 公司规模 学历要求 salary\n", 1071 | "0 北京 数据分析 经验不限 10000人以上 本科 8.0\n", 1072 | "2 北京 数据分析师 3-5年 1000-9999人 本科 20.0\n", 1073 | "3 北京 数据分析 经验不限 10000人以上 本科 9.0\n", 1074 | "4 北京 数据分析师 3-5年 10000人以上 本科 12.5\n", 1075 | "8 北京 数据分析师 1-3年 10000人以上 本科 25.5" 1076 | ] 1077 | }, 1078 | "execution_count": 291, 1079 | "metadata": {}, 1080 | "output_type": "execute_result" 1081 | } 1082 | ], 1083 | "source": [ 1084 | "jobs_data_copy.head()" 1085 | ] 1086 | }, 1087 | { 1088 | "cell_type": "code", 1089 | "execution_count": 292, 1090 | "metadata": {}, 1091 | "outputs": [], 1092 | "source": [ 1093 | "#jobs_data_copy['城市'].value_counts()" 1094 | ] 1095 | }, 1096 | { 1097 | "cell_type": "markdown", 1098 | "metadata": {}, 1099 | "source": [ 1100 | "## 相同属性合并" 1101 | ] 1102 | }, 1103 | { 1104 | "cell_type": "code", 1105 | "execution_count": 293, 1106 | "metadata": {}, 1107 | "outputs": [], 1108 | "source": [ 1109 | "experience_shift = {\n", 1110 | " '1-3年': '1-3年',\n", 1111 | " '3-5年': '3-5年',\n", 1112 | " '5-10年': '5-10年',\n", 1113 | " '应届生':'1年以内',\n", 1114 | " '1年以下':'1年以内',\n", 1115 | " '1年以内': '1年以内',\n", 1116 | " '无经验': '1年以内',\n", 1117 | " '经验不限': '经验不限',\n", 1118 | " '不限': '经验不限'\n", 1119 | "}\n", 1120 | "\n", 1121 | "scale_shift = {\n", 1122 | " '100-499人':'100-499人',\n", 1123 | " '0-20人':'0-20人',\n", 1124 | " '20人以下':'0-20人',\n", 1125 | " '20-99人':'20-99人',\n", 1126 | " '100-499人':'100-499人',\n", 1127 | " '500-999人':'500-999人',\n", 1128 | " '1000-9999人':'1000-9999人',\n", 1129 | " '10000人以上':'10000人以上',\n", 1130 | " '':'100-499人'\n", 1131 | "}\n", 1132 | "\n", 1133 | "degree_shift = {\n", 1134 | " '中专/中技': '中专',\n", 1135 | " '中技': '中专',\n", 1136 | " '中专': '中专',\n", 1137 | " '高中': '高中',\n", 1138 | " '大专': '大专',\n", 1139 | " '本科': '本科',\n", 1140 | " '硕士': '硕士',\n", 1141 | " '博士': '博士',\n", 1142 | " '不限': '学历不限',\n", 1143 | " '学历不限': '学历不限'\n", 1144 | "}" 1145 | ] 1146 | }, 1147 | { 1148 | "cell_type": "code", 1149 | 
"execution_count": 294, 1150 | "metadata": {}, 1151 | "outputs": [], 1152 | "source": [ 1153 | "jobs_data_copy['工作经验'] = jobs_data_copy['工作经验'].map(experience_shift)\n", 1154 | "jobs_data_copy['公司规模'] = jobs_data_copy['公司规模'].map(scale_shift)\n", 1155 | "jobs_data_copy['学历要求'] = jobs_data_copy['学历要求'].map(degree_shift)" 1156 | ] 1157 | }, 1158 | { 1159 | "cell_type": "code", 1160 | "execution_count": 295, 1161 | "metadata": {}, 1162 | "outputs": [ 1163 | { 1164 | "data": { 1165 | "text/plain": [ 1166 | "1-3年 5219\n", 1167 | "3-5年 4271\n", 1168 | "经验不限 1309\n", 1169 | "5-10年 825\n", 1170 | "1年以内 825\n", 1171 | "Name: 工作经验, dtype: int64" 1172 | ] 1173 | }, 1174 | "execution_count": 295, 1175 | "metadata": {}, 1176 | "output_type": "execute_result" 1177 | } 1178 | ], 1179 | "source": [ 1180 | "jobs_data_copy['工作经验'].value_counts()" 1181 | ] 1182 | }, 1183 | { 1184 | "cell_type": "code", 1185 | "execution_count": 296, 1186 | "metadata": {}, 1187 | "outputs": [ 1188 | { 1189 | "data": { 1190 | "text/plain": [ 1191 | "100-499人 3843\n", 1192 | "20-99人 2848\n", 1193 | "1000-9999人 2503\n", 1194 | "10000人以上 1648\n", 1195 | "500-999人 1292\n", 1196 | "0-20人 315\n", 1197 | "Name: 公司规模, dtype: int64" 1198 | ] 1199 | }, 1200 | "execution_count": 296, 1201 | "metadata": {}, 1202 | "output_type": "execute_result" 1203 | } 1204 | ], 1205 | "source": [ 1206 | "jobs_data_copy['公司规模'].value_counts()" 1207 | ] 1208 | }, 1209 | { 1210 | "cell_type": "code", 1211 | "execution_count": 297, 1212 | "metadata": {}, 1213 | "outputs": [ 1214 | { 1215 | "data": { 1216 | "text/plain": [ 1217 | "本科 8941\n", 1218 | "大专 2479\n", 1219 | "硕士 564\n", 1220 | "学历不限 420\n", 1221 | "中专 21\n", 1222 | "博士 13\n", 1223 | "高中 11\n", 1224 | "Name: 学历要求, dtype: int64" 1225 | ] 1226 | }, 1227 | "execution_count": 297, 1228 | "metadata": {}, 1229 | "output_type": "execute_result" 1230 | } 1231 | ], 1232 | "source": [ 1233 | "jobs_data_copy['学历要求'].value_counts()" 1234 | ] 1235 | }, 1236 | { 1237 | "cell_type": 
"markdown", 1238 | "metadata": {}, 1239 | "source": [ 1240 | "# 划分训练集与测试集" 1241 | ] 1242 | }, 1243 | { 1244 | "cell_type": "code", 1245 | "execution_count": 298, 1246 | "metadata": { 1247 | "collapsed": true 1248 | }, 1249 | "outputs": [], 1250 | "source": [ 1251 | "from sklearn.model_selection import train_test_split\n", 1252 | "\n", 1253 | "train_set, test_set = train_test_split(jobs_data_copy, test_size=0.2, random_state=42)" 1254 | ] 1255 | }, 1256 | { 1257 | "cell_type": "code", 1258 | "execution_count": 299, 1259 | "metadata": { 1260 | "collapsed": true 1261 | }, 1262 | "outputs": [], 1263 | "source": [ 1264 | "datas_train = train_set.copy()\n", 1265 | "datas_test = test_set.copy()" 1266 | ] 1267 | }, 1268 | { 1269 | "cell_type": "code", 1270 | "execution_count": 300, 1271 | "metadata": {}, 1272 | "outputs": [ 1273 | { 1274 | "data": { 1275 | "text/html": [ 1276 | "
\n", 1277 | "\n", 1290 | "\n", 1291 | " \n", 1292 | " \n", 1293 | " \n", 1294 | " \n", 1295 | " \n", 1296 | " \n", 1297 | " \n", 1298 | " \n", 1299 | " \n", 1300 | " \n", 1301 | " \n", 1302 | " \n", 1303 | " \n", 1304 | " \n", 1305 | " \n", 1306 | " \n", 1307 | " \n", 1308 | " \n", 1309 | " \n", 1310 | " \n", 1311 | " \n", 1312 | " \n", 1313 | " \n", 1314 | " \n", 1315 | " \n", 1316 | " \n", 1317 | " \n", 1318 | " \n", 1319 | " \n", 1320 | " \n", 1321 | " \n", 1322 | " \n", 1323 | " \n", 1324 | " \n", 1325 | " \n", 1326 | " \n", 1327 | " \n", 1328 | " \n", 1329 | " \n", 1330 | " \n", 1331 | " \n", 1332 | " \n", 1333 | " \n", 1334 | " \n", 1335 | " \n", 1336 | " \n", 1337 | " \n", 1338 | " \n", 1339 | " \n", 1340 | " \n", 1341 | " \n", 1342 | " \n", 1343 | " \n", 1344 | " \n", 1345 | " \n", 1346 | " \n", 1347 | " \n", 1348 | " \n", 1349 | "
城市职位名称工作经验公司规模学历要求salary
21110南阳数据分析1-3年20-99人本科6.5
56033云浮化妆品数据分析员1-3年100-499人本科8.5
37705泉州数据分析专员1-3年100-499人本科6.5
24622淮安数据分析1-3年1000-9999人大专6.5
37106宁德数据分析师(福州)1-3年100-499人本科11.5
\n", 1350 | "
" 1351 | ], 1352 | "text/plain": [ 1353 | " 城市 职位名称 工作经验 公司规模 学历要求 salary\n", 1354 | "21110 南阳 数据分析 1-3年 20-99人 本科 6.5\n", 1355 | "56033 云浮 化妆品数据分析员 1-3年 100-499人 本科 8.5\n", 1356 | "37705 泉州 数据分析专员 1-3年 100-499人 本科 6.5\n", 1357 | "24622 淮安 数据分析 1-3年 1000-9999人 大专 6.5\n", 1358 | "37106 宁德 数据分析师(福州) 1-3年 100-499人 本科 11.5" 1359 | ] 1360 | }, 1361 | "execution_count": 300, 1362 | "metadata": {}, 1363 | "output_type": "execute_result" 1364 | } 1365 | ], 1366 | "source": [ 1367 | "datas_train.head()" 1368 | ] 1369 | }, 1370 | { 1371 | "cell_type": "markdown", 1372 | "metadata": {}, 1373 | "source": [ 1374 | "## 为 标签/类别 属性编码" 1375 | ] 1376 | }, 1377 | { 1378 | "cell_type": "markdown", 1379 | "metadata": {}, 1380 | "source": [ 1381 | "### 标签/类别 训练集/测试集划分" 1382 | ] 1383 | }, 1384 | { 1385 | "cell_type": "code", 1386 | "execution_count": 301, 1387 | "metadata": {}, 1388 | "outputs": [ 1389 | { 1390 | "data": { 1391 | "text/html": [ 1392 | "
\n", 1393 | "\n", 1406 | "\n", 1407 | " \n", 1408 | " \n", 1409 | " \n", 1410 | " \n", 1411 | " \n", 1412 | " \n", 1413 | " \n", 1414 | " \n", 1415 | " \n", 1416 | " \n", 1417 | " \n", 1418 | " \n", 1419 | " \n", 1420 | " \n", 1421 | " \n", 1422 | " \n", 1423 | " \n", 1424 | " \n", 1425 | " \n", 1426 | " \n", 1427 | " \n", 1428 | " \n", 1429 | " \n", 1430 | " \n", 1431 | " \n", 1432 | " \n", 1433 | " \n", 1434 | " \n", 1435 | " \n", 1436 | " \n", 1437 | " \n", 1438 | " \n", 1439 | " \n", 1440 | " \n", 1441 | " \n", 1442 | " \n", 1443 | " \n", 1444 | " \n", 1445 | " \n", 1446 | " \n", 1447 | " \n", 1448 | " \n", 1449 | " \n", 1450 | " \n", 1451 | " \n", 1452 | " \n", 1453 | "
城市公司规模学历要求工作经验
21110南阳20-99人本科1-3年
56033云浮100-499人本科1-3年
37705泉州100-499人本科1-3年
24622淮安1000-9999人大专1-3年
37106宁德100-499人本科1-3年
\n", 1454 | "
" 1455 | ], 1456 | "text/plain": [ 1457 | " 城市 公司规模 学历要求 工作经验\n", 1458 | "21110 南阳 20-99人 本科 1-3年\n", 1459 | "56033 云浮 100-499人 本科 1-3年\n", 1460 | "37705 泉州 100-499人 本科 1-3年\n", 1461 | "24622 淮安 1000-9999人 大专 1-3年\n", 1462 | "37106 宁德 100-499人 本科 1-3年" 1463 | ] 1464 | }, 1465 | "execution_count": 301, 1466 | "metadata": {}, 1467 | "output_type": "execute_result" 1468 | } 1469 | ], 1470 | "source": [ 1471 | "cata_train = datas_train[['城市','公司规模','学历要求','工作经验']] # 训练集\n", 1472 | "cata_test = datas_test[['城市','公司规模','学历要求','工作经验']] # 测试集\n", 1473 | "cata_train.head()" 1474 | ] 1475 | }, 1476 | { 1477 | "cell_type": "code", 1478 | "execution_count": 302, 1479 | "metadata": { 1480 | "collapsed": true 1481 | }, 1482 | "outputs": [], 1483 | "source": [ 1484 | "from sklearn.preprocessing import OneHotEncoder\n", 1485 | "\n", 1486 | "cat_encoder = OneHotEncoder(sparse=False)" 1487 | ] 1488 | }, 1489 | { 1490 | "cell_type": "code", 1491 | "execution_count": 303, 1492 | "metadata": { 1493 | "collapsed": true 1494 | }, 1495 | "outputs": [], 1496 | "source": [ 1497 | "job_cata_train = cat_encoder.fit_transform(cata_train)\n", 1498 | "job_cata_test = cat_encoder.fit_transform(cata_test)" 1499 | ] 1500 | }, 1501 | { 1502 | "cell_type": "code", 1503 | "execution_count": 304, 1504 | "metadata": {}, 1505 | "outputs": [ 1506 | { 1507 | "data": { 1508 | "text/plain": [ 1509 | "[array(['三明', '上海', '东沙群岛', '东莞', '东营', '中山', '临沂', '丽水', '乐山', '云浮', '亳州',\n", 1510 | " '仙桃', '佛山', '六安', '内江', '凉山', '北京', '十堰', '南京', '南充', '南平', '南通',\n", 1511 | " '南阳', '厦门', '台州', '合肥', '周口', '咸宁', '咸阳', '商丘', '商洛', '嘉兴', '大连',\n", 1512 | " '天津', '天门', '威海', '娄底', '孝感', '宁德', '安庆', '安康', '安阳', '宜宾', '宜昌',\n", 1513 | " '宣城', '宿州', '宿迁', '岳阳', '巴中', '常州', '常德', '平顶山', '广元', '广安', '广州',\n", 1514 | " '开封', '张家界', '徐州', '德阳', '怀化', '恩施', '惠州', '成都', '扬州', '揭阳', '攀枝花',\n", 1515 | " '新乡', '日照', '杭州', '枣庄', '株洲', '梅州', '武汉', '永州', '汕头', '汕尾', '江门',\n", 1516 | " '池州', '河源', '泉州', '泰安', '泰州', '泸州', '洛阳', '济南', '济宁', 
'淄博', '淮北',\n", 1517 | " '淮安', '深圳', '清远', '温州', '湖州', '湘潭', '湘西', '湛江', '滨州', '漯河', '漳州',\n", 1518 | " '潍坊', '潜江', '潮州', '烟台', '珠海', '甘孜', '益阳', '盐城', '眉山', '神农架', '福州',\n", 1519 | " '绍兴', '绵阳', '聊城', '肇庆', '自贡', '舟山', '芜湖', '苏州', '茂名', '荆州', '荆门',\n", 1520 | " '莆田', '莱芜', '菏泽', '衡阳', '衢州', '襄阳', '西安', '资阳', '达州', '连云港', '遂宁',\n", 1521 | " '邵阳', '郑州', '郴州', '鄂州', '金华', '铜川', '镇江', '长沙', '阜阳', '阳江', '阿坝',\n", 1522 | " '随州', '雅安', '韶关', '黄冈', '黄山', '黄石', '龙岩'], dtype=object),\n", 1523 | " array(['0-20人', '100-499人', '1000-9999人', '10000人以上', '20-99人',\n", 1524 | " '500-999人'], dtype=object),\n", 1525 | " array(['中专', '博士', '大专', '学历不限', '本科', '硕士', '高中'], dtype=object),\n", 1526 | " array(['1-3年', '1年以内', '3-5年', '5-10年', '经验不限'], dtype=object)]" 1527 | ] 1528 | }, 1529 | "execution_count": 304, 1530 | "metadata": {}, 1531 | "output_type": "execute_result" 1532 | } 1533 | ], 1534 | "source": [ 1535 | "cat_encoder.categories_" 1536 | ] 1537 | }, 1538 | { 1539 | "cell_type": "markdown", 1540 | "metadata": {}, 1541 | "source": [ 1542 | "### 参数构造函数" 1543 | ] 1544 | }, 1545 | { 1546 | "cell_type": "code", 1547 | "execution_count": 378, 1548 | "metadata": {}, 1549 | "outputs": [ 1550 | { 1551 | "data": { 1552 | "text/plain": [ 1553 | "array(['三明', '上海', '东沙群岛', '东莞', '东营', '中山', '临沂', '丽水', '乐山', '云浮', '亳州',\n", 1554 | " '仙桃', '佛山', '六安', '内江', '凉山', '北京', '十堰', '南京', '南充', '南平', '南通',\n", 1555 | " '南阳', '厦门', '台州', '合肥', '周口', '咸宁', '咸阳', '商丘', '商洛', '嘉兴', '大连',\n", 1556 | " '天津', '天门', '威海', '娄底', '孝感', '宁德', '安庆', '安康', '安阳', '宜宾', '宜昌',\n", 1557 | " '宣城', '宿州', '宿迁', '岳阳', '巴中', '常州', '常德', '平顶山', '广元', '广安', '广州',\n", 1558 | " '开封', '张家界', '徐州', '德阳', '怀化', '恩施', '惠州', '成都', '扬州', '揭阳', '攀枝花',\n", 1559 | " '新乡', '日照', '杭州', '枣庄', '株洲', '梅州', '武汉', '永州', '汕头', '汕尾', '江门',\n", 1560 | " '池州', '河源', '泉州', '泰安', '泰州', '泸州', '洛阳', '济南', '济宁', '淄博', '淮北',\n", 1561 | " '淮安', '深圳', '清远', '温州', '湖州', '湘潭', '湘西', '湛江', '滨州', '漯河', '漳州',\n", 1562 | " '潍坊', '潜江', '潮州', '烟台', 
'珠海', '甘孜', '益阳', '盐城', '眉山', '神农架', '福州',\n", 1563 | " '绍兴', '绵阳', '聊城', '肇庆', '自贡', '舟山', '芜湖', '苏州', '茂名', '荆州', '荆门',\n", 1564 | " '莆田', '莱芜', '菏泽', '衡阳', '衢州', '襄阳', '西安', '资阳', '达州', '连云港', '遂宁',\n", 1565 | " '邵阳', '郑州', '郴州', '鄂州', '金华', '铜川', '镇江', '长沙', '阜阳', '阳江', '阿坝',\n", 1566 | " '随州', '雅安', '韶关', '黄冈', '黄山', '黄石', '龙岩'], dtype=object)" 1567 | ] 1568 | }, 1569 | "execution_count": 378, 1570 | "metadata": {}, 1571 | "output_type": "execute_result" 1572 | } 1573 | ], 1574 | "source": [ 1575 | "cat_encoder.categories_[0] " 1576 | ] 1577 | }, 1578 | { 1579 | "cell_type": "code", 1580 | "execution_count": 444, 1581 | "metadata": {}, 1582 | "outputs": [], 1583 | "source": [ 1584 | "import numpy as np\n", 1585 | "\n", 1586 | "def func_params(templist):\n", 1587 | " temp = []\n", 1588 | " city,scale,degree,exp = templist\n", 1589 | " citypara = cat_encoder.categories_[0] == '{city}'.format(city=city)\n", 1590 | " scalepara = cat_encoder.categories_[1] == '{scale}'.format(scale=scale)\n", 1591 | " degreepara = cat_encoder.categories_[2] == '{degree}'.format(degree=degree)\n", 1592 | " exppara = cat_encoder.categories_[3] == '{exp}'.format(exp=exp)\n", 1593 | " # citypara\n", 1594 | " for item in citypara:\n", 1595 | " #print(item)\n", 1596 | " if item == False:\n", 1597 | " item = float(0)\n", 1598 | " temp.append(item)\n", 1599 | " else:\n", 1600 | " item = float(1)\n", 1601 | " temp.append(item)\n", 1602 | " for item in scalepara:\n", 1603 | " if item == False:\n", 1604 | " item = float(0)\n", 1605 | " temp.append(item)\n", 1606 | " else:\n", 1607 | " item = float(1)\n", 1608 | " temp.append(item)\n", 1609 | " for item in degreepara:\n", 1610 | " if item == False:\n", 1611 | " item = float(0)\n", 1612 | " temp.append(item)\n", 1613 | " else:\n", 1614 | " item = float(1)\n", 1615 | " temp.append(item)\n", 1616 | " for item in exppara:\n", 1617 | " if item == False:\n", 1618 | " item = float(0)\n", 1619 | " temp.append(item)\n", 1620 | " else:\n", 1621 | " 
item = float(1)\n", 1622 | " temp.append(item)\n", 1623 | " temp = np.array(temp, dtype = float).reshape(1, -1)\n", 1624 | " return temp" 1625 | ] 1626 | }, 1627 | { 1628 | "cell_type": "code", 1629 | "execution_count": 445, 1630 | "metadata": {}, 1631 | "outputs": [], 1632 | "source": [ 1633 | "paramlist = func_params(['上海','1000-9999人','硕士','1-3年'])\n", 1634 | "#paramlist" 1635 | ] 1636 | }, 1637 | { 1638 | "cell_type": "code", 1639 | "execution_count": 372, 1640 | "metadata": {}, 1641 | "outputs": [ 1642 | { 1643 | "data": { 1644 | "text/plain": [ 1645 | "array(['0-20人', '100-499人', '1000-9999人', '10000人以上', '20-99人',\n", 1646 | " '500-999人'], dtype=object)" 1647 | ] 1648 | }, 1649 | "execution_count": 372, 1650 | "metadata": {}, 1651 | "output_type": "execute_result" 1652 | } 1653 | ], 1654 | "source": [ 1655 | "cat_encoder.categories_[1]" 1656 | ] 1657 | }, 1658 | { 1659 | "cell_type": "code", 1660 | "execution_count": 432, 1661 | "metadata": {}, 1662 | "outputs": [], 1663 | "source": [ 1664 | "#cat_encoder.categories_[3]" 1665 | ] 1666 | }, 1667 | { 1668 | "cell_type": "markdown", 1669 | "metadata": {}, 1670 | "source": [ 1671 | "# 用于机器学习的数据" 1672 | ] 1673 | }, 1674 | { 1675 | "cell_type": "markdown", 1676 | "metadata": {}, 1677 | "source": [ 1678 | "## x_train,y_train" 1679 | ] 1680 | }, 1681 | { 1682 | "cell_type": "code", 1683 | "execution_count": 305, 1684 | "metadata": {}, 1685 | "outputs": [], 1686 | "source": [ 1687 | "# 训练集\n", 1688 | "x_train = job_cata_train\n", 1689 | "y_train = datas_train['salary'].values.reshape(-1, 1)\n", 1690 | "#y_train" 1691 | ] 1692 | }, 1693 | { 1694 | "cell_type": "code", 1695 | "execution_count": 439, 1696 | "metadata": {}, 1697 | "outputs": [], 1698 | "source": [ 1699 | "#x_train[0]" 1700 | ] 1701 | }, 1702 | { 1703 | "cell_type": "code", 1704 | "execution_count": 306, 1705 | "metadata": {}, 1706 | "outputs": [ 1707 | { 1708 | "data": { 1709 | "text/plain": [ 1710 | "(9959, 168)" 1711 | ] 1712 | }, 1713 | 
"execution_count": 306, 1714 | "metadata": {}, 1715 | "output_type": "execute_result" 1716 | } 1717 | ], 1718 | "source": [ 1719 | "x_train.shape" 1720 | ] 1721 | }, 1722 | { 1723 | "cell_type": "code", 1724 | "execution_count": 307, 1725 | "metadata": {}, 1726 | "outputs": [ 1727 | { 1728 | "data": { 1729 | "text/plain": [ 1730 | "(9959, 1)" 1731 | ] 1732 | }, 1733 | "execution_count": 307, 1734 | "metadata": {}, 1735 | "output_type": "execute_result" 1736 | } 1737 | ], 1738 | "source": [ 1739 | "y_train.shape" 1740 | ] 1741 | }, 1742 | { 1743 | "cell_type": "markdown", 1744 | "metadata": {}, 1745 | "source": [ 1746 | "## x_test,y_test" 1747 | ] 1748 | }, 1749 | { 1750 | "cell_type": "code", 1751 | "execution_count": 308, 1752 | "metadata": { 1753 | "collapsed": true 1754 | }, 1755 | "outputs": [], 1756 | "source": [ 1757 | "# 测试集\n", 1758 | "x_test = job_cata_test\n", 1759 | "y_test = datas_test['salary'].values.reshape(-1, 1)" 1760 | ] 1761 | }, 1762 | { 1763 | "cell_type": "code", 1764 | "execution_count": 309, 1765 | "metadata": {}, 1766 | "outputs": [ 1767 | { 1768 | "data": { 1769 | "text/plain": [ 1770 | "(2490, 168)" 1771 | ] 1772 | }, 1773 | "execution_count": 309, 1774 | "metadata": {}, 1775 | "output_type": "execute_result" 1776 | } 1777 | ], 1778 | "source": [ 1779 | "x_test.shape" 1780 | ] 1781 | }, 1782 | { 1783 | "cell_type": "code", 1784 | "execution_count": 310, 1785 | "metadata": {}, 1786 | "outputs": [ 1787 | { 1788 | "data": { 1789 | "text/plain": [ 1790 | "(2490, 1)" 1791 | ] 1792 | }, 1793 | "execution_count": 310, 1794 | "metadata": {}, 1795 | "output_type": "execute_result" 1796 | } 1797 | ], 1798 | "source": [ 1799 | "y_test.shape" 1800 | ] 1801 | }, 1802 | { 1803 | "cell_type": "markdown", 1804 | "metadata": {}, 1805 | "source": [ 1806 | "# 机器学习建模" 1807 | ] 1808 | }, 1809 | { 1810 | "cell_type": "markdown", 1811 | "metadata": {}, 1812 | "source": [ 1813 | "## 决策树" 1814 | ] 1815 | }, 1816 | { 1817 | "cell_type": "code", 1818 | 
"execution_count": 311, 1819 | "metadata": { 1820 | "scrolled": true 1821 | }, 1822 | "outputs": [ 1823 | { 1824 | "data": { 1825 | "text/plain": [ 1826 | "DecisionTreeRegressor(criterion='mse', max_depth=None, max_features=None,\n", 1827 | " max_leaf_nodes=None, min_impurity_decrease=0.0,\n", 1828 | " min_impurity_split=None, min_samples_leaf=1,\n", 1829 | " min_samples_split=2, min_weight_fraction_leaf=0.0,\n", 1830 | " presort=False, random_state=42, splitter='best')" 1831 | ] 1832 | }, 1833 | "execution_count": 311, 1834 | "metadata": {}, 1835 | "output_type": "execute_result" 1836 | } 1837 | ], 1838 | "source": [ 1839 | "from sklearn.tree import DecisionTreeRegressor\n", 1840 | "\n", 1841 | "tree_reg = DecisionTreeRegressor(random_state=42)\n", 1842 | "tree_reg.fit(x_train, y_train)" 1843 | ] 1844 | }, 1845 | { 1846 | "cell_type": "markdown", 1847 | "metadata": {}, 1848 | "source": [ 1849 | "### 检验" 1850 | ] 1851 | }, 1852 | { 1853 | "cell_type": "markdown", 1854 | "metadata": {}, 1855 | "source": [ 1856 | "#### 训练集检验" 1857 | ] 1858 | }, 1859 | { 1860 | "cell_type": "code", 1861 | "execution_count": 312, 1862 | "metadata": { 1863 | "scrolled": true 1864 | }, 1865 | "outputs": [ 1866 | { 1867 | "data": { 1868 | "text/plain": [ 1869 | "array([[ 7.66666667],\n", 1870 | " [11.33333333],\n", 1871 | " [10. ],\n", 1872 | " [ 6.5 ],\n", 1873 | " [ 9.875 ],\n", 1874 | " [20. 
],\n", 1875 | " [15.8125 ],\n", 1876 | " [ 8.38461538],\n", 1877 | " [ 7.8 ],\n", 1878 | " [10.25 ]])" 1879 | ] 1880 | }, 1881 | "execution_count": 312, 1882 | "metadata": {}, 1883 | "output_type": "execute_result" 1884 | } 1885 | ], 1886 | "source": [ 1887 | "y_pred_tree = tree_reg.predict(x_train)\n", 1888 | "y_pred_tree[:10].reshape(10, 1)" 1889 | ] 1890 | }, 1891 | { 1892 | "cell_type": "code", 1893 | "execution_count": 354, 1894 | "metadata": {}, 1895 | "outputs": [], 1896 | "source": [ 1897 | "#datas_train.head(10)" 1898 | ] 1899 | }, 1900 | { 1901 | "cell_type": "code", 1902 | "execution_count": 314, 1903 | "metadata": { 1904 | "scrolled": true 1905 | }, 1906 | "outputs": [ 1907 | { 1908 | "data": { 1909 | "text/plain": [ 1910 | "array([[ 6.5],\n", 1911 | " [ 8.5],\n", 1912 | " [ 6.5],\n", 1913 | " [ 6.5],\n", 1914 | " [11.5],\n", 1915 | " [20. ],\n", 1916 | " [16.5],\n", 1917 | " [ 7. ],\n", 1918 | " [ 6.5],\n", 1919 | " [ 8. ]])" 1920 | ] 1921 | }, 1922 | "execution_count": 314, 1923 | "metadata": {}, 1924 | "output_type": "execute_result" 1925 | } 1926 | ], 1927 | "source": [ 1928 | "y_train[:10]" 1929 | ] 1930 | }, 1931 | { 1932 | "cell_type": "code", 1933 | "execution_count": 315, 1934 | "metadata": {}, 1935 | "outputs": [ 1936 | { 1937 | "data": { 1938 | "text/plain": [ 1939 | "3.0382473722545784" 1940 | ] 1941 | }, 1942 | "execution_count": 315, 1943 | "metadata": {}, 1944 | "output_type": "execute_result" 1945 | } 1946 | ], 1947 | "source": [ 1948 | "from sklearn.metrics import mean_squared_error\n", 1949 | "import numpy as np\n", 1950 | "\n", 1951 | "tree_mse = mean_squared_error(y_train, y_pred_tree)\n", 1952 | "tree_rmse = np.sqrt(tree_mse)\n", 1953 | "tree_rmse" 1954 | ] 1955 | }, 1956 | { 1957 | "cell_type": "markdown", 1958 | "metadata": {}, 1959 | "source": [ 1960 | "训练集预测误差:3.0382473722545784" 1961 | ] 1962 | }, 1963 | { 1964 | "cell_type": "markdown", 1965 | "metadata": {}, 1966 | "source": [ 1967 | "#### 测试集检验" 1968 | ] 1969 | }, 1970 | { 
1971 | "cell_type": "code", 1972 | "execution_count": 316, 1973 | "metadata": {}, 1974 | "outputs": [ 1975 | { 1976 | "data": { 1977 | "text/plain": [ 1978 | "4.782476424480568" 1979 | ] 1980 | }, 1981 | "execution_count": 316, 1982 | "metadata": {}, 1983 | "output_type": "execute_result" 1984 | } 1985 | ], 1986 | "source": [ 1987 | "from sklearn.metrics import mean_squared_error\n", 1988 | "import numpy as np\n", 1989 | "\n", 1990 | "y_pred_tree_test = tree_reg.predict(x_test)\n", 1991 | "\n", 1992 | "tree_mse = mean_squared_error(y_test, y_pred_tree_test)\n", 1993 | "tree_rmse = np.sqrt(tree_mse)\n", 1994 | "tree_rmse" 1995 | ] 1996 | }, 1997 | { 1998 | "cell_type": "markdown", 1999 | "metadata": {}, 2000 | "source": [ 2001 | "测试集误差:4.782476424480568" 2002 | ] 2003 | }, 2004 | { 2005 | "cell_type": "markdown", 2006 | "metadata": {}, 2007 | "source": [ 2008 | "### 交叉验证" 2009 | ] 2010 | }, 2011 | { 2012 | "cell_type": "code", 2013 | "execution_count": 317, 2014 | "metadata": { 2015 | "collapsed": true 2016 | }, 2017 | "outputs": [], 2018 | "source": [ 2019 | "from sklearn.model_selection import cross_val_score\n", 2020 | "\n", 2021 | "scores = cross_val_score(tree_reg, x_train, y_train,\n", 2022 | " scoring=\"neg_mean_squared_error\", cv=10)\n", 2023 | "tree_rmse_scores = np.sqrt(-scores)" 2024 | ] 2025 | }, 2026 | { 2027 | "cell_type": "code", 2028 | "execution_count": 318, 2029 | "metadata": {}, 2030 | "outputs": [ 2031 | { 2032 | "name": "stdout", 2033 | "output_type": "stream", 2034 | "text": [ 2035 | "Scores: [4.62433783 4.74066502 4.8478161 4.60423202 4.78729759 4.55902835\n", 2036 | " 4.47123285 4.46471566 4.62152351 4.81591793]\n", 2037 | "Mean: 4.653676687092153\n", 2038 | "Standard deviation: 0.1310433320106498\n" 2039 | ] 2040 | } 2041 | ], 2042 | "source": [ 2043 | "def display_scores(scores):\n", 2044 | " print(\"Scores:\", scores)\n", 2045 | " print(\"Mean:\", scores.mean())\n", 2046 | " print(\"Standard deviation:\", scores.std())\n", 2047 | "\n", 
2048 | "display_scores(tree_rmse_scores)" 2049 | ] 2050 | }, 2051 | { 2052 | "cell_type": "markdown", 2053 | "metadata": {}, 2054 | "source": [ 2055 | "## 随机森林" 2056 | ] 2057 | }, 2058 | { 2059 | "cell_type": "code", 2060 | "execution_count": 319, 2061 | "metadata": {}, 2062 | "outputs": [ 2063 | { 2064 | "name": "stderr", 2065 | "output_type": "stream", 2066 | "text": [ 2067 | "C:\\Users\\13626\\Anaconda3\\lib\\site-packages\\sklearn\\ensemble\\forest.py:248: FutureWarning: The default value of n_estimators will change from 10 in version 0.20 to 100 in 0.22.\n", 2068 | " \"10 in version 0.20 to 100 in 0.22.\", FutureWarning)\n", 2069 | "C:\\Users\\13626\\Anaconda3\\lib\\site-packages\\ipykernel_launcher.py:4: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().\n", 2070 | " after removing the cwd from sys.path.\n" 2071 | ] 2072 | }, 2073 | { 2074 | "data": { 2075 | "text/plain": [ 2076 | "RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,\n", 2077 | " max_features='auto', max_leaf_nodes=None,\n", 2078 | " min_impurity_decrease=0.0, min_impurity_split=None,\n", 2079 | " min_samples_leaf=1, min_samples_split=2,\n", 2080 | " min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,\n", 2081 | " oob_score=False, random_state=42, verbose=0, warm_start=False)" 2082 | ] 2083 | }, 2084 | "execution_count": 319, 2085 | "metadata": {}, 2086 | "output_type": "execute_result" 2087 | } 2088 | ], 2089 | "source": [ 2090 | "from sklearn.ensemble import RandomForestRegressor\n", 2091 | "\n", 2092 | "forest_reg = RandomForestRegressor(random_state=42)\n", 2093 | "forest_reg.fit(x_train, y_train)" 2094 | ] 2095 | }, 2096 | { 2097 | "cell_type": "markdown", 2098 | "metadata": {}, 2099 | "source": [ 2100 | "### 训练集检验" 2101 | ] 2102 | }, 2103 | { 2104 | "cell_type": "code", 2105 | "execution_count": 320, 2106 | "metadata": {}, 2107 | "outputs": [ 2108 | { 2109 | 
"data": { 2110 | "text/plain": [ 2111 | "3.1980674188547304" 2112 | ] 2113 | }, 2114 | "execution_count": 320, 2115 | "metadata": {}, 2116 | "output_type": "execute_result" 2117 | } 2118 | ], 2119 | "source": [ 2120 | "y_pred_rf = forest_reg.predict(x_train)\n", 2121 | "forest_mse = mean_squared_error(y_train, y_pred_rf)\n", 2122 | "forest_rmse = np.sqrt(forest_mse)\n", 2123 | "forest_rmse" 2124 | ] 2125 | }, 2126 | { 2127 | "cell_type": "markdown", 2128 | "metadata": {}, 2129 | "source": [ 2130 | "训练集误差(RMSE)为3.1980674188547304" 2131 | ] 2132 | }, 2133 | { 2134 | "cell_type": "markdown", 2135 | "metadata": {}, 2136 | "source": [ 2137 | "### 测试集检验" 2138 | ] 2139 | }, 2140 | { 2141 | "cell_type": "code", 2142 | "execution_count": 322, 2143 | "metadata": {}, 2144 | "outputs": [ 2145 | { 2146 | "data": { 2147 | "text/plain": [ 2148 | "4.5536604518702575" 2149 | ] 2150 | }, 2151 | "execution_count": 322, 2152 | "metadata": {}, 2153 | "output_type": "execute_result" 2154 | } 2155 | ], 2156 | "source": [ 2157 | "y_pred_rf_test = forest_reg.predict(x_test)\n", 2158 | "forest_mse = mean_squared_error(y_test, y_pred_rf_test)\n", 2159 | "forest_rmse = np.sqrt(forest_mse)\n", 2160 | "forest_rmse" 2161 | ] 2162 | }, 2163 | { 2164 | "cell_type": "markdown", 2165 | "metadata": {}, 2166 | "source": [ 2167 | "测试集误差(RMSE):4.5536604518702575" 2168 | ] 2169 | }, 2170 | { 2171 | "cell_type": "code", 2172 | "execution_count": 355, 2173 | "metadata": {}, 2174 | "outputs": [], 2175 | "source": [ 2176 | "#datas_test.head(10)" 2177 | ] 2178 | }, 2179 | { 2180 | "cell_type": "code", 2181 | "execution_count": 324, 2182 | "metadata": {}, 2183 | "outputs": [ 2184 | { 2185 | "data": { 2186 | "text/plain": [ 2187 | "array([[ 7.73 ],\n", 2188 | " [11.4861727 ],\n", 2189 | " [ 9.33285714],\n", 2190 | " [ 7.4 ],\n", 2191 | " [ 9.275 ],\n", 2192 | " [16.1 ],\n", 2193 | " [15.60247655],\n", 2194 | " [ 8.45447109],\n", 2195 | " [ 8.05528666],\n", 2196 | " [11.12662698]])" 2197 | ] 2198 | }, 2199 | "execution_count": 
324, 2200 | "metadata": {}, 2201 | "output_type": "execute_result" 2202 | } 2203 | ], 2204 | "source": [ 2205 | "y_pred_rf[:10].reshape(10, 1)" 2206 | ] 2207 | }, 2208 | { 2209 | "cell_type": "markdown", 2210 | "metadata": {}, 2211 | "source": [ 2212 | "# 变量重要性" 2213 | ] 2214 | }, 2215 | { 2216 | "cell_type": "code", 2217 | "execution_count": 353, 2218 | "metadata": {}, 2219 | "outputs": [ 2220 | { 2221 | "data": { 2222 | "text/plain": [ 2223 | "[(0.058026406821841056, '大专'),\n", 2224 | " (0.031263812251122784, '10000人以上'),\n", 2225 | " (0.020290832343490904, '1000-9999人'),\n", 2226 | " (0.018095521323951716, '20-99人'),\n", 2227 | " (0.015872193036716557, '100-499人'),\n", 2228 | " (0.015170706225377684, '硕士'),\n", 2229 | " (0.014925513158681906, '北京'),\n", 2230 | " (0.014193573528348177, '500-999人'),\n", 2231 | " (0.013911459035214633, '0-20人'),\n", 2232 | " (0.011434958956342767, '深圳'),\n", 2233 | " (0.011310119828051196, '本科'),\n", 2234 | " (0.01002918014973044, '丽水'),\n", 2235 | " (0.0096919398776096, '杭州'),\n", 2236 | " (0.009393127333069419, '温州'),\n", 2237 | " (0.00912590555299951, '学历不限'),\n", 2238 | " (0.009077835450701817, '台州'),\n", 2239 | " (0.008986219951521799, '舟山'),\n", 2240 | " (0.008834491150945197, '绍兴'),\n", 2241 | " (0.008656335871500801, '金华'),\n", 2242 | " (0.008413046832938394, '湖州')]" 2243 | ] 2244 | }, 2245 | "execution_count": 353, 2246 | "metadata": {}, 2247 | "output_type": "execute_result" 2248 | } 2249 | ], 2250 | "source": [ 2251 | "#forest_reg.feature_importances_\n", 2252 | "labellist = []\n", 2253 | "\n", 2254 | "for item in cat_encoder.categories_[0]:\n", 2255 | " labellist.append(item)\n", 2256 | " \n", 2257 | "for item in cat_encoder.categories_[1]:\n", 2258 | " labellist.append(item)\n", 2259 | " \n", 2260 | "for item in cat_encoder.categories_[2]:\n", 2261 | " labellist.append(item)\n", 2262 | "labellist.extend(cat_encoder.categories_[3]) # experience labels too, so zip() names every encoded column\n", 2263 | "sorted(zip(forest_reg.feature_importances_,labellist), reverse=True)[:20]" 2264 | ] 2265 | }, 2266 | { 2267 | 
"cell_type": "code", 2268 | "execution_count": 369, 2269 | "metadata": {}, 2270 | "outputs": [ 2271 | { 2272 | "data": { 2273 | "text/html": [ 2274 | "
\n", 2275 | "\n", 2288 | "\n", 2289 | " \n", 2290 | " \n", 2291 | " \n", 2292 | " \n", 2293 | " \n", 2294 | " \n", 2295 | " \n", 2296 | " \n", 2297 | " \n", 2298 | " \n", 2299 | " \n", 2300 | " \n", 2301 | " \n", 2302 | " \n", 2303 | " \n", 2304 | " \n", 2305 | " \n", 2306 | " \n", 2307 | " \n", 2308 | " \n", 2309 | " \n", 2310 | " \n", 2311 | " \n", 2312 | " \n", 2313 | " \n", 2314 | " \n", 2315 | " \n", 2316 | " \n", 2317 | " \n", 2318 | " \n", 2319 | " \n", 2320 | " \n", 2321 | " \n", 2322 | " \n", 2323 | " \n", 2324 | " \n", 2325 | " \n", 2326 | " \n", 2327 | " \n", 2328 | " \n", 2329 | " \n", 2330 | " \n", 2331 | " \n", 2332 | " \n", 2333 | " \n", 2334 | " \n", 2335 | " \n", 2336 | " \n", 2337 | " \n", 2338 | " \n", 2339 | " \n", 2340 | " \n", 2341 | " \n", 2342 | " \n", 2343 | " \n", 2344 | " \n", 2345 | " \n", 2346 | " \n", 2347 | " \n", 2348 | " \n", 2349 | " \n", 2350 | " \n", 2351 | " \n", 2352 | " \n", 2353 | " \n", 2354 | " \n", 2355 | " \n", 2356 | " \n", 2357 | " \n", 2358 | " \n", 2359 | " \n", 2360 | " \n", 2361 | " \n", 2362 | " \n", 2363 | " \n", 2364 | " \n", 2365 | " \n", 2366 | " \n", 2367 | " \n", 2368 | " \n", 2369 | " \n", 2370 | " \n", 2371 | " \n", 2372 | " \n", 2373 | " \n", 2374 | " \n", 2375 | " \n", 2376 | " \n", 2377 | "
重要性
大专0.058026
10000人以上0.031264
1000-9999人0.020291
20-99人0.018096
100-499人0.015872
硕士0.015171
北京0.014926
500-999人0.014194
0-20人0.013911
深圳0.011435
本科0.011310
丽水0.010029
杭州0.009692
温州0.009393
学历不限0.009126
台州0.009078
舟山0.008986
绍兴0.008834
金华0.008656
湖州0.008413
\n", 2378 | "
" 2379 | ], 2380 | "text/plain": [ 2381 | " 重要性\n", 2382 | "大专 0.058026\n", 2383 | "10000人以上 0.031264\n", 2384 | "1000-9999人 0.020291\n", 2385 | "20-99人 0.018096\n", 2386 | "100-499人 0.015872\n", 2387 | "硕士 0.015171\n", 2388 | "北京 0.014926\n", 2389 | "500-999人 0.014194\n", 2390 | "0-20人 0.013911\n", 2391 | "深圳 0.011435\n", 2392 | "本科 0.011310\n", 2393 | "丽水 0.010029\n", 2394 | "杭州 0.009692\n", 2395 | "温州 0.009393\n", 2396 | "学历不限 0.009126\n", 2397 | "台州 0.009078\n", 2398 | "舟山 0.008986\n", 2399 | "绍兴 0.008834\n", 2400 | "金华 0.008656\n", 2401 | "湖州 0.008413" 2402 | ] 2403 | }, 2404 | "execution_count": 369, 2405 | "metadata": {}, 2406 | "output_type": "execute_result" 2407 | } 2408 | ], 2409 | "source": [ 2410 | "from pandas import DataFrame\n", 2411 | "# 转换成 DataFrame格式\n", 2412 | "sorted(zip(forest_reg.feature_importances_,labellist), reverse=True)[:20][0]\n", 2413 | "list1 = []\n", 2414 | "list2 = []\n", 2415 | "for item in sorted(zip(forest_reg.feature_importances_,labellist), reverse=True)[:20]:\n", 2416 | " list1.append(item[1])\n", 2417 | " list2.append(item[0])\n", 2418 | "df_importance = DataFrame(list2,index=list1,columns=['重要性'])\n", 2419 | "df_importance" 2420 | ] 2421 | }, 2422 | { 2423 | "cell_type": "markdown", 2424 | "metadata": {}, 2425 | "source": [ 2426 | "# 变量重要性分析" 2427 | ] 2428 | }, 2429 | { 2430 | "cell_type": "markdown", 2431 | "metadata": {}, 2432 | "source": [ 2433 | "## 选取前20个重要变量" 2434 | ] 2435 | }, 2436 | { 2437 | "cell_type": "markdown", 2438 | "metadata": {}, 2439 | "source": [ 2440 | "1.很多公司招聘时往往将学历要求设置为'大专'(即不设门槛,只挑选有能力者)" 2441 | ] 2442 | }, 2443 | { 2444 | "cell_type": "markdown", 2445 | "metadata": {}, 2446 | "source": [ 2447 | "2.公司规模是影响数据分析师薪资方面的重要变量,其中'10000人以上'的大公司在薪资高低方面的影响力最大。" 2448 | ] 2449 | }, 2450 | { 2451 | "cell_type": "markdown", 2452 | "metadata": {}, 2453 | "source": [ 2454 | "3.也有许多公司将'硕士学历'设为数据分析师的门槛,因此'硕士学历'在数据分析师薪资方面影响较大。" 2455 | ] 2456 | }, 2457 | { 2458 | "cell_type": "markdown", 2459 | "metadata": {}, 2460 | 
"source": [ 2461 | "4.城市/地域方面:可以看到北京、深圳的职位对薪资的影响最大;其次,丽水、杭州、温州、台州、舟山、绍兴、金华等均为浙江省的市,可以明显看出浙江省数据分析师的薪资待遇较高。" 2462 | ] 2463 | }, 2464 | { 2465 | "cell_type": "code", 2466 | "execution_count": 370, 2467 | "metadata": {}, 2468 | "outputs": [], 2469 | "source": [ 2470 | "#datas_train[datas_train['学历要求'] == '大专'].head()" 2471 | ] 2472 | }, 2473 | { 2474 | "cell_type": "markdown", 2475 | "metadata": {}, 2476 | "source": [ 2477 | "# 自定义变量预测" 2478 | ] 2479 | }, 2480 | { 2481 | "cell_type": "code", 2482 | "execution_count": 451, 2483 | "metadata": {}, 2484 | "outputs": [ 2485 | { 2486 | "data": { 2487 | "text/plain": [ 2488 | "[array(['三明', '上海', '东沙群岛', '东莞', '东营', '中山', '临沂', '丽水', '乐山', '云浮', '亳州',\n", 2489 | " '仙桃', '佛山', '六安', '内江', '凉山', '北京', '十堰', '南京', '南充', '南平', '南通',\n", 2490 | " '南阳', '厦门', '台州', '合肥', '周口', '咸宁', '咸阳', '商丘', '商洛', '嘉兴', '大连',\n", 2491 | " '天津', '天门', '威海', '娄底', '孝感', '宁德', '安庆', '安康', '安阳', '宜宾', '宜昌',\n", 2492 | " '宣城', '宿州', '宿迁', '岳阳', '巴中', '常州', '常德', '平顶山', '广元', '广安', '广州',\n", 2493 | " '开封', '张家界', '徐州', '德阳', '怀化', '恩施', '惠州', '成都', '扬州', '揭阳', '攀枝花',\n", 2494 | " '新乡', '日照', '杭州', '枣庄', '株洲', '梅州', '武汉', '永州', '汕头', '汕尾', '江门',\n", 2495 | " '池州', '河源', '泉州', '泰安', '泰州', '泸州', '洛阳', '济南', '济宁', '淄博', '淮北',\n", 2496 | " '淮安', '深圳', '清远', '温州', '湖州', '湘潭', '湘西', '湛江', '滨州', '漯河', '漳州',\n", 2497 | " '潍坊', '潜江', '潮州', '烟台', '珠海', '甘孜', '益阳', '盐城', '眉山', '神农架', '福州',\n", 2498 | " '绍兴', '绵阳', '聊城', '肇庆', '自贡', '舟山', '芜湖', '苏州', '茂名', '荆州', '荆门',\n", 2499 | " '莆田', '莱芜', '菏泽', '衡阳', '衢州', '襄阳', '西安', '资阳', '达州', '连云港', '遂宁',\n", 2500 | " '邵阳', '郑州', '郴州', '鄂州', '金华', '铜川', '镇江', '长沙', '阜阳', '阳江', '阿坝',\n", 2501 | " '随州', '雅安', '韶关', '黄冈', '黄山', '黄石', '龙岩'], dtype=object),\n", 2502 | " array(['0-20人', '100-499人', '1000-9999人', '10000人以上', '20-99人',\n", 2503 | " '500-999人'], dtype=object),\n", 2504 | " array(['中专', '博士', '大专', '学历不限', '本科', '硕士', '高中'], dtype=object),\n", 2505 | " array(['1-3年', '1年以内', '3-5年', '5-10年', '经验不限'], dtype=object)]" 
2506 | ] 2507 | }, 2508 | "execution_count": 451, 2509 | "metadata": {}, 2510 | "output_type": "execute_result" 2511 | } 2512 | ], 2513 | "source": [ 2514 | "cat_encoder.categories_ #可选预测变量" 2515 | ] 2516 | }, 2517 | { 2518 | "cell_type": "markdown", 2519 | "metadata": {}, 2520 | "source": [ 2521 | "## 决策树预测 " 2522 | ] 2523 | }, 2524 | { 2525 | "cell_type": "code", 2526 | "execution_count": 452, 2527 | "metadata": {}, 2528 | "outputs": [ 2529 | { 2530 | "data": { 2531 | "text/plain": [ 2532 | "array([15.])" 2533 | ] 2534 | }, 2535 | "execution_count": 452, 2536 | "metadata": {}, 2537 | "output_type": "execute_result" 2538 | } 2539 | ], 2540 | "source": [ 2541 | "#paramlist = func_params(['上海','1000-9999人','硕士','1-3年'])\n", 2542 | "Y_pred_dt = tree_reg.predict(paramlist)\n", 2543 | "Y_pred_dt" 2544 | ] 2545 | }, 2546 | { 2547 | "cell_type": "code", 2548 | "execution_count": 458, 2549 | "metadata": {}, 2550 | "outputs": [ 2551 | { 2552 | "data": { 2553 | "text/plain": [ 2554 | "array([13.5])" 2555 | ] 2556 | }, 2557 | "execution_count": 458, 2558 | "metadata": {}, 2559 | "output_type": "execute_result" 2560 | } 2561 | ], 2562 | "source": [ 2563 | "paramlist2 = func_params(['北京','500-999人','本科','1-3年'])\n", 2564 | "tree_reg.predict(paramlist2)" 2565 | ] 2566 | }, 2567 | { 2568 | "cell_type": "markdown", 2569 | "metadata": {}, 2570 | "source": [ 2571 | "## 随机森林预测" 2572 | ] 2573 | }, 2574 | { 2575 | "cell_type": "code", 2576 | "execution_count": 469, 2577 | "metadata": {}, 2578 | "outputs": [ 2579 | { 2580 | "data": { 2581 | "text/plain": [ 2582 | "array([12.63888889])" 2583 | ] 2584 | }, 2585 | "execution_count": 469, 2586 | "metadata": {}, 2587 | "output_type": "execute_result" 2588 | } 2589 | ], 2590 | "source": [ 2591 | "paramlist3 = func_params(['深圳','100-499人','硕士','1年以内'])\n", 2592 | "forest_reg.predict(paramlist3) # random-forest model, matching this section's heading" 2593 | ] 2594 | }, 2595 | { 2596 | "cell_type": "code", 2597 | "execution_count": 476, 2598 | "metadata": {}, 2599 | "outputs": [ 2600 | { 2601 | 
"data": { 2602 | "text/plain": [ 2603 | "array([15.])" 2604 | ] 2605 | }, 2606 | "execution_count": 476, 2607 | "metadata": {}, 2608 | "output_type": "execute_result" 2609 | } 2610 | ], 2611 | "source": [ 2612 | "paramlist4 = func_params(['上海','1000-9999人','硕士','1年以内'])\n", 2613 | "forest_reg.predict(paramlist4) # random-forest model, matching this section's heading" 2614 | ] 2615 | }, 2616 | { 2617 | "cell_type": "code", 2618 | "execution_count": null, 2619 | "metadata": { 2620 | "collapsed": true 2621 | }, 2622 | "outputs": [], 2623 | "source": [] 2624 | } 2625 | ], 2626 | "metadata": { 2627 | "kernelspec": { 2628 | "display_name": "Python 3", 2629 | "language": "python", 2630 | "name": "python3" 2631 | }, 2632 | "language_info": { 2633 | "codemirror_mode": { 2634 | "name": "ipython", 2635 | "version": 3 2636 | }, 2637 | "file_extension": ".py", 2638 | "mimetype": "text/x-python", 2639 | "name": "python", 2640 | "nbconvert_exporter": "python", 2641 | "pygments_lexer": "ipython3", 2642 | "version": "3.6.3" 2643 | } 2644 | }, 2645 | "nbformat": 4, 2646 | "nbformat_minor": 2 2647 | } 2648 | -------------------------------------------------------------------------------- /city_data/city.csv: -------------------------------------------------------------------------------- 1 | ,code,市,省 2 | 0,101010100,北京,北京 3 | 1,101020100,上海,上海 4 | 2,101030100,天津,天津 5 | 3,101040100,重庆,重庆 6 | 4,101050100,哈尔滨,黑龙江 7 | 5,101050200,齐齐哈尔,黑龙江 8 | 6,101050300,牡丹江,黑龙江 9 | 7,101050400,佳木斯,黑龙江 10 | 8,101050500,绥化,黑龙江 11 | 9,101050600,黑河,黑龙江 12 | 10,101050700,伊春,黑龙江 13 | 11,101050800,大庆,黑龙江 14 | 12,101050900,七台河,黑龙江 15 | 13,101051000,鸡西,黑龙江 16 | 14,101051100,鹤岗,黑龙江 17 | 15,101051200,双鸭山,黑龙江 18 | 16,101051300,大兴安岭,黑龙江 19 | 17,101060100,长春,吉林 20 | 18,101060200,吉林,吉林 21 | 19,101060300,四平,吉林 22 | 20,101060400,通化,吉林 23 | 21,101060500,白城,吉林 24 | 22,101060600,辽源,吉林 25 | 23,101060700,松原,吉林 26 | 24,101060800,白山,吉林 27 | 25,101060900,延边,吉林 28 | 26,101070100,沈阳,辽宁 29 | 27,101070200,大连,辽宁 30 | 28,101070300,鞍山,辽宁 31 | 29,101070400,抚顺,辽宁 32 | 30,101070500,本溪,辽宁 33 | 
31,101070600,丹东,辽宁 34 | 32,101070700,锦州,辽宁 35 | 33,101070800,营口,辽宁 36 | 34,101070900,阜新,辽宁 37 | 35,101071000,辽阳,辽宁 38 | 36,101071100,铁岭,辽宁 39 | 37,101071200,朝阳,辽宁 40 | 38,101071300,盘锦,辽宁 41 | 39,101071400,葫芦岛,辽宁 42 | 40,101080100,呼和浩特,内蒙古 43 | 41,101080200,包头,内蒙古 44 | 42,101080300,乌海,内蒙古 45 | 43,101080400,通辽,内蒙古 46 | 44,101080500,赤峰,内蒙古 47 | 45,101080600,鄂尔多斯,内蒙古 48 | 46,101080700,呼伦贝尔,内蒙古 49 | 47,101080800,巴彦淖尔,内蒙古 50 | 48,101080900,乌兰察布,内蒙古 51 | 49,101081000,锡林郭勒,内蒙古 52 | 50,101081100,兴安盟,内蒙古 53 | 51,101081200,阿拉善,内蒙古 54 | 52,101090100,石家庄,河北 55 | 53,101090200,保定,河北 56 | 54,101090300,张家口,河北 57 | 55,101090400,承德,河北 58 | 56,101090500,唐山,河北 59 | 57,101090600,廊坊,河北 60 | 58,101090700,沧州,河北 61 | 59,101090800,衡水,河北 62 | 60,101090900,邢台,河北 63 | 61,101091000,邯郸,河北 64 | 62,101091100,秦皇岛,河北 65 | 63,101100100,太原,山西 66 | 64,101100200,大同,山西 67 | 65,101100300,阳泉,山西 68 | 66,101100400,晋中,山西 69 | 67,101100500,长治,山西 70 | 68,101100600,晋城,山西 71 | 69,101100700,临汾,山西 72 | 70,101100800,运城,山西 73 | 71,101100900,朔州,山西 74 | 72,101101000,忻州,山西 75 | 73,101101100,吕梁,山西 76 | 74,101110100,西安,陕西 77 | 75,101110200,咸阳,陕西 78 | 76,101110300,延安,陕西 79 | 77,101110400,榆林,陕西 80 | 78,101110500,渭南,陕西 81 | 79,101110600,商洛,陕西 82 | 80,101110700,安康,陕西 83 | 81,101110800,汉中,陕西 84 | 82,101110900,宝鸡,陕西 85 | 83,101111000,铜川,陕西 86 | 84,101120100,济南,山东 87 | 85,101120200,青岛,山东 88 | 86,101120300,淄博,山东 89 | 87,101120400,德州,山东 90 | 88,101120500,烟台,山东 91 | 89,101120600,潍坊,山东 92 | 90,101120700,济宁,山东 93 | 91,101120800,泰安,山东 94 | 92,101120900,临沂,山东 95 | 93,101121000,菏泽,山东 96 | 94,101121100,滨州,山东 97 | 95,101121200,东营,山东 98 | 96,101121300,威海,山东 99 | 97,101121400,枣庄,山东 100 | 98,101121500,日照,山东 101 | 99,101121600,莱芜,山东 102 | 100,101121700,聊城,山东 103 | 101,101130100,乌鲁木齐,新疆 104 | 102,101130200,克拉玛依,新疆 105 | 103,101130300,昌吉,新疆 106 | 104,101130400,巴音郭楞,新疆 107 | 105,101130500,博尔塔拉,新疆 108 | 106,101130600,伊犁,新疆 109 | 107,101130800,吐鲁番,新疆 110 | 108,101130900,哈密,新疆 111 | 109,101131000,阿克苏,新疆 112 | 110,101131100,克孜勒苏柯尔克孜,新疆 113 | 
111,101131200,喀什,新疆 114 | 112,101131300,和田,新疆 115 | 113,101131400,塔城,新疆 116 | 114,101131500,阿勒泰,新疆 117 | 115,101131600,石河子,新疆 118 | 116,101131700,阿拉尔,新疆 119 | 117,101131800,图木舒克,新疆 120 | 118,101131900,五家渠,新疆 121 | 119,101132000,铁门关,新疆 122 | 120,101132100,北屯市,新疆 123 | 121,101132200,可克达拉市,新疆 124 | 122,101132300,昆玉市,新疆 125 | 123,101132400,双河市,新疆 126 | 124,101150100,西宁,青海 127 | 125,101150200,海东,青海 128 | 126,101150300,海北,青海 129 | 127,101150400,黄南,青海 130 | 128,101150500,海南,青海 131 | 129,101150600,果洛,青海 132 | 130,101150700,玉树,青海 133 | 131,101150800,海西,青海 134 | 132,101160100,兰州,甘肃 135 | 133,101160200,定西,甘肃 136 | 134,101160300,平凉,甘肃 137 | 135,101160400,庆阳,甘肃 138 | 136,101160500,武威,甘肃 139 | 137,101160600,金昌,甘肃 140 | 138,101160700,张掖,甘肃 141 | 139,101160800,酒泉,甘肃 142 | 140,101160900,天水,甘肃 143 | 141,101161000,白银,甘肃 144 | 142,101161100,陇南,甘肃 145 | 143,101161200,嘉峪关,甘肃 146 | 144,101161300,临夏,甘肃 147 | 145,101161400,甘南,甘肃 148 | 146,101170100,银川,宁夏 149 | 147,101170200,石嘴山,宁夏 150 | 148,101170300,吴忠,宁夏 151 | 149,101170400,固原,宁夏 152 | 150,101170500,中卫,宁夏 153 | 151,101180100,郑州,河南 154 | 152,101180200,安阳,河南 155 | 153,101180300,新乡,河南 156 | 154,101180400,许昌,河南 157 | 155,101180500,平顶山,河南 158 | 156,101180600,信阳,河南 159 | 157,101180700,南阳,河南 160 | 158,101180800,开封,河南 161 | 159,101180900,洛阳,河南 162 | 160,101181000,商丘,河南 163 | 161,101181100,焦作,河南 164 | 162,101181200,鹤壁,河南 165 | 163,101181300,濮阳,河南 166 | 164,101181400,周口,河南 167 | 165,101181500,漯河,河南 168 | 166,101181600,驻马店,河南 169 | 167,101181700,三门峡,河南 170 | 168,101181800,济源,河南 171 | 169,101190100,南京,江苏 172 | 170,101190200,无锡,江苏 173 | 171,101190300,镇江,江苏 174 | 172,101190400,苏州,江苏 175 | 173,101190500,南通,江苏 176 | 174,101190600,扬州,江苏 177 | 175,101190700,盐城,江苏 178 | 176,101190800,徐州,江苏 179 | 177,101190900,淮安,江苏 180 | 178,101191000,连云港,江苏 181 | 179,101191100,常州,江苏 182 | 180,101191200,泰州,江苏 183 | 181,101191300,宿迁,江苏 184 | 182,101200100,武汉,湖北 185 | 183,101200200,襄阳,湖北 186 | 184,101200300,鄂州,湖北 187 | 185,101200400,孝感,湖北 188 | 186,101200500,黄冈,湖北 189 | 
187,101200600,黄石,湖北 190 | 188,101200700,咸宁,湖北 191 | 189,101200800,荆州,湖北 192 | 190,101200900,宜昌,湖北 193 | 191,101201000,十堰,湖北 194 | 192,101201100,随州,湖北 195 | 193,101201200,荆门,湖北 196 | 194,101201300,恩施,湖北 197 | 195,101201400,仙桃,湖北 198 | 196,101201500,潜江,湖北 199 | 197,101201600,天门,湖北 200 | 198,101201700,神农架,湖北 201 | 199,101210100,杭州,浙江 202 | 200,101210200,湖州,浙江 203 | 201,101210300,嘉兴,浙江 204 | 202,101210400,宁波,浙江 205 | 203,101210500,绍兴,浙江 206 | 204,101210600,台州,浙江 207 | 205,101210700,温州,浙江 208 | 206,101210800,丽水,浙江 209 | 207,101210900,金华,浙江 210 | 208,101211000,衢州,浙江 211 | 209,101211100,舟山,浙江 212 | 210,101220100,合肥,安徽 213 | 211,101220200,蚌埠,安徽 214 | 212,101220300,芜湖,安徽 215 | 213,101220400,淮南,安徽 216 | 214,101220500,马鞍山,安徽 217 | 215,101220600,安庆,安徽 218 | 216,101220700,宿州,安徽 219 | 217,101220800,阜阳,安徽 220 | 218,101220900,亳州,安徽 221 | 219,101221000,滁州,安徽 222 | 220,101221100,淮北,安徽 223 | 221,101221200,铜陵,安徽 224 | 222,101221300,宣城,安徽 225 | 223,101221400,六安,安徽 226 | 224,101221500,池州,安徽 227 | 225,101221600,黄山,安徽 228 | 226,101230100,福州,福建 229 | 227,101230200,厦门,福建 230 | 228,101230300,宁德,福建 231 | 229,101230400,莆田,福建 232 | 230,101230500,泉州,福建 233 | 231,101230600,漳州,福建 234 | 232,101230700,龙岩,福建 235 | 233,101230800,三明,福建 236 | 234,101230900,南平,福建 237 | 235,101240100,南昌,江西 238 | 236,101240200,九江,江西 239 | 237,101240300,上饶,江西 240 | 238,101240400,抚州,江西 241 | 239,101240500,宜春,江西 242 | 240,101240600,吉安,江西 243 | 241,101240700,赣州,江西 244 | 242,101240800,景德镇,江西 245 | 243,101240900,萍乡,江西 246 | 244,101241000,新余,江西 247 | 245,101241100,鹰潭,江西 248 | 246,101250100,长沙,湖南 249 | 247,101250200,湘潭,湖南 250 | 248,101250300,株洲,湖南 251 | 249,101250400,衡阳,湖南 252 | 250,101250500,郴州,湖南 253 | 251,101250600,常德,湖南 254 | 252,101250700,益阳,湖南 255 | 253,101250800,娄底,湖南 256 | 254,101250900,邵阳,湖南 257 | 255,101251000,岳阳,湖南 258 | 256,101251100,张家界,湖南 259 | 257,101251200,怀化,湖南 260 | 258,101251300,永州,湖南 261 | 259,101251400,湘西,湖南 262 | 260,101260100,贵阳,贵州 263 | 261,101260200,遵义,贵州 264 | 262,101260300,安顺,贵州 265 | 263,101260400,铜仁,贵州 
266 | 264,101260500,毕节,贵州 267 | 265,101260600,六盘水,贵州 268 | 266,101260700,黔东南,贵州 269 | 267,101260800,黔南,贵州 270 | 268,101260900,黔西南,贵州 271 | 269,101270100,成都,四川 272 | 270,101270200,攀枝花,四川 273 | 271,101270300,自贡,四川 274 | 272,101270400,绵阳,四川 275 | 273,101270500,南充,四川 276 | 274,101270600,达州,四川 277 | 275,101270700,遂宁,四川 278 | 276,101270800,广安,四川 279 | 277,101270900,巴中,四川 280 | 278,101271000,泸州,四川 281 | 279,101271100,宜宾,四川 282 | 280,101271200,内江,四川 283 | 281,101271300,资阳,四川 284 | 282,101271400,乐山,四川 285 | 283,101271500,眉山,四川 286 | 284,101271600,雅安,四川 287 | 285,101271700,德阳,四川 288 | 286,101271800,广元,四川 289 | 287,101271900,阿坝,四川 290 | 288,101272000,凉山,四川 291 | 289,101272100,甘孜,四川 292 | 290,101280100,广州,广东 293 | 291,101280200,韶关,广东 294 | 292,101280300,惠州,广东 295 | 293,101280400,梅州,广东 296 | 294,101280500,汕头,广东 297 | 295,101280600,深圳,广东 298 | 296,101280700,珠海,广东 299 | 297,101280800,佛山,广东 300 | 298,101280900,肇庆,广东 301 | 299,101281000,湛江,广东 302 | 300,101281100,江门,广东 303 | 301,101281200,河源,广东 304 | 302,101281300,清远,广东 305 | 303,101281400,云浮,广东 306 | 304,101281500,潮州,广东 307 | 305,101281600,东莞,广东 308 | 306,101281700,中山,广东 309 | 307,101281800,阳江,广东 310 | 308,101281900,揭阳,广东 311 | 309,101282000,茂名,广东 312 | 310,101282100,汕尾,广东 313 | 311,101282200,东沙群岛,广东 314 | 312,101290100,昆明,云南 315 | 313,101290200,曲靖,云南 316 | 314,101290300,保山,云南 317 | 315,101290400,玉溪,云南 318 | 316,101290500,普洱,云南 319 | 317,101290700,昭通,云南 320 | 318,101290800,临沧,云南 321 | 319,101290900,丽江,云南 322 | 320,101291000,西双版纳,云南 323 | 321,101291100,文山,云南 324 | 322,101291200,红河,云南 325 | 323,101291300,德宏,云南 326 | 324,101291400,怒江,云南 327 | 325,101291500,迪庆,云南 328 | 326,101291600,大理,云南 329 | 327,101291700,楚雄,云南 330 | 328,101300100,南宁,广西 331 | 329,101300200,崇左,广西 332 | 330,101300300,柳州,广西 333 | 331,101300400,来宾,广西 334 | 332,101300500,桂林,广西 335 | 333,101300600,梧州,广西 336 | 334,101300700,贺州,广西 337 | 335,101300800,贵港,广西 338 | 336,101300900,玉林,广西 339 | 337,101301000,百色,广西 340 | 338,101301100,钦州,广西 341 | 339,101301200,河池,广西 342 | 
340,101301300,北海,广西 343 | 341,101301400,防城港,广西 344 | 342,101310100,海口,海南 345 | 343,101310200,三亚,海南 346 | 344,101310300,三沙,海南 347 | 345,101310400,儋州,海南 348 | 346,101310500,五指山,海南 349 | 347,101310600,琼海,海南 350 | 348,101310700,文昌,海南 351 | 349,101310800,万宁,海南 352 | 350,101310900,东方,海南 353 | 351,101311000,定安,海南 354 | 352,101311100,屯昌,海南 355 | 353,101311200,澄迈,海南 356 | 354,101311300,临高,海南 357 | 355,101311400,白沙,海南 358 | 356,101311500,昌江,海南 359 | 357,101311600,乐东,海南 360 | 358,101311700,陵水,海南 361 | 359,101311800,保亭,海南 362 | 360,101311900,琼中,海南 363 | 361,101341100,台湾,台湾 364 | 362,101140100,拉萨,西藏 365 | 363,101140200,日喀则,西藏 366 | 364,101140300,昌都,西藏 367 | 365,101140400,林芝,西藏 368 | 366,101140500,山南,西藏 369 | 367,101140600,那曲,西藏 370 | 368,101140700,阿里,西藏 371 | 369,101320300,香港,香港 372 | 370,101330100,澳门,澳门 373 | -------------------------------------------------------------------------------- /city_data/city.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 9, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "import pymongo\n", 12 | "import pandas as pd\n", 13 | "client = pymongo.MongoClient('localhost',27017)\n", 14 | "db = client['Graduation_project']\n", 15 | "table = db['city']\n", 16 | "city = pd.DataFrame(list(table.find()))\n", 17 | "del city['_id']" 18 | ] 19 | }, 20 | { 21 | "cell_type": "code", 22 | "execution_count": 11, 23 | "metadata": {}, 24 | "outputs": [ 25 | { 26 | "data": { 27 | "text/html": [ 28 | "
\n", 29 | "\n", 42 | "\n", 43 | " \n", 44 | " \n", 45 | " \n", 46 | " \n", 47 | " \n", 48 | " \n", 49 | " \n", 50 | " \n", 51 | " \n", 52 | " \n", 53 | " \n", 54 | " \n", 55 | " \n", 56 | " \n", 57 | " \n", 58 | " \n", 59 | " \n", 60 | " \n", 61 | " \n", 62 | " \n", 63 | " \n", 64 | " \n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | "
code
0101010100北京北京
1101020100上海上海
2101030100天津天津
3101040100重庆重庆
4101050100哈尔滨黑龙江
\n", 84 | "
" 85 | ], 86 | "text/plain": [ 87 | " code 市 省\n", 88 | "0 101010100 北京 北京\n", 89 | "1 101020100 上海 上海\n", 90 | "2 101030100 天津 天津\n", 91 | "3 101040100 重庆 重庆\n", 92 | "4 101050100 哈尔滨 黑龙江" 93 | ] 94 | }, 95 | "execution_count": 11, 96 | "metadata": {}, 97 | "output_type": "execute_result" 98 | } 99 | ], 100 | "source": [ 101 | "city.head()" 102 | ] 103 | }, 104 | { 105 | "cell_type": "code", 106 | "execution_count": 13, 107 | "metadata": { 108 | "collapsed": true 109 | }, 110 | "outputs": [], 111 | "source": [ 112 | "city.to_csv('city.csv')" 113 | ] 114 | }, 115 | { 116 | "cell_type": "code", 117 | "execution_count": 16, 118 | "metadata": {}, 119 | "outputs": [ 120 | { 121 | "data": { 122 | "text/html": [ 123 | "
\n", 124 | "\n", 137 | "\n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | "
code
0101010100北京北京
1101020100上海上海
2101030100天津天津
3101040100重庆重庆
4101050100黑龙江哈尔滨
\n", 179 | "
" 180 | ], 181 | "text/plain": [ 182 | " code 省 市\n", 183 | "0 101010100 北京 北京\n", 184 | "1 101020100 上海 上海\n", 185 | "2 101030100 天津 天津\n", 186 | "3 101040100 重庆 重庆\n", 187 | "4 101050100 黑龙江 哈尔滨" 188 | ] 189 | }, 190 | "execution_count": 16, 191 | "metadata": {}, 192 | "output_type": "execute_result" 193 | } 194 | ], 195 | "source": [ 196 | "import pandas as pd\n", 197 | "\n", 198 | "city = pd.read_csv('city.csv')\n", 199 | "data = city[['code', '省', '市']]\n", 200 | "data.head()" 201 | ] 202 | }, 203 | { 204 | "cell_type": "code", 205 | "execution_count": 20, 206 | "metadata": { 207 | "collapsed": true 208 | }, 209 | "outputs": [], 210 | "source": [ 211 | "city = data.values" 212 | ] 213 | }, 214 | { 215 | "cell_type": "code", 216 | "execution_count": 21, 217 | "metadata": {}, 218 | "outputs": [ 219 | { 220 | "data": { 221 | "text/plain": [ 222 | "array([[101010100, '北京', '北京'],\n", 223 | " [101020100, '上海', '上海'],\n", 224 | " [101030100, '天津', '天津'],\n", 225 | " ...,\n", 226 | " [101140700, '西藏', '阿里'],\n", 227 | " [101320300, '香港', '香港'],\n", 228 | " [101330100, '澳门', '澳门']], dtype=object)" 229 | ] 230 | }, 231 | "execution_count": 21, 232 | "metadata": {}, 233 | "output_type": "execute_result" 234 | } 235 | ], 236 | "source": [ 237 | "city" 238 | ] 239 | }, 240 | { 241 | "cell_type": "code", 242 | "execution_count": 23, 243 | "metadata": {}, 244 | "outputs": [ 245 | { 246 | "data": { 247 | "text/plain": [ 248 | "'上海'" 249 | ] 250 | }, 251 | "execution_count": 23, 252 | "metadata": {}, 253 | "output_type": "execute_result" 254 | } 255 | ], 256 | "source": [ 257 | "city[1][2]" 258 | ] 259 | }, 260 | { 261 | "cell_type": "code", 262 | "execution_count": 36, 263 | "metadata": { 264 | "collapsed": true 265 | }, 266 | "outputs": [], 267 | "source": [ 268 | "# 连接到MongoDB\n", 269 | "MONGO_URL = 'localhost'\n", 270 | "MONGO_DB = 'Graduation_project'\n", 271 | "MONGO_COLLECTION = 'jobs_info'\n", 272 | "client = pymongo.MongoClient(MONGO_URL, port=27017)\n", 273 | "db = 
def city(path='city.csv'):
    """Load the city lookup table consumed by the job spider.

    Args:
        path: CSV file to read. Defaults to ``'city.csv'``, resolved against
            the current working directory, which is what the previously
            hard-coded call did. Pass an explicit path to run the spider
            from another directory.

    Returns:
        A 2-D ``numpy.ndarray`` with one row per city. The columns are, in
        order, ``code``, ``省`` (province) and ``市`` (city).

    Raises:
        FileNotFoundError: if ``path`` does not exist.
        KeyError: if the CSV lacks any of the three expected columns.
    """
    city_data = pd.read_csv(path)
    # Select the columns by name so the positional order seen by callers
    # (code, province, city) does not depend on the CSV's own column layout.
    return city_data[['code', '省', '市']].values
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/brandonchow1997/bosszhipin_spider/8e78ba18dee2f46d297ba3fcd11f5cc731989367/pics/10.png -------------------------------------------------------------------------------- /pics/11.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brandonchow1997/bosszhipin_spider/8e78ba18dee2f46d297ba3fcd11f5cc731989367/pics/11.png -------------------------------------------------------------------------------- /pics/12.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brandonchow1997/bosszhipin_spider/8e78ba18dee2f46d297ba3fcd11f5cc731989367/pics/12.png -------------------------------------------------------------------------------- /pics/13.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brandonchow1997/bosszhipin_spider/8e78ba18dee2f46d297ba3fcd11f5cc731989367/pics/13.png -------------------------------------------------------------------------------- /pics/14.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brandonchow1997/bosszhipin_spider/8e78ba18dee2f46d297ba3fcd11f5cc731989367/pics/14.png -------------------------------------------------------------------------------- /pics/2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brandonchow1997/bosszhipin_spider/8e78ba18dee2f46d297ba3fcd11f5cc731989367/pics/2.png -------------------------------------------------------------------------------- /pics/3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brandonchow1997/bosszhipin_spider/8e78ba18dee2f46d297ba3fcd11f5cc731989367/pics/3.png 
-------------------------------------------------------------------------------- /pics/4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brandonchow1997/bosszhipin_spider/8e78ba18dee2f46d297ba3fcd11f5cc731989367/pics/4.png -------------------------------------------------------------------------------- /pics/5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brandonchow1997/bosszhipin_spider/8e78ba18dee2f46d297ba3fcd11f5cc731989367/pics/5.png -------------------------------------------------------------------------------- /pics/6.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brandonchow1997/bosszhipin_spider/8e78ba18dee2f46d297ba3fcd11f5cc731989367/pics/6.png -------------------------------------------------------------------------------- /pics/7.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brandonchow1997/bosszhipin_spider/8e78ba18dee2f46d297ba3fcd11f5cc731989367/pics/7.png -------------------------------------------------------------------------------- /pics/8.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brandonchow1997/bosszhipin_spider/8e78ba18dee2f46d297ba3fcd11f5cc731989367/pics/8.png -------------------------------------------------------------------------------- /pics/9.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brandonchow1997/bosszhipin_spider/8e78ba18dee2f46d297ba3fcd11f5cc731989367/pics/9.png -------------------------------------------------------------------------------- /pics/ML部分/为标签编码.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/brandonchow1997/bosszhipin_spider/8e78ba18dee2f46d297ba3fcd11f5cc731989367/pics/ML部分/为标签编码.png -------------------------------------------------------------------------------- /pics/ML部分/决策树训练集误差.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brandonchow1997/bosszhipin_spider/8e78ba18dee2f46d297ba3fcd11f5cc731989367/pics/ML部分/决策树训练集误差.png -------------------------------------------------------------------------------- /pics/ML部分/划分训练集与测试集.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brandonchow1997/bosszhipin_spider/8e78ba18dee2f46d297ba3fcd11f5cc731989367/pics/ML部分/划分训练集与测试集.png -------------------------------------------------------------------------------- /pics/ML部分/变量重要性.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brandonchow1997/bosszhipin_spider/8e78ba18dee2f46d297ba3fcd11f5cc731989367/pics/ML部分/变量重要性.png -------------------------------------------------------------------------------- /pics/ML部分/属性合并.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brandonchow1997/bosszhipin_spider/8e78ba18dee2f46d297ba3fcd11f5cc731989367/pics/ML部分/属性合并.png -------------------------------------------------------------------------------- /pics/ML部分/筛选岗位数量前150.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brandonchow1997/bosszhipin_spider/8e78ba18dee2f46d297ba3fcd11f5cc731989367/pics/ML部分/筛选岗位数量前150.png -------------------------------------------------------------------------------- /pics/ML部分/编码分类结果.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/brandonchow1997/bosszhipin_spider/8e78ba18dee2f46d297ba3fcd11f5cc731989367/pics/ML部分/编码分类结果.png -------------------------------------------------------------------------------- /pics/ML部分/过滤后的分布.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brandonchow1997/bosszhipin_spider/8e78ba18dee2f46d297ba3fcd11f5cc731989367/pics/ML部分/过滤后的分布.png -------------------------------------------------------------------------------- /pics/ML部分/过滤薪资.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brandonchow1997/bosszhipin_spider/8e78ba18dee2f46d297ba3fcd11f5cc731989367/pics/ML部分/过滤薪资.png -------------------------------------------------------------------------------- /pics/ML部分/重要性分析.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brandonchow1997/bosszhipin_spider/8e78ba18dee2f46d297ba3fcd11f5cc731989367/pics/ML部分/重要性分析.png -------------------------------------------------------------------------------- /pics/ML部分/随机森林训练集误差.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brandonchow1997/bosszhipin_spider/8e78ba18dee2f46d297ba3fcd11f5cc731989367/pics/ML部分/随机森林训练集误差.png -------------------------------------------------------------------------------- /pics/url.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brandonchow1997/bosszhipin_spider/8e78ba18dee2f46d297ba3fcd11f5cc731989367/pics/url.png -------------------------------------------------------------------------------- /spiders/citylist_spider.py: -------------------------------------------------------------------------------- 1 | # version:1.0 2 | # author:brandon 3 | # date:2018/10/20 4 | 5 | # common imports 6 | import requests 7 | import 
# Download the city catalogue
def get(timeout=10):
    """Fetch the city catalogue JSON from zhipin.com and return it decoded.

    Args:
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` can block indefinitely on a stalled
            connection.

    Returns:
        The decoded JSON payload of the response.
    """
    header = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/69.0.3497.12 Safari/537.36 '
    }
    url = 'https://www.zhipin.com/common/data/city.json'
    response = requests.get(url, headers=header, timeout=timeout)
    return response.json()


def parse(data):
    """Walk the catalogue and store one record per city.

    Args:
        data: Payload returned by ``get()``. ``data['data']['cityList']`` is
            read as a list of provinces, each carrying a ``name`` and a
            ``subLevelModelList`` of cities with ``name`` and ``code``.
    """
    for provence in data['data']['cityList']:
        provence_name = provence['name']
        # Treat a missing/None sub-list as "no cities" instead of crashing.
        for city in provence['subLevelModelList'] or ():
            city_info = {
                '省': provence_name,
                '市': city['name'],
                'code': city['code'],
            }
            print(city_info)
            save_to_mongo(city_info)


# Connect to MongoDB
MONGO_URL = 'localhost'
MONGO_DB = 'Graduation_project'
MONGO_COLLECTION = 'city'
client = pymongo.MongoClient(MONGO_URL, port=27017)
db = client[MONGO_DB]


def save_to_mongo(data):
    """Insert one document into the city collection, reporting the outcome.

    Storage stays best-effort: a failure is printed (now with its cause)
    and the crawl continues.

    Args:
        data: Document to store. ``insert_one`` adds an ``_id`` key to it.
    """
    try:
        # Collection.insert() was removed in PyMongo 4; insert_one() is the
        # supported single-document API.
        db[MONGO_COLLECTION].insert_one(data)
    except Exception as exc:
        print('存储到 MongoDB 失败', repr(exc))
    else:
        print('存储到 MongoDB 成功')


if __name__ == '__main__':
    city = get()
    parse(city)
# Page download
def get_page(page, city_code, timeout=10):
    """Download one page of job listings for a city and return its HTML.

    Args:
        page: Page number placed in the ``page`` query parameter.
        city_code: City code placed after ``c`` in the URL path.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` can block indefinitely on a stalled
            connection.

    Returns:
        The response body as text.
    """
    header = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/69.0.3497.12 Safari/537.36 '
    }
    print('正在爬取第', page, '页')
    url = 'https://www.zhipin.com/c{code}-p100511/?page={page}&ka=page-{page}'.format(code=city_code, page=page)
    response = requests.get(url, headers=header, timeout=timeout)
    return response.text


def _first(nodes, default=''):
    """Return the first XPath match, or ``default`` when nothing matched."""
    return nodes[0] if nodes else default


# Page parsing
def parse(html, city, provence, page):
    """Extract every job listing from a result page and store it.

    A listing with a missing field is stored with an empty string in that
    field; previously the ``[0]`` lookup raised ``IndexError`` and the
    caller abandoned the rest of the city.

    Args:
        html: Page source returned by ``get_page``.
        city: City name; combined with ``page`` into the ``signal`` key.
        provence: Province name stored with each record.
        page: Page number the HTML came from.
    """
    data = etree.HTML(html)
    items = data.xpath('//*[@id="main"]/div/div[2]/ul/li')
    # The same for every listing on the page: marks this city/page as done.
    signal = city + str(page)
    for item in items:
        job_title = _first(item.xpath('./div/div[1]/h3/a/div[1]/text()'))
        job_salary = _first(item.xpath('./div/div[1]/h3/a/span/text()'))
        job_company = _first(item.xpath('./div/div[2]/div/h3/a/text()'))
        job_experience = _first(item.xpath('./div/div[1]/p/text()[2]'))
        job_degree = _first(item.xpath('./div/div[1]/p/text()[3]'))
        company_scale = _first(item.xpath('./div/div[2]/div/p/text()[3]'))
        # The mean salary is only printed; it is not part of the stored record.
        avg_salary = average(job_salary)
        print(provence, '|', city, '|', job_title, '|', job_salary, '|', job_company, '|', job_experience, '|', job_degree, '|', company_scale,
              '|', avg_salary)
        job = {
            'signal': signal,
            '省': provence,
            '城市': city,
            '职位名称': job_title,
            '职位薪资': job_salary,
            '公司名称': job_company,
            '工作经验': job_experience,
            '学历要求': job_degree,
            '公司规模': company_scale
        }
        save_to_mongo(job)


# Compiled once; matches each run of digits in a salary string.
_SALARY_NUMBER = re.compile(r'\d+')


# Salary mean
def average(job_salary):
    """Return the midpoint of the salary range written in ``job_salary``.

    The first two numbers in the text are taken as the lower and upper
    bound. A single number is returned as-is, and any further numbers are
    ignored. The previous version summed every number and always divided
    by two, which halved single figures and let a third number inflate
    the result.

    Args:
        job_salary: Salary text such as ``'15k-25k'``.

    Returns:
        The mean of the bounds as a float, ``0.0`` when the text contains
        no digits, or ``0`` when ``job_salary`` is not a string.
    """
    try:
        numbers = [int(digits) for digits in _SALARY_NUMBER.findall(job_salary)]
    except TypeError:
        # Non-string input (e.g. None): keep the original fallback of 0.
        return 0
    bounds = numbers[:2]
    if not bounds:
        return 0.0
    return sum(bounds) / len(bounds)


# MongoDB connection settings
MONGO_URL = 'localhost'
MONGO_DB = 'Graduation_project'
MONGO_COLLECTION = 'jobs_info'
client = pymongo.MongoClient(MONGO_URL, port=27017)
db = client[MONGO_DB]


# Work out which city/page combinations were stored by earlier runs
check = pd.DataFrame(list(db[MONGO_COLLECTION].find()))
if 'signal' in check.columns:
    check_list = check[['signal']]
else:
    # Empty collection (first run): the frame has no columns at all, so
    # selecting 'signal' would raise KeyError.
    check_list = pd.DataFrame(columns=['signal'])
grouped = check_list.groupby('signal')
# Built once so the per-page membership test does not redo the group-by.
crawled_signals = set(check_list['signal'])
# -----------------


def save_to_mongo(data):
    """Insert one job document into the collection, reporting the outcome.

    Storage stays best-effort: a failure is printed (now with its cause)
    and the crawl continues.

    Args:
        data: Document to store. ``insert_one`` adds an ``_id`` key to it.
    """
    try:
        # Collection.insert() was removed in PyMongo 4; insert_one() is the
        # supported single-document API.
        db[MONGO_COLLECTION].insert_one(data)
    except Exception as exc:
        print('存储到 MongoDB 失败', repr(exc))
    else:
        print('存储到 MongoDB 成功')


def jobspider(city_code, city, provence, max_page=30):
    """Crawl the listing pages of one city, skipping pages already stored.

    Args:
        city_code: City code passed to ``get_page``.
        city: City name; ``city + str(page)`` is the signal checked against
            previously stored records.
        provence: Province name stored with each record.
        max_page: Highest page number to request (inclusive). Defaults to
            the previous hard-coded limit of 30.
    """
    for page in range(1, max_page + 1):
        if city + str(page) in crawled_signals:
            continue
        try:
            html = get_page(page, city_code)
            # ------------ parse and store ---------------
            parse(html, city, provence, page)
            print('-' * 100)
            # Random pause between requests.
            time.sleep(random.randint(0, 3))
        except Exception as exc:
            # Give up on this city, but say why instead of stopping silently.
            print('爬取中断:', city, page, repr(exc))
            break


if __name__ == '__main__':
    # Each row is (code, province, city). Unpacking into fresh names keeps
    # the imported ``city`` module from being rebound by the loop.
    for city_code, provence, city_name in city.city():
        # Job spider
        jobspider(city_code, city_name, provence)
    # -----------------