├── .gitignore
├── LICENSE.txt
├── Plagiarism.md
├── README.md
├── Untitled.ipynb
├── images
    ├── 201904_salary_by_cities.png
    ├── 2019_04_pl_pie.png
    └── 2019_04_pl_word_cloud.png
├── py
    ├── 2db.py
    ├── black_list
    ├── clean.py
    ├── common.py
    ├── company.py
    ├── config.py
    ├── db.py
    ├── download.py
    ├── feature_engineering.py
    ├── multiprocess.py
    ├── old.download_lagou.py
    ├── stats.py
    └── weighted.py
├── reports
    ├── 201904
    │   ├── 996_Survey.ipynb
    │   ├── Beijing.ipynb
    │   ├── Chengdu.ipynb
    │   ├── First_Tier.ipynb
    │   ├── Nanjing.ipynb
    │   ├── Qingdao.ipynb
    │   ├── Survey.ipynb
    │   ├── china_v2.ipynb
    │   ├── cities_basemap.ipynb
    │   ├── cities_pyecharts.ipynb
    │   ├── first_tier_v2.ipynb
    │   └── programming_language.ipynb
    ├── 201905
    │   ├── China.ipynb
    │   ├── cities_basemap.ipynb
    │   ├── first_tier_v2.ipynb
    │   ├── programming_language.ipynb
    │   └── provinces_basemap.ipynb
    ├── 201906
    │   ├── cities_basemap.ipynb
    │   ├── first_tier.ipynb
    │   ├── programming_language.ipynb
    │   └── provinces_basemap.ipynb
    ├── 201907
    │   ├── cities_basemap.ipynb
    │   ├── first_tier_v2.ipynb
    │   ├── programming_language.ipynb
    │   └── provinces_basemap.ipynb
    ├── 201908
    │   ├── cities_basemap.ipynb
    │   ├── first_tier_v2.ipynb
    │   ├── programming_language.ipynb
    │   └── provinces_basemap.ipynb
    ├── 201909
    │   ├── cities_basemap.ipynb
    │   ├── first_tier_v2.ipynb
    │   ├── programming_language.ipynb
    │   └── provinces_basemap.ipynb
    ├── 201910
    │   ├── General_Stats.ipynb
    │   ├── cities_basemap.ipynb
    │   ├── first_tier_v2.ipynb
    │   ├── programming_language.ipynb
    │   └── provinces_basemap.ipynb
    ├── 201911
    │   ├── General_Stats.ipynb
    │   ├── cities_basemap.ipynb
    │   ├── first_tier_v2.ipynb
    │   ├── programming_language.ipynb
    │   └── provinces_basemap.ipynb
    ├── 201912
    │   ├── General_Stats.ipynb
    │   ├── cities_basemap.ipynb
    │   ├── config.py
    │   ├── first_tier_v2.ipynb
    │   ├── map_wrapper.py
    │   ├── programming_language.ipynb
    │   └── provinces_basemap.ipynb
    ├── 202001
    │   ├── General_Stats.ipynb
    │   ├── cities_basemap.ipynb
    │   ├── config.py
    │   ├── first_tier.ipynb
    │   ├── map_wrapper.py
    │   ├── programming_language.ipynb
    │   └── provinces_basemap.ipynb
    ├── 202002
    │   ├── General_Stats.ipynb
    │   ├── cities_basemap.ipynb
    │   ├── config.py
    │   ├── first_tier.ipynb
    │   ├── map_wrapper.py
    │   ├── programming_language.ipynb
    │   └── provinces_basemap.ipynb
    ├── 202003
    │   ├── General_Stats.ipynb
    │   ├── cities_basemap.ipynb
    │   ├── config.py
    │   ├── first_tier.ipynb
    │   ├── machine_learning.ipynb
    │   ├── map_wrapper.py
    │   ├── programming_language.ipynb
    │   └── provinces_basemap.ipynb
    ├── 202004
    │   ├── General_Stats.ipynb
    │   ├── cities_basemap.ipynb
    │   ├── config.py
    │   ├── first_tier.ipynb
    │   ├── machine_learning.ipynb
    │   ├── map_wrapper.py
    │   ├── programming_language.ipynb
    │   └── provinces_basemap.ipynb
    ├── 202005
    │   ├── General_Stats.ipynb
    │   ├── cities_basemap.ipynb
    │   ├── config.py
    │   ├── first_tier.ipynb
    │   ├── machine_learning.ipynb
    │   ├── map_wrapper.py
    │   ├── programming_language.ipynb
    │   └── provinces_basemap.ipynb
    ├── 202006
    │   ├── General_Stats.ipynb
    │   ├── cities_basemap.ipynb
    │   ├── config.py
    │   ├── first_tier.ipynb
    │   ├── machine_learning.ipynb
    │   ├── map_wrapper.py
    │   ├── programming_language.ipynb
    │   └── provinces_basemap.ipynb
    ├── 202007
    │   ├── General_Stats.ipynb
    │   ├── Untitled.ipynb
    │   ├── cities_basemap.ipynb
    │   ├── config.py
    │   ├── first_tier.ipynb
    │   ├── machine_learning.ipynb
    │   ├── map_wrapper.py
    │   ├── programming_language.ipynb
    │   └── provinces_basemap.ipynb
    ├── 202008
    │   ├── General_Stats.ipynb
    │   ├── cities_basemap.ipynb
    │   ├── config.py
    │   ├── first_tier.ipynb
    │   ├── machine_learning.ipynb
    │   ├── map_wrapper.py
    │   ├── programming_language.ipynb
    │   ├── provinces_basemap.ipynb
    │   └── trend.ipynb
    ├── 202009
    │   ├── General_Stats.ipynb
    │   ├── cities_basemap.ipynb
    │   ├── config.py
    │   ├── first_tier.ipynb
    │   ├── machine_learning.ipynb
    │   ├── map_wrapper.py
    │   ├── programming_language.ipynb
    │   ├── provinces_basemap.ipynb
    │   └── trend.ipynb
    ├── 202010
    │   ├── General_Stats.ipynb
    │   ├── cities_basemap.ipynb
    │   ├── config.py
    │   ├── first_tier.ipynb
    │   ├── machine_learning.ipynb
    │   ├── map_wrapper.py
    │   ├── programming_language.ipynb
    │   ├── provinces_basemap.ipynb
    │   └── trend.ipynb
    ├── 202011
    │   ├── General_Stats.ipynb
    │   ├── anomaly_explore.ipynb
    │   ├── cities_basemap.ipynb
    │   ├── config.py
    │   ├── first_tier.ipynb
    │   ├── machine_learning.ipynb
    │   ├── map_wrapper.py
    │   ├── programming_language.ipynb
    │   ├── provinces_basemap.ipynb
    │   └── trend.ipynb
    ├── 202012
    │   ├── General_Stats.ipynb
    │   ├── anomaly_explore.ipynb
    │   ├── cities_basemap.ipynb
    │   ├── config.py
    │   ├── first_tier.ipynb
    │   ├── machine_learning.ipynb
    │   ├── map_wrapper.py
    │   ├── programming_language.ipynb
    │   ├── provinces_basemap.ipynb
    │   └── trend.ipynb
    ├── 202101
    │   ├── General_Stats.ipynb
    │   ├── anomaly_explore.ipynb
    │   ├── cities_basemap.ipynb
    │   ├── config.py
    │   ├── first_tier.ipynb
    │   ├── machine_learning.ipynb
    │   ├── map_wrapper.py
    │   ├── programming_language.ipynb
    │   ├── provinces_basemap.ipynb
    │   └── trend.ipynb
    ├── city_locations.csv
    └── geo_data
    │   ├── province_city.csv
    │   └── provincial_capital_locations.csv
├── spyder.ipynb
├── sql
    ├── create_city_stats.sql
    ├── create_company.sql
    ├── create_general_stats.sql
    ├── create_table.sql
    ├── create_table_v2.sql
    ├── create_table_v3.sql
    ├── feature_engineering.sql
    ├── update.sql
    ├── update_ml.sql
    └── update_v2.sql
└── whitelist.txt


/.gitignore:
--------------------------------------------------------------------------------
1 | .vs/
2 | .vscode/
3 | .spyproject/
4 | .ipynb_checkpoints/
5 | __pycache__
6 | debug.log
7 | geckodriver.log


--------------------------------------------------------------------------------
/LICENSE.txt:
--------------------------------------------------------------------------------
 1 | Copyright (c) <year> <copyright holders>
 2 | 
 3 | Anti 996 License Version 1.0 (Draft)
 4 | 
 5 | Permission is hereby granted to any individual or legal entity
 6 | obtaining a copy of this licensed work (including the source code,
 7 | documentation and/or related items, hereinafter collectively referred
 8 | to as the "licensed work"), free of charge, to deal with the licensed
 9 | work for any purpose, including without limitation, the rights to use,
10 | reproduce, modify, prepare derivative works of, distribute, publish 
11 | and sublicense the licensed work, subject to the following conditions:
12 | 
13 | 1. The individual or the legal entity must conspicuously display,
14 | without modification, this License and the notice on each redistributed 
15 | or derivative copy of the Licensed Work.
16 | 
17 | 2. The individual or the legal entity must strictly comply with all
18 | applicable laws, regulations, rules and standards of the jurisdiction
19 | relating to labor and employment where the individual is physically
20 | located or where the individual was born or naturalized; or where the
21 | legal entity is registered or is operating (whichever is stricter). In
22 | case that the jurisdiction has no such laws, regulations, rules and
23 | standards or its laws, regulations, rules and standards are
24 | unenforceable, the individual or the legal entity are required to
25 | comply with Core International Labor Standards.
26 | 
27 | 3. The individual or the legal entity shall not induce or force its
28 | employee(s), whether full-time or part-time, or its independent
29 | contractor(s), in any methods, to agree in oral or written form, to
30 | directly or indirectly restrict, weaken or relinquish his or her
31 | rights or remedies under such laws, regulations, rules and standards
32 | relating to labor and employment as mentioned above, no matter whether
33 | such written or oral agreement are enforceable under the laws of the
34 | said jurisdiction, nor shall such individual or the legal entity
35 | limit, in any methods, the rights of its employee(s) or independent
36 | contractor(s) from reporting or complaining to the copyright holder or
37 | relevant authorities monitoring the compliance of the license about
38 | its violation(s) of the said license.
39 | 
40 | THE LICENSED WORK IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
41 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
42 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
43 | IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY CLAIM,
44 | DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
45 | OTHERWISE, ARISING FROM, OUT OF OR IN ANY WAY CONNECTION WITH THE
46 | LICENSED WORK OR THE USE OR OTHER DEALINGS IN THE LICENSED WORK.
47 | 


--------------------------------------------------------------------------------
/Plagiarism.md:
--------------------------------------------------------------------------------
 1 | # Blacklist of Plagiarism 剽窃黑名单
 2 | 
 3 | 以下作者剽窃了该文：
 4 | 
 5 | [2017年一线城市程序员工资大调查](https://blog.csdn.net/juwikuang/article/details/72888792)
 6 | 
 7 | 
 8 | |网站|名字|证据|Archive|注解|
 9 | |--|--|--|--|--|
10 | |CSDN|代码技巧|[链接](https://blog.csdn.net/tTU1EvLDeLFq5btqiK/article/details/78929087)||未注明出处|
11 | |公众号|代码技巧|[链接](https://mp.weixin.qq.com/s/8kxgoaAEn-Qoz6M0-MhJNw)|[Archive](https://web.archive.org/web/20190420151822/https://mp.weixin.qq.com/s/8kxgoaAEn-Qoz6M0-MhJNw)|未注明出处，和上面是一家|
12 | |CSDN|黄小斜|[链接](https://blog.csdn.net/a724888/article/details/85841595)||未注明出处|
13 | |公众号|程序员江湖|[链接](https://mp.weixin.qq.com/s?__biz=MzUyOTk5NDQwOA==&mid=2247484646&idx=1&sn=9e71c0b3a411f19596e76a719663c003&chksm=fa59c321cd2e4a37602247cf906de6fe842dd20bf9f6940540dfd7d95aaeff49d947a7e4c309&mpshare=1&scene=1&srcid=0420GOJD9QoZi5FgxftRZ9zx&key=cb077656fa10b4eb27f368d30cef703ffcc82770b492b9a167537ddc4000a70a4468f004bea96a7914dad864c99824061f96cc11ffd063e41a281895d0f8677bb0eb2e8d42e2cd6e8b80844fa9914ac4&ascene=1&uin=MjU2NTc5MTk0Mg%3D%3D&devicetype=Windows+10&version=62060739&lang=en&pass_ticket=uf9nNPrm%2FwExDIh6qF0atahVZa%2BFwjMGxjOE8z0Uy5i0YwiioUkwCxNWMl7tVZmp)|[Archive](https://web.archive.org/web/20190420151741/https://mp.weixin.qq.com/s?__biz=MzUyOTk5NDQwOA==&mid=2247484646&idx=1&sn=9e71c0b3a411f19596e76a719663c003&chksm=fa59c321cd2e4a37602247cf906de6fe842dd20bf9f6940540dfd7d95aaeff49d947a7e4c309&mpshare=1&scene=1&srcid=0420GOJD9QoZi5FgxftRZ9zx&key=cb077656fa10b4eb27f368d30cef703ffcc82770b492b9a167537ddc4000a70a4468f004bea96a7914dad864c99824061f96cc11ffd063e41a281895d0f8677bb0eb2e8d42e2cd6e8b80844fa9914ac4&ascene=1&uin=MjU2NTc5MTk0Mg%3D%3D&devicetype=Windows+10&version=62060739&lang=en&pass_ticket=uf9nNPrm%2FwExDIh6qF0atahVZa%2BFwjMGxjOE8z0Uy5i0YwiioUkwCxNWMl7tVZmp)|出处写为【码农有道】，和上面是一家|
14 | |公众号|码农有道|[链接](https://mp.weixin.qq.com/s?__biz=MzIwNTc4NTEwOQ==&mid=2247486120&idx=1&sn=9c4d677ff9823254c7cf3f86b79dd8dd&scene=21#wechat_redirect)|[Archive](https://web.archive.org/web/20190420151309/https://mp.weixin.qq.com/s?__biz=MzIwNTc4NTEwOQ==&mid=2247486120&idx=1&sn=9c4d677ff9823254c7cf3f86b79dd8dd&scene=21%23wechat_redirect)|冒充原创|
15 | |CSDN|Java成长记_Camel|[链接](https://blog.csdn.net/qq_30225725/article/details/86729448)||冒充原创，篡改（2017改成2019）|
16 | |CSDN| 运维派V|[链接](https://blog.csdn.net/ki8qzvka6gz4n450m/article/details/79548177)||冒充原创，误导（2017误导成2019）|
17 | |CSDN| weixin_34248118|[链接](https://blog.csdn.net/weixin_34248118/article/details/87058659)||冒充原创|
18 | |CSDN| 程序员之家v|[链接](https://blog.csdn.net/EGEFCXzo3Ha1x4/article/details/79454135)||未注明出处，百度关键字【一线城市 程序员 工资】头条|
19 | |公众号|养码场|[链接](https://mp.weixin.qq.com/s/EOi3wY0d6K2z7cTZghscvA)|[Archive](https://web.archive.org/web/20190420151218/https://mp.weixin.qq.com/s/EOi3wY0d6K2z7cTZghscvA)|未注明出处，和上面是一家，误导（2017误导为2018）|
20 | |公众号|程序员之家|[链接](https://mp.weixin.qq.com/s/Q4rZdblmjPVJKuoFOMKRjA)|[Archive](https://web.archive.org/web/20190420151129/https://mp.weixin.qq.com/s/Q4rZdblmjPVJKuoFOMKRjA)|未注明出处，误导（2017误导为2018）|
21 | |公众号|千锋教育|[链接](https://mp.weixin.qq.com/s/Xro1BhiYsdwb5IOBPQtZbQ)|[Archive](https://web.archive.org/web/20190420151028/https://mp.weixin.qq.com/s/Xro1BhiYsdwb5IOBPQtZbQ)|未注明出处，误导（2017误导为2018）|
22 | |搜狐号|华俊竹传媒|[链接](http://m.sohu.com/a/277091383_120001579)||未注明出处，误导（2017误导为2018）|
23 | |企鹅号|码农有道|[链接](https://new.qq.com/omn/20181117/20181117B00KBW.html)||未注明出处，误导（2017误导为2018）|
24 | |CSDN| Exceed Oneself|[链接](https://blog.csdn.net/ll666634/article/details/79156271)||未注明出处，误导（2017误导为2018）|
25 | |公众号|Python人工智能|[链接](https://mp.weixin.qq.com/s/Umlu3HI8A-XnJcAK5U9g1Q)|[Archive](https://web.archive.org/web/20190420150859/https://mp.weixin.qq.com/s/Umlu3HI8A-XnJcAK5U9g1Q)|未注明出处，误导（2017误导为2018）|
26 | |公众号|资料在线|[链接](https://mp.weixin.qq.com/s/RiRSGVKzj0gkZoBnVYlX_Q)|[Archive](https://web.archive.org/web/20190420150643/https://mp.weixin.qq.com/s/RiRSGVKzj0gkZoBnVYlX_Q)|出处错误，误导（2017误导为2018）|
27 | |公众号|Java之猿程之家|[链接](https://mp.weixin.qq.com/s/H1QQ3cc64jZF_fHeP2gkQA)|[Archive](https://web.archive.org/web/20190420150511/https://mp.weixin.qq.com/s/H1QQ3cc64jZF_fHeP2gkQA)|未注明出处，误导（2017误导为2019）|
28 | |简书|Grady_Camel|[链接](https://www.jianshu.com/p/a33f7281568a)|[Archive](https://web.archive.org/web/20190420140556/https://www.jianshu.com/p/a33f7281568a)|篡改数据！|
29 | 
30 | 以上公众号，博主，抄袭别人的文章，还篡改，造成误导。请大家取消关注，谢谢。
31 | 
32 | 抄袭文章删除的，联系本人，从黑名单里~~删除~~。
33 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # Stats of Chinese Developers
 2 | #统计中国程序员的就业情况
 3 | 
 4 | This repo analyzes Chinese job websites and compiles statistics on developer employment.
 5 | 根据招聘网站，统计程序员就业信息。
 6 | 
 7 | ## Salary 程序员工资
 8 | 
 9 | ![Salary Distribution](https://github.com/juwikuang/china_job_survey/blob/master/images/201904_salary_by_cities.png?raw=true)
10 | 
11 | ## Programming Languages 编程语言
12 | 
13 | | - | Language       | Percentage |
14 | |---|------------|--------|
15 | | 1 | java       | 29.28% |
16 | | 2 | cpp        | 16.08% |
17 | | 3 | javascript | 15.09% |
18 | | 4 | c_sharp    | 10.95% |
19 | | 5 | python     | 8.21%  |
20 | 
21 | ![](https://github.com/juwikuang/job_survey/blob/master/images/2019_04_pl_word_cloud.png?raw=true)
22 | 
23 | For users from China, please use this link to render the reports:
24 | 
25 | [https://nbviewer.jupyter.org/](https://nbviewer.jupyter.org/)
26 | 
27 | 


--------------------------------------------------------------------------------
/Untitled.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": 1,
  6 |    "metadata": {},
  7 |    "outputs": [],
  8 |    "source": [
  9 |     "provinces = {}\n",
 10 |     "provinces['北京'] = '010000'\n",
 11 |     "provinces['上海'] = '020000'\n",
 12 |     "provinces['广东'] = '030000'\n",
 13 |     "provinces['深圳'] = '040000'\n",
 14 |     "provinces['天津'] = '050000'\n",
 15 |     "provinces['重庆'] = '060000'\n",
 16 |     "provinces['江苏'] = '070000'\n",
 17 |     "provinces['浙江'] = '080000'\n",
 18 |     "provinces['四川'] = '090000'"
 19 |    ]
 20 |   },
 21 |   {
 22 |    "cell_type": "code",
 23 |    "execution_count": 2,
 24 |    "metadata": {},
 25 |    "outputs": [
 26 |     {
 27 |      "data": {
 28 |       "text/plain": [
 29 |        "{'北京': '010000',\n",
 30 |        " '上海': '020000',\n",
 31 |        " '广东': '030000',\n",
 32 |        " '深圳': '040000',\n",
 33 |        " '天津': '050000',\n",
 34 |        " '重庆': '060000',\n",
 35 |        " '江苏': '070000',\n",
 36 |        " '浙江': '080000',\n",
 37 |        " '四川': '090000'}"
 38 |       ]
 39 |      },
 40 |      "execution_count": 2,
 41 |      "metadata": {},
 42 |      "output_type": "execute_result"
 43 |     }
 44 |    ],
 45 |    "source": [
 46 |     "provinces"
 47 |    ]
 48 |   },
 49 |   {
 50 |    "cell_type": "code",
 51 |    "execution_count": 3,
 52 |    "metadata": {},
 53 |    "outputs": [
 54 |     {
 55 |      "data": {
 56 |       "text/plain": [
 57 |        "dict_items([('北京', '010000'), ('上海', '020000'), ('广东', '030000'), ('深圳', '040000'), ('天津', '050000'), ('重庆', '060000'), ('江苏', '070000'), ('浙江', '080000'), ('四川', '090000')])"
 58 |       ]
 59 |      },
 60 |      "execution_count": 3,
 61 |      "metadata": {},
 62 |      "output_type": "execute_result"
 63 |     }
 64 |    ],
 65 |    "source": [
 66 |     "provinces.items()"
 67 |    ]
 68 |   },
 69 |   {
 70 |    "cell_type": "code",
 71 |    "execution_count": 7,
 72 |    "metadata": {},
 73 |    "outputs": [
 74 |     {
 75 |      "data": {
 76 |       "text/plain": [
 77 |        "[('四川', '090000'),\n",
 78 |        " ('浙江', '080000'),\n",
 79 |        " ('江苏', '070000'),\n",
 80 |        " ('重庆', '060000'),\n",
 81 |        " ('天津', '050000'),\n",
 82 |        " ('深圳', '040000'),\n",
 83 |        " ('广东', '030000'),\n",
 84 |        " ('上海', '020000'),\n",
 85 |        " ('北京', '010000')]"
 86 |       ]
 87 |      },
 88 |      "execution_count": 7,
 89 |      "metadata": {},
 90 |      "output_type": "execute_result"
 91 |     }
 92 |    ],
 93 |    "source": [
 94 |     "list(provinces.items())[::-1]"
 95 |    ]
 96 |   },
 97 |   {
 98 |    "cell_type": "code",
 99 |    "execution_count": null,
100 |    "metadata": {},
101 |    "outputs": [],
102 |    "source": []
103 |   }
104 |  ],
105 |  "metadata": {
106 |   "kernelspec": {
107 |    "display_name": "Python 3",
108 |    "language": "python",
109 |    "name": "python3"
110 |   },
111 |   "language_info": {
112 |    "codemirror_mode": {
113 |     "name": "ipython",
114 |     "version": 3
115 |    },
116 |    "file_extension": ".py",
117 |    "mimetype": "text/x-python",
118 |    "name": "python",
119 |    "nbconvert_exporter": "python",
120 |    "pygments_lexer": "ipython3",
121 |    "version": "3.7.6"
122 |   }
123 |  },
124 |  "nbformat": 4,
125 |  "nbformat_minor": 4
126 | }
127 | 


--------------------------------------------------------------------------------
/images/201904_salary_by_cities.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/EricWebsmith/china_job_survey/87c8db28d3ba28729b984b38fd990f385c8e45fb/images/201904_salary_by_cities.png


--------------------------------------------------------------------------------
/images/2019_04_pl_pie.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/EricWebsmith/china_job_survey/87c8db28d3ba28729b984b38fd990f385c8e45fb/images/2019_04_pl_pie.png


--------------------------------------------------------------------------------
/images/2019_04_pl_word_cloud.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/EricWebsmith/china_job_survey/87c8db28d3ba28729b984b38fd990f385c8e45fb/images/2019_04_pl_word_cloud.png


--------------------------------------------------------------------------------
/py/black_list:
--------------------------------------------------------------------------------
1 | 四川虹美智能科技有限公司
2 | 软件与服务中心
3 | 
4 | 
5 | update _202003 set career='机器学习' where title like '%机器学习%' or title like '%深度学习%'  or title like '%推荐系统%'   or title like '%推荐算法%' or title like '%图像识别%'
6 | or title like '%人工智能%' or   title like '%nlp%'  or title like '%自然语言%' or  title like '%aml%' or title like '%AI%' or title like '%数据科学家%'
7 | or title like '%data scientist%' or title like '%知识图谱%'
8 | or zhinengleibie in ('机器学习工程师','深度学习工程师')


--------------------------------------------------------------------------------
/py/clean.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | """
 3 | Created on Wed Apr  1 23:52:57 2020
 4 | 
 5 | @author: eric
 6 | """
 7 | 
 8 | import config
 9 | 
# Print the DELETE statements that purge unwanted postings from every
# monthly table listed in config.table_list. The statements are only
# printed here; nothing is executed against the database.
#
# NOTE(review): values are interpolated straight into the SQL text. They come
# from config and the literals below, so a value containing a single quote
# would produce broken SQL.

# SQL tuple literal of blacklisted company names (same for every table).
company_titles = "('" + "','".join(config.company_blacklist) + "')"

# Postings whose title ends with one of these suffixes are removed.
title_end_blacklist = ['审核']

# Individual postings removed by job id. Previously only the last id was
# deleted even though all three were listed.
ids = ['105141736', '89941978', '107192348']

# 1) Postings published by blacklisted companies.
for table in config.table_list:
    sql = f'delete from {table} where company_title in {company_titles}'
    print(sql)

# 2) Postings whose title contains a blacklisted keyword.
for table in config.table_list:
    for key in config.title_key_blacklist:
        sql = f"delete from {table} where title like '%{key}%'"
        print(sql)

# 3) Postings whose title ends with a blacklisted suffix.
for table in config.table_list:
    for suffix in title_end_blacklist:
        sql = f"delete from {table} where title like '%{suffix}'"
        print(sql)

# 4) Postings singled out by job id.
for table in config.table_list:
    for job_id in ids:
        sql = f"delete from {table} where job_id='{job_id}'"
        print(sql)


--------------------------------------------------------------------------------
/py/common.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | """
 3 | Created on Wed Apr  3 11:44:37 2019
 4 | 
 5 | @author: eric
 6 | """
 7 | import inspect
 8 | 
 9 | def is_letter_english(letter):
10 |     return ord(letter)<=126
11 | 
12 | def is_article_english(article):
13 |     english_letters=sum(list(map(is_letter_english,list(article))))
14 |     length=len(article)
15 |     percentage=100*english_letters/length
16 |     return percentage>80
17 | 
def get_featurenames(o):
    """List the member names of ``o`` that are treated as features.

    Members are enumerated with ``inspect.getmembers``; names starting with
    ``__``, ``get_`` or ``check_`` are left out.
    """
    skipped_prefixes = ("__", "get_", "check_")
    return [
        name
        for name, _value in inspect.getmembers(o)
        if not name.startswith(skipped_prefixes)
    ]
23 | 
def object2list(job):
    """Return the member values of ``job`` as a list.

    Members are enumerated with ``inspect.getmembers``; those whose name
    starts with ``__``, ``get_`` or ``check_`` are skipped.
    """
    return [
        value
        for key, value in inspect.getmembers(job)
        if not key.startswith(('__', 'get_', 'check_'))
    ]
32 |         
def object2dict(o):
    """Return a ``{name: value}`` dict of the members of ``o``.

    Only names starting with ``__`` are dropped; ``get_*`` and ``check_*``
    members are kept.
    """
    return {
        key: value
        for key, value in inspect.getmembers(o)
        if not key.startswith('__')
    }


--------------------------------------------------------------------------------
/py/company.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | """
  3 | Created on Mon Apr  6 10:26:35 2020
  4 | 
  5 | @author: eric
  6 | """
  7 | 
  8 | from requests import get
  9 | from bs4 import BeautifulSoup
 10 | 
 11 | class Company():
 12 | 
 13 |     company_title=""
 14 |     
 15 |     company_description=""
 16 |     #外资(欧美)
 17 |     #外资(非欧美)
 18 |     #合资
 19 |     #国企
 20 |     #民营公司
 21 |     #外企代表处
 22 |     #政府机关
 23 |     #事业单位
 24 |     #非营利组织
 25 |     #上市公司
 26 |     #创业公司
 27 |     company_type=''
 28 | 
 29 |     #少于50人
 30 |     #50-150人
 31 |     #150-500人
 32 |     #500-1000人
 33 |     #1000-5000人
 34 |     #5000-10000人
 35 |     #10000人以上
 36 |     company_size=''
 37 | 
 38 |     #计算机/互联网/通信/电子
 39 |     #会计/金融/银行/保险
 40 |     #贸易/消费/制造/营运
 41 |     #制药/医疗
 42 |     #广告/媒体
 43 |     #房地产/建筑
 44 |     #专业服务/教育/培训
 45 |     #服务业
 46 |     #物流/运输
 47 |     #能源/原材料
 48 |     #政府/非营利组织/其他
 49 |     industry=''
 50 | 
 51 | def get_company_tags(company_link):
 52 |     response=get(company_link)
 53 |     response.encoding='gbk'
 54 |     soup=BeautifulSoup(response.text, 'html.parser')
 55 |     ltype_tag=soup.select_one('.ltype')
 56 |     if not ltype_tag:
 57 |         return []
 58 |     info_string=ltype_tag.text
 59 |     return [info.strip() for info in info_string.split('|')]
 60 | 
 61 | def check_company_size(self):
 62 |     return not self.company_size==''
 63 | 
 64 | def get_company_size(self, tag):
 65 |     if (tag=='少于50人'):
 66 |         self.company_size='50-'
 67 |     elif (tag=='50-150人'):
 68 |         self.company_size='50-150'
 69 |     elif (tag=='150-500人'):
 70 |         self.company_size='150-500'
 71 |     elif (tag=='500-1000人'):
 72 |         self.company_size='500-1000'
 73 |     elif (tag=='1000-5000人'):
 74 |         self.company_size='1000-5000'
 75 |     elif (tag=='5000-10000人'):
 76 |         self.company_size='5000-10000'
 77 |     elif (tag=='10000人以上'):
 78 |         self.company_size='10000+'
 79 |     return self
 80 |     
 81 | def get_company_type(self, tag):
 82 |     if tag in ['外资（欧美）','外资（非欧美）','合资','国企','民营公司','外企代表处','政府机关','事业单位','非营利组织','上市公司''创业公司']:
 83 |         self.company_type=tag
 84 |     return self
 85 | 
 86 | def check_company_type(self):
 87 |     return not self.company_type==''
 88 | 
    # NOTE(review): everything below is indented under check_company_type and
    # follows its return statement, so it can never execute. It also reads
    # `soup`, `job` and `company_title_tag`, none of which are defined in the
    # visible part of this file; it looks like a fragment pasted from a
    # job-page parser. Confirm before relying on it.

    # Company description: the text of the <div> that follows the parent of
    # <span class="bname">公司信息</span>, with non-breaking spaces replaced.
    company_info_tag=soup.find('span',text='公司信息')
    if company_info_tag:
        job.company_description=company_info_tag.parent.find_next('div').text.replace('\xa0',' ').strip()

    # Example of the tag list: ['民营公司', '150-500人', '服装/纺织/皮革']
    company_tags=[p.text.strip() for p in soup.select('.com_tag .at')]
    
    # Try the first tag as the company type; if that leaves the type empty,
    # fetch the tags from the company's own page and try each until one fits.
    if len(company_tags)>0:
        job.get_company_type(company_tags[0])
    if job.company_type=='':
        company_link=company_title_tag.attrs['href']
        company_tags=get_company_tags(company_link)
        for tag in company_tags:
            if job.get_company_type(tag).check_company_type():
                break 

    # Give up when no company type could be determined.
    if job.company_type=='':
        return None
    
    # Same approach for the size, starting with the second tag.
    # NOTE(review): company_tags[1] is read without checking that the list
    # has at least two entries.
    job.get_company_size(company_tags[1])
    if not job.check_company_size():
        company_link=company_title_tag.attrs['href']
        company_tags=get_company_tags(company_link)
        for tag in company_tags:
            if job.get_company_size(tag).check_company_size():
                break
            
    # Industry tags, e.g. 计算机/互联网/通信/电子 (computer / internet /
    # telecom / electronics); empty link texts are dropped.
    industry_tags=[p.text.strip() for p in soup.select('.com_tag .at a') if not p.text=='']
    
    
    # No industry links on this page: use the tags from the company page.
    if len(industry_tags)==0:
        company_link=soup.select_one('.com_name').attrs['href']
        industry_tags=get_company_tags(company_link)
    
    # Every tag is offered to get_industry.
    for industry_tag in industry_tags:
        job.get_industry(industry_tag)
127 | 
128 | 
129 | 
130 |     def check_industry(self):
131 |         return not self.industry==''
132 | 
133 |     def get_industry(self, industry_tag):
134 |         if industry_tag in ['计算机软件','计算机硬件','计算机服务(系统、数据服务、维修)','通信/电信/网络设备','通信/电信运营、增值服务','互联网/电子商务','网络游戏','电子技术/半导体/集成电路','仪器仪表/工业自动化']:
135 |             self.industry='computer'
136 |         #会计/金融/银行/保险
137 |         if industry_tag in ['会计/审计','金融/投资/证券','银行','保险','信托/担保/拍卖/典当']:
138 |             self.industry='finance'
139 |         #贸易/消费/制造/营运
140 |         if industry_tag in ['贸易/进出口','批发/零售','快速消费品(食品、饮料、化妆品)','服装/纺织/皮革','家具/家电/玩具/礼品','奢侈品/收藏品/工艺品/珠宝','办公用品及设备','机械/设备/重工','汽车及零配件']:
141 |             self.industry='trade'
142 |         #制药/医疗
143 |         if industry_tag in ['制药/生物工程','医疗/护理/卫生','医疗设备/器械']:
144 |             self.industry='medical'
145 |         #广告/媒体
146 |         if industry_tag in ['广告','公关/市场推广/会展','影视/媒体/艺术/文化传播','文字媒体/出版','印刷/包装/造纸']:
147 |             self.industry='ads'
148 |         #房地产/建筑
149 |         if industry_tag in ['房地产','建筑/建材/工程','家居/室内设计/装潢','物业管理/商业中心']:
150 |             self.industry='realestate'
151 |         #专业服务/教育/培训
152 |         if industry_tag in ['中介服务','专业服务(咨询、人力资源、财会)','外包服务','检测，认证','法律','教育/培训/院校','学术/科研','租赁服务']:
153 |             self.industry='edu'
154 |         #服务业
155 |         if industry_tag in ['餐饮业','酒店/旅游','娱乐/休闲/体育','美容/保健','生活服务']:
156 |             self.industry='service'
157 |         #物流/运输
158 |         if industry_tag in ['交通/运输/物流','航天/航空']:
159 |             self.industry='logistic'
160 |         #能源/原材料
161 |         if industry_tag in ['石油/化工/矿产/地质','采掘业/冶炼','电气/电力/水利','新能源','原材料和加工']:
162 |             self.industry='energy'
163 |         #政府/非营利组织/其他
164 |         if industry_tag in ['政府/公共事业','非营利组织','环保','农/林/牧/渔','多元化业务集团公司']:
165 |             self.industry='gov'
166 |         return self


--------------------------------------------------------------------------------
/py/config.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | """
  3 | Created on Tue Oct  8 00:45:34 2019
  4 | 
  5 | @author: eric
  6 | """
  7 | 
  8 | year=2020
  9 | month=12
 10 | 
 11 | table_list=['_201904','_201905','_201906','_201907','_201908','_201909','_201910','_201911','_201912', '_202001','_202002','_202003','_202004']
 12 | title_key_blacklist=['安全工程师','seo','测试','信息工程师','运维','经理','讲师','教师','老师','负责人','合伙人','计算机技术员','计算机辅助设计','DBA','实施','售前','售后','实习','数据标注员','管培生','2020届大专班','推广员','销售代表']
 13 | company_blacklist=['四川长虹网络科技有限责任公司', '软件与服务中心', '东华医为科技有限公司', '成都迈思信息技术有限公司', '广州国盛网络科技有限公司', '深圳市达铭丰科技有限公司', '北软互联（北京）科技有限公司', '南京瑞玥科技有限公司', '深圳极联信息技术股份有限公司','浙江八方电信有限公司诚聘', '深圳市捷兴电子商务有限公司', '西电济南变压器股份有限公司']
 14 | 
 15 | company_id_blocklist=['co5789883','co6033653']
 16 | 
 17 | zhinengleibies={}
 18 | #后端
 19 | zhinengleibies['0106']='高级软件工程师'
 20 | zhinengleibies['0107']='软件工程师'
 21 | zhinengleibies['0120']='PHP开发工程师'
 22 | zhinengleibies['0121']='Java开发工程师'
 23 | zhinengleibies['0122']='C开发工程师'
 24 | zhinengleibies['0123']='系统分析员'
 25 | zhinengleibies['0124']='Python开发工程师'
 26 | zhinengleibies['0126']='.NET开发工程师'
 27 | 
 28 | zhinengleibies['0128']='区块链开发'
 29 | zhinengleibies['0129']='Hadoop工程师'
 30 | zhinengleibies['0130']='大数据开发工程师'
 31 | zhinengleibies['0131']='爬虫开发工程师'
 32 | zhinengleibies['0132']='脚本开发工程师'
 33 | 
 34 | zhinengleibies['0143']='系统架构设计师'
 35 | zhinengleibies['0151']='Ruby开发工程师'
 36 | zhinengleibies['0152']='Go开发工程师'
 37 | #前端    
 38 | zhinengleibies['7201']='Web前端开发'
 39 | zhinengleibies['7202']='HTML5开发工程师'
 40 | zhinengleibies['7203']='前端开发'    
 41 | #人工智能
 42 | zhinengleibies['7301']='机器学习工程师'
 43 | zhinengleibies['7302']='深度学习工程师'
 44 | zhinengleibies['7303']='图像算法工程师'
 45 | zhinengleibies['7304']='图像处理工程师'
 46 | zhinengleibies['7305']='图像识别工程师'
 47 | zhinengleibies['7306']='语音识别工程师'
 48 | zhinengleibies['7307']='机器视觉工程师'
 49 | zhinengleibies['7308']='自然语言处理（NLP）'
 50 | zhinengleibies['7309']='算法工程师'
 51 | zhinengleibies['7310']='推荐算法工程师'
 52 | zhinengleibies['7311']='搜索算法工程师'
 53 | zhinengleibies['7312']='人工智能'
 54 | 
 55 | #设计
 56 | zhinengleibies['7405']='网站架构设计师'
 57 | #数据
 58 | zhinengleibies['7501']='数据分析师'
 59 | zhinengleibies['7502']='数据分析经理主管'
 60 | zhinengleibies['7503']='ETL开发工程师'
 61 | zhinengleibies['7504']='BI工程师'
 62 | zhinengleibies['7505']='数据仓库工程师'
 63 | zhinengleibies['7506']='数据采集工程师'
 64 | zhinengleibies['7507']='数据建模工程师'
 65 | zhinengleibies['7508']='数据治理工程师'
 66 | zhinengleibies['7509']='数据'
 67 | #移动开发
 68 | zhinengleibies['7701']='Android开发工程师'
 69 | zhinengleibies['7702']='iOS开发工程师'
 70 | zhinengleibies['7703']='移动开发工程师'
 71 | zhinengleibies['7704']='移动开发工程师'
 72 | zhinengleibies['7705']='小程序开发工程师'
 73 | #游戏
 74 | zhinengleibies['7809']='游戏开发工程师'
 75 | zhinengleibies['7810']='Cocos2d-x开发工程师'
 76 | zhinengleibies['7811']='Unity3d开发工程师'
 77 | zhinengleibies['7812']='游戏客户端开发工程师'
 78 | zhinengleibies['7813']='游戏服务端开发工程师'
 79 | #嵌入式
 80 | zhinengleibies['2910']='嵌入式软件开发'
 81 | 
 82 | 
 83 | 
 84 | provinces = {}
 85 | provinces['北京'] = '010000'
 86 | provinces['上海'] = '020000'
 87 | provinces['广东'] = '030000'
 88 | provinces['深圳'] = '040000'
 89 | provinces['天津'] = '050000'
 90 | provinces['重庆'] = '060000'
 91 | provinces['江苏'] = '070000'
 92 | provinces['浙江'] = '080000'
 93 | provinces['四川'] = '090000'
 94 | provinces['海南'] = '100000'
 95 | provinces['福建'] = '110000'
 96 | provinces['山东'] = '120000'
 97 | provinces['江西'] = '130000'
 98 | provinces['广西'] = '140000'
 99 | provinces['安徽'] = '150000'
100 | provinces['河北'] = '160000'
101 | provinces['河南'] = '170000'
102 | provinces['湖北'] = '180000'
103 | provinces['湖南'] = '190000'
104 | provinces['陕西'] = '200000'
105 | provinces['山西'] = '210000'
106 | provinces['黑龙江'] = '220000'
107 | provinces['辽宁'] = '230000'
108 | provinces['吉林'] = '240000'
109 | provinces['云南'] = '250000'
110 | provinces['贵州'] = '260000'
111 | provinces['甘肃'] = '270000'
112 | provinces['内蒙古'] = '280000'
113 | provinces['宁夏'] = '290000'
114 | provinces['西藏'] = '300000'
115 | provinces['新疆'] = '310000'
116 | provinces['青海'] = '320000'
117 | 


--------------------------------------------------------------------------------
/py/db.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | """
 3 | Created on Tue Apr  2 22:46:54 2019
 4 | 
 5 | @author: eric
 6 | """
 7 | 
 8 | from sqlalchemy import create_engine
 9 | from urllib.parse import quote_plus
10 | import pandas as pd
11 | 
def get_conn():
    """Open and return a SQLAlchemy connection to the ``it_jobs`` database.

    The ODBC settings (driver, ``localhost`` server, ``it_jobs`` database,
    ``Trusted_Connection=yes``) are hard-coded. They are URL-quoted and handed
    to SQLAlchemy through the ``odbc_connect`` query parameter of an
    ``mssql+pyodbc`` URL. The caller owns the returned connection and is
    responsible for closing it.
    """
    odbc_settings = (
        "DRIVER={SQL Server Native Client 11.0};"
        "SERVER=localhost;"
        "DATABASE=it_jobs;"
        "Trusted_Connection=yes;"
    )
    url = "mssql+pyodbc:///?odbc_connect={}".format(quote_plus(odbc_settings))
    return create_engine(url).connect()
21 |     
def get_data(sqlOrTableName, connection, params=None):
    """Load a table or the result of a query into a pandas DataFrame.

    :param sqlOrTableName: either a bare table name (a single token) or a
        complete SQL statement. A single token is expanded to
        ``select * from <token>``; anything else is executed as given.
    :param connection: connection object accepted by ``pandas.read_sql``.
    :param params: optional query parameters forwarded to ``pandas.read_sql``.
    :return: DataFrame holding the rows returned by the query.

    NOTE: a table name is interpolated into the SQL text without escaping, so
    it must come from trusted code, never from user input.
    """
    # Split on any whitespace: a padded table name ("jobs ") is still a table
    # name, and a query that uses only newlines/tabs as separators is still a
    # query. Splitting on a single space got both of these wrong.
    tokens = sqlOrTableName.split()
    if len(tokens) == 1:  # a lone token is a table name
        sql = "select * from {}".format(tokens[0])
    else:
        sql = sqlOrTableName
    return pd.read_sql(sql, con=connection, params=params)
30 | 
31 | 


--------------------------------------------------------------------------------
/py/multiprocess.py:
--------------------------------------------------------------------------------
 1 | from multiprocessing import Process
 2 | 
 3 | import numpy as np
 4 | import os
 5 | 
 6 | import glob
 7 | 
 8 | from config import year, month, company_blacklist, title_key_blacklist, zhinengleibies
 9 | year_month=f'{year}{month:02}'
10 | 
11 | provinces=['北京','上海','广东','深圳','天津','重庆','江苏','浙江','四川','海南','福建','山东','江西','广西','安徽','河北','河南','湖北','湖南','陕西','山西','黑龙江','辽宁','吉林','云南','贵州','甘肃','内蒙古','宁夏','西藏','新疆','青海']
12 | data_folder = '../../data/51jobs_{}/'.format(year_month)
13 | back_folder = '../../data/51jobs_{}_b/'.format(year_month)
14 | 
15 | d = {}
16 | 
17 | counts = []
18 | for zhinengleibie in list(zhinengleibies.values())[::-1]:
19 |     count=0
20 |     for province in provinces:
21 |         files=glob.glob(f'{data_folder}{zhinengleibie}/{province}/*.*')
22 |         count+=len(files)
23 |     d[zhinengleibie]=count
24 |     counts.append(count)
25 |     #forglob(data_folder+"*")
26 | 
27 | orders = np.argsort(counts)
28 | 
29 | zhineng_splitters=[]
30 | zhineng_splitters=[]
31 | 
32 | n_splitters = 4
33 | 
34 | for splitter_index in range(4):
35 |     zhineng_splitters.append([])
36 | 
37 | for i in range(len(orders)):
38 |     for splitter_index in range(4):
39 |         if i % n_splitters == splitter_index:
40 |             zhineng_splitters[splitter_index].append(znlbs[orders[i]])
41 | 
42 | znlbs=zhinengleibies.values()
43 | znlbs = list(znlbs)
44 | 
45 | 
46 | 


--------------------------------------------------------------------------------
/py/old.download_lagou.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | """
  3 | Created on Sun Mar 31 12:10:20 2019
  4 | 
  5 | @author: eric
  6 | """
  7 | 
  8 | from os import mkdir
  9 | from os import path
 10 | import re
 11 | 
 12 | from urllib.request import urlretrieve
 13 | from requests import get
 14 | from bs4 import BeautifulSoup
 15 | import threading
 16 | from config import year, month
 17 | 
 18 | 
 19 | data_folder = f'../../data/lagou_{year}{month:02}/'
 20 | 
 21 | 
 22 | def main():
 23 |     #city_names=['beijing','shanghai','guangzhou','shenzhen','hangzhou','nanjing','wuhan','chongqing','chengdu','changsha','fuzhou','hefei','ningbo','zhengzhou','tianjin','qingdao','jinan','kuming','shenyang','xian','dongguan','dalian','harbin','changchun']
 24 |     #city_codes=['010000','020000','030200','040000','080200','070200','180200','060000','090200','190200','110200','150200','080300','170200','050000','120300','120200','250200','230200','200200','030800','230300','220200','240200']
 25 |     
 26 |     provinces = {}
 27 |     provinces['北京'] = '010000'
 28 |     provinces['上海'] = '020000'
 29 |     provinces['广东'] = '030000'
 30 |     provinces['深圳'] = '040000'
 31 |     provinces['天津'] = '050000'
 32 |     provinces['重庆'] = '060000'
 33 |     provinces['江苏'] = '070000'
 34 |     provinces['浙江'] = '080000'
 35 |     provinces['四川'] = '090000'
 36 |     provinces['海南'] = '100000'
 37 |     provinces['福建'] = '110000'
 38 |     provinces['山东'] = '120000'
 39 |     provinces['江西'] = '130000'
 40 |     provinces['广西'] = '140000'
 41 |     provinces['安徽'] = '150000'
 42 |     provinces['河北'] = '160000'
 43 |     provinces['河南'] = '170000'
 44 |     provinces['湖北'] = '180000'
 45 |     provinces['湖南'] = '190000'
 46 |     provinces['陕西'] = '200000'
 47 |     provinces['山西'] = '210000'
 48 |     provinces['黑龙江'] = '220000'
 49 |     provinces['辽宁'] = '230000'
 50 |     provinces['吉林'] = '240000'
 51 |     provinces['云南'] = '250000'
 52 |     provinces['贵州'] = '260000'
 53 |     provinces['甘肃'] = '270000'
 54 |     provinces['内蒙古'] = '280000'
 55 |     provinces['宁夏'] = '290000'
 56 |     provinces['西藏'] = '300000'
 57 |     provinces['新疆'] = '310000'
 58 |     provinces['青海'] = '320000'
 59 | 
 60 | 
 61 |     #make sure this folder is created
 62 |     
 63 |     
 64 | 
 65 |     def download_pages(links, folder):
 66 |         for link in links:
 67 |             try:
 68 |                 if not link.startswith("https://jobs.51job.com/"):
 69 |                     continue
 70 |                 filename = path.split(link)[-1]
 71 |                 filename = filename.split('?')[0]
 72 |                 destination_file = path.join(folder, filename)
 73 |                 if not path.isfile(destination_file):
 74 |                     print(link)
 75 |                     t = threading.Thread(target=urlretrieve, args=(link, path.join(folder, filename)))
 76 |                     t.start()
 77 |                     #urlretrieve(link, path.join(folder, filename))
 78 |                     
 79 |             except Exception as e:
 80 |                 print(str(e))
 81 |                 pass
 82 |             
 83 |     #0100是软件，2500是互联网
 84 |     categories={}
 85 |     categories['0106']='高级软件工程师'
 86 |     categories['0107']='软件工程师'
 87 |     categories['0109']='机器学习工程师'
 88 |     categories['0110']='深度学习工程师'
 89 |     categories['0111']='图像算法工程师'
 90 |     categories['0112']='图像处理工程师'
 91 |     categories['0113']='语音识别工程师'
 92 |     categories['0114']='图像识别工程师'
 93 |     categories['0115']='机器视觉工程师'
 94 |     categories['0116']='自然语言处理（NLP）'
 95 |     categories['0148']='算法工程师'
 96 |     categories['0143']='系统架构设计师'
 97 |     categories['2501']='互联网软件开发工程师'
 98 |     categories['2537']='手机应用开发工程师'
 99 |     categories['2512']='网站架构设计师'
100 |     #categories['']=''
101 |     #categories['']=''
102 |     for category_key, category_name in categories.items():
103 |         job_category_folder = path.join(data_folder, category_name)
104 |         if not path.isdir(job_category_folder):
105 |             mkdir(job_category_folder)
106 |         for province_name, province_code in provinces.items():
107 |             #create forlder
108 |             province_folder = path.join(job_category_folder, province_name)
109 |             if not path.isdir(province_folder):
110 |                 mkdir(province_folder)
111 |             #links -
112 |             #first page
113 |             first_page_url = 'https://search.51job.com/list/{0},000000,{1},00,9,99,%2520,2,1.html?lang=c&stype=&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&providesalary=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=&dibiaoid=0&address=&line=&specialarea=00&from=&welfare='.format(province_code, category_key)
114 |             first_page = get(first_page_url)
115 |             first_page.encoding = 'gb2312'
116 |             soup = BeautifulSoup(first_page.text,"html.parser")
117 |             #page_count_string='共328页，到第'
118 |             page_count_string = soup.select_one(".p_in .td").text
119 |             re_result = re.match(r'共(\d+)页，到第',page_count_string)
120 |             total_page = int(re_result.group(1))
121 |             print("{0} has {1} pages".format(province_name, total_page))
122 |             
123 | 
124 | 
125 | 
126 |             for page_index in range(1,total_page):
127 |                 #'https://sou.zhaopin.com/?jl=530&sf=0&st=0&jt=23,160000,045'
128 |                 list_url = 'https://search.51job.com/list/{0},000000,{1},00,9,99,%2B,2,{2}.html?lang=c&stype=1&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=&dibiaoid=0&address=&line=&specialarea=00&from=&welfare='.format(province_code, category_key, page_index)
129 |                 list_page = get(list_url)
130 |                 list_page.encoding = 'gb2312'
131 |                 soup = BeautifulSoup(list_page.text,"html.parser")
132 |                 #get list page
133 |                 links = [tag.attrs['href'] for tag in soup.select(".t1 a")]
134 |                 download_pages(links, province_folder)
135 | 
136 |     
137 | if __name__ == '__main__':
138 |     main()
139 | 
140 | 
141 | 
142 | #beijing page 1 %25=%
143 | #https://search.51job.com/list/010000,000000,0100,00,9,99,%2520,2,1.html?lang=c&stype=&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&providesalary=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=&dibiaoid=0&address=&line=&specialarea=00&from=&welfare=
144 | #shanghai page 1 %25=%
145 | #https://search.51job.com/list/020000,000000,0100,00,9,99,%2520,2,1.html?lang=c&stype=&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&providesalary=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=&dibiaoid=0&address=&line=&specialarea=00&from=&welfare=
146 | #guangzhou page 1
147 | #https://search.51job.com/list/030200,000000,0100,00,9,99,%2520,2,1.html?lang=c&stype=&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&providesalary=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=&dibiaoid=0&address=&line=&specialarea=00&from=&welfare=
148 | #shenzhen page 1
149 | #https://search.51job.com/list/040000,000000,0100,00,9,99,%2520,2,1.html?lang=c&stype=&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&providesalary=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=&dibiaoid=0&address=&line=&specialarea=00&from=&welfare=
150 | #hangzhou page 1
151 | #https://search.51job.com/list/080200,000000,0100,00,9,99,%2520,2,1.html?lang=c&stype=&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&providesalary=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=&dibiaoid=0&address=&line=&specialarea=00&from=&welfare=
152 | 
153 | #beijing page 2 %2B=+
154 | #https://search.51job.com/list/010000,000000,0100,00,9,99,%2B,2,2.html?lang=c&stype=1&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=&dibiaoid=0&address=&line=&specialarea=00&from=&welfare=
155 | #https://search.51job.com/list/010000,000000,0100,00,9,99,%2B,2,3.html?lang=c&stype=1&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=&dibiaoid=0&address=&line=&specialarea=00&from=&welfare=
156 | 
#beijing page 1
158 | #https://search.51job.com/list/010000,000000,0100,00,9,99,%2B,2,1.html?lang=c&stype=1&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=&dibiaoid=0&address=&line=&specialarea=00&from=&welfare=
159 | 
160 | 
161 | 


--------------------------------------------------------------------------------
/py/stats.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | """
 3 | Created on Sat Oct  5 23:49:42 2019
 4 | 
 5 | @author: eric
 6 | """
 7 | import numpy as np
 8 | import pandas as pd
 9 | import weighted
10 | import db
11 | from config import year, month
12 | 
13 | 
conn=db.get_conn()

# Reporting period as YYYYMM, shared by the statements below.
period = f"{year}{month:02}"

# City statistics: SQL "in (...)" list of the cities included in the report.
cities = """
('北京','上海','深圳','杭州','广州','南京','苏州','成都','东莞','西安','武汉','天津','长沙',
'宁波','福州','大连','重庆','青岛','济南','合肥','长春','昆明','郑州','沈阳','哈尔滨','厦门')
"""

# Headcount-weighted average salary per city for the period.
sql=f"""select SUM(monthly_salary * headcount)/SUM(headcount) as salary, MAX(city) as city
 from jobs where year_month={period} and monthly_salary>0 and monthly_salary<80000 and city in {cities}
 group by city
"""

result=conn.execute(sql).fetchall()

# Drop the rows already stored for this period before inserting fresh ones.
conn.execute(f"delete from City_Stats where year_month='{period}'")

# One insert statement per city, sent to the server as a single batch.
sql_insert = "".join(
    "insert into City_Stats(year_month, City, Salary) "
    f" values('{period}', '{city}', {salary});\n"
    for salary, city in result
)

conn.execute(sql_insert)
39 | 
40 | 
41 | 
42 | #MonthlyStats
def get_summary(data, career):
    """Print and return headcount-weighted salary statistics for ``data``.

    :param data: object exposing ``monthly_salary`` and ``headcount`` columns
        (their ``.values`` arrays are used).
    :param career: career name inserted into the printed sentence.
    :return: tuple ``(total headcount, integer weighted mean salary,
        weighted median salary)``.
    """
    weights = data.headcount.values
    wages = data.monthly_salary.values
    head_count = np.sum(weights)
    salary_average = int(np.average(wages, weights=weights))
    # 2.5%, 50% and 97.5% weighted quantiles: the median plus the 95% range.
    low, median, high = weighted.weighted_quantile(wages, [0.025, 0.5, 0.975], weights)
    print(f"{year}年{month}月全国招收{career}{head_count}人。{year}年{month}月全国{career}平均工资{salary_average:.0f}元，工资中位数{median:.0f}元，其中95%的人的工资介于{low:.0f}元到{high:.0f}元。\r\n")
    return head_count, salary_average, median
52 |     
# Nation-wide statistics over all postings of the period with a plausible salary.
data=pd.read_sql(
    sql=f"select * from jobs where year_month= {year}{month:02} and monthly_salary>0 and monthly_salary<80000",
    con=conn,
)
headcount, mean, median=get_summary(data, '程序员')

# Replace any row already stored for this period.
conn.execute(f"delete from general_Stats where year_month='{year}{month:02}'")
sql=(
    "insert into general_Stats(year_month, Salary_Mean, Salary_Median, JD_Count, Head_Count)"
    f" values('{year}{month:02}',{mean},{median},{data.shape[0]},{headcount})"
)
conn.execute(sql)

conn.close()


--------------------------------------------------------------------------------
/py/weighted.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | 
 3 | def weighted_mean(values, weights):
 4 |     return np.sum(values * weights) / weights.sum()
 5 | 
 6 | def weighted_median(values, weights):
 7 |     sorter = np.argsort(values)
 8 |     values = values[sorter]
 9 |     weights = weights[sorter]
10 |     weight_sum = np.sum(weights)
11 |     weight_cum=0
12 |     index=0
13 |     for i in range(len(weights)):
14 |         weight_cum+=weights[i]
15 |         if weight_cum>=weight_sum/2:
16 |             index = i
17 |             break
18 |     
19 |     return values[index]
20 | 
21 | #https://stackoverflow.com/questions/21844024/weighted-percentile-using-numpy
22 | 
def weighted_quantile(values, quantiles, sample_weight=None, 
                      values_sorted=False, old_style=False):
    """Compute weighted quantiles by linear interpolation.

    Similar to numpy.percentile, with support for per-sample weights.
    NOTE: quantiles must lie in [0, 1].
    :param values: array-like with the data
    :param quantiles: array-like with the requested quantiles
    :param sample_weight: array-like of the same length as `values`;
        when None every sample gets weight 1
    :param values_sorted: bool, if True `values` is taken as already sorted
        and the sorting step is skipped
    :param old_style: if True, rescale the quantile positions so the output
        is consistent with numpy.percentile
    :return: numpy.array with the computed quantiles.
    """
    data = np.array(values)
    probs = np.array(quantiles)
    weights = np.ones(len(data)) if sample_weight is None else sample_weight
    weights = np.array(weights)
    assert np.all(probs >= 0) and np.all(probs <= 1), \
        'quantiles should be in [0, 1]'

    if not values_sorted:
        order = np.argsort(data)
        data, weights = data[order], weights[order]

    # Position of each sample on the cumulative-weight axis, taken at the
    # midpoint of the sample's own weight.
    positions = np.cumsum(weights) - 0.5 * weights
    if old_style:
        # Stretch the positions to span exactly [0, 1], like numpy.percentile.
        positions = positions - positions[0]
        positions = positions / positions[-1]
    else:
        positions = positions / np.sum(weights)
    return np.interp(probs, positions, data)


--------------------------------------------------------------------------------
/reports/201912/config.py:
--------------------------------------------------------------------------------
# Reporting period covered by the notebooks in this folder.
year, month = 2019, 12


--------------------------------------------------------------------------------
/reports/201912/map_wrapper.py:
--------------------------------------------------------------------------------
  1 | import math
  2 | import numpy as np
  3 | import pandas as pd
  4 | from config import *
  5 | import matplotlib.pyplot as plt
  6 | from mpl_toolkits.basemap import Basemap
  7 | 
  8 | def draw_city_map(data_city,headcount_scale, title):
  9 | 
 10 | 
 11 |     
 12 |     data_location = pd.read_csv('../city_locations.csv')
 13 |     data_location=data_location.set_index('city')    
 14 |     
 15 |     #cities = []
 16 |     scale = 5
 17 | 
 18 |     locations = [(116.407526, 39.90403),(120, 30)]
 19 |     #fig=plt.figure(figsize=(18, 16), dpi= 80, facecolor='w', edgecolor='k')
 20 |     plt.rcParams['figure.figsize'] = [13, 13]
 21 |     #plt.figure(figsize = (10,5))
 22 |     fig, ax = plt.subplots()
 23 |     fig.title=title
 24 |     fig.figsize=(10,5)
 25 |     fig.dpi=80
 26 |     cn_map= Basemap(llcrnrlon=77, llcrnrlat=14, urcrnrlon=140, urcrnrlat=51, \
 27 |                projection='lcc', lat_1=33, lat_2=45, lon_0=100) # ‘lcc'将投影方式设置为兰伯特投影
 28 | 
 29 |     # load the shapefile, use the name 'states'
 30 |     cn_map.readshapefile(r'D:/data/basemap/gadm36_CHN_shp/gadm36_CHN_1', name='china', drawbounds=True, color='gray')
 31 |     cn_map.readshapefile(r'D:/data/basemap/gadm36_TWN_shp/gadm36_TWN_1', name='taiwan', drawbounds=True, color='gray')
 32 |     #geolocator = Nominatim(user_agent="my-application")
 33 | 
 34 | 
 35 | 
 36 |     salary_min=data_city['平均工资'].min()
 37 |     salary_max=data_city['平均工资'].max()
 38 |     salary_middle = (salary_min+salary_max)/2
 39 |     salary_scale=salary_max-salary_min
 40 | 
 41 |     for index, row in data_city.iterrows():
 42 |         city=row[0]
 43 |         
 44 |         longitude = data_location.loc[city,'longitude']
 45 |         latitude = data_location.loc[city,'latitude']
 46 |         salary=row[1]
 47 |         headcount=row[5]
 48 |         #color
 49 |         color_red=0
 50 |         color_green=0
 51 |         color_blue=0
 52 |         if salary>salary_middle:
 53 |             color_red = 255 #int((salary - salary_middle) / (salary_scale/2)*255)
 54 |             color_green = int((salary_max - salary) / (salary_scale/2)*255)
 55 |         else:
 56 |             color_blue = int((salary_middle - salary) / (salary_scale/2)*255)
 57 |             color_green = int((salary - salary_min) / (salary_scale/2)*255)
 58 |             color_red = int((salary - salary_min) / (salary_scale/2)*255)
 59 | 
 60 |         color = '#{:02x}{:02x}{:02x}'.format(color_red,color_green,color_blue)
 61 | 
 62 | 
 63 |         x, y = cn_map(longitude,latitude)
 64 |         cn_map.plot(x,y,marker='o',color=color,markersize=int(math.sqrt(headcount/headcount_scale))+8)
 65 |         #ax.annotate(city, (x,y), xytext=(5, 5), textcoords='offset points', fontsize=15)
 66 |         #"{}{:.0f}".format(city_cn, salary)
 67 |         #ax.text(x+5, y+5,city , fontweight='bold', fontsize=int(headcount/2500+12))
 68 |         fontsize=int(math.sqrt(headcount/headcount_scale))+13
 69 |         if city == '杭州':
 70 |             x=x-400000
 71 |             y=y+10000
 72 |         elif city=='广州':
 73 |             x=x-400000
 74 |             y=y+10000
 75 |         elif city=='合肥':
 76 |             x=x-300000
 77 |             y=y+10000
 78 |         elif city=='深圳':
 79 |             y=y-100000
 80 |         elif city=='南京':
 81 |             x=x-100000
 82 |         elif city=='天津':
 83 |             y=y-50000
 84 |         elif city=='上海':
 85 |             x=x+50000
 86 |         elif city=='武汉':
 87 |             y=y-50000
 88 |         elif city=='厦门':
 89 |             pass
 90 |         elif city=='福州':
 91 |             pass
 92 |         elif city=='苏州':
 93 |             y=y-100000
 94 |             pass
 95 |         elif city=='宁波':
 96 |             y=y-100000
 97 |             pass
 98 | 
 99 |         ax.text(x, y, "{}{:.0f}k".format(city, np.round(salary/1000)), fontweight='bold', fontsize=fontsize, bbox={'facecolor':color, 'alpha':0.3, 'pad':0})
100 |     ax.text(1100000, 1077845, "https://github.com/juwikuang/china_job_survey".format(city, np.round(salary/1000)), fontweight='bold',color='#999999', fontsize=20, bbox={'facecolor':'#eeeeee', 'alpha':0.4, 'pad':0})    
101 |     ax.text(205805, 107845, "https://github.com/juwikuang/china_job_survey".format(city, np.round(salary/1000)), fontweight='bold',color='#999999', fontsize=20, bbox={'facecolor':'#eeeeee', 'alpha':0.4, 'pad':0})    
102 |     ax.text(805805, 4007845, title.format(city, np.round(salary/1000)), fontweight='bold',color='#111111', fontsize=25)    
103 |     ax.text(805805, 3807845, "（城市大小代表招聘数量，颜色代表工资，红色最高，黄色次之，蓝最少）", fontweight='bold',color='#111111', fontsize=13)    
104 |     #cn_map.drawcoastlines() #绘制海岸线
105 |     #cn_map.drawcountries(linewidth=1.5) #绘制国家边界线
106 |     plt.show()
107 | 
def draw_province_map(data_city,headcount_scale, title):
    """Draw a map of China with one coloured, labelled marker per province.

    :param data_city: DataFrame with one row per province. Rows are read by
        position: column 0 is the province name, column 1 the salary used for
        the colour and the label, column 5 the headcount. The ``salary_mean``
        column supplies the minimum and maximum of the colour scale
        (presumably the same column as position 1 -- confirm against caller).
    :param headcount_scale: divisor applied to the headcount before taking the
        square root that sizes the marker and the label font.
    :param title: text drawn as the map title.

    Marker coordinates are read from
    '../geo_data/provincial_capital_locations.csv' (columns ``province``,
    ``longitude``, ``latitude``); the shapefile locations are hard-coded.
    The figure is shown with ``plt.show()``; nothing is returned.
    """
    # Manual label offsets, added to the projected x/y of the marker, for
    # provinces whose labels would otherwise collide. Unlisted ones get (0, 0).
    label_offsets = {
        '浙江': (0, -100000),
        '安徽': (-300000, 10000),
        '江苏': (-150000, 0),
        '天津': (0, -50000),
        '上海': (50000, 0),
        '湖北': (0, -50000),
    }

    data_location = pd.read_csv('../geo_data/provincial_capital_locations.csv', encoding='utf-8')
    data_location = data_location.set_index('province')

    plt.rcParams['figure.figsize'] = [13, 13]
    fig, ax = plt.subplots()
    fig.dpi = 80
    cn_map = Basemap(llcrnrlon=77, llcrnrlat=14, urcrnrlon=140, urcrnrlat=51,
                     projection='lcc', lat_1=33, lat_2=45, lon_0=100)  # 'lcc' selects the Lambert conformal projection

    # Province outlines for mainland China and Taiwan.
    cn_map.readshapefile(r'D:/data/basemap/gadm36_CHN_shp/gadm36_CHN_1', name='china', drawbounds=True, color='gray')
    cn_map.readshapefile(r'D:/data/basemap/gadm36_TWN_shp/gadm36_TWN_1', name='taiwan', drawbounds=True, color='gray')

    salary_min = data_city.salary_mean.min()
    salary_max = data_city.salary_mean.max()
    salary_middle = (salary_min + salary_max) / 2
    half_range = (salary_max - salary_min) / 2

    def salary_color(salary):
        """Map a salary to a hex colour: blue (lowest) -> yellow -> red (highest)."""
        if half_range == 0:
            # All salaries are equal: use the mid-scale colour rather than
            # dividing by zero.
            return '#ffff00'
        if salary > salary_middle:
            red = 255
            green = int((salary_max - salary) / half_range * 255)
            blue = 0
        else:
            blue = int((salary_middle - salary) / half_range * 255)
            green = int((salary - salary_min) / half_range * 255)
            red = green
        return '#{:02x}{:02x}{:02x}'.format(red, green, blue)

    last_label_args = None
    for _, row in data_city.iterrows():
        # Positional access through .iloc; row[i] on a labelled Series is deprecated.
        province = row.iloc[0]
        salary = row.iloc[1]
        headcount = row.iloc[5]

        longitude = data_location.loc[province, 'longitude']
        latitude = data_location.loc[province, 'latitude']
        color = salary_color(salary)
        size = int(math.sqrt(headcount / headcount_scale))

        x, y = cn_map(longitude, latitude)
        cn_map.plot(x, y, marker='o', color=color, markersize=size + 8)

        dx, dy = label_offsets.get(province, (0, 0))
        ax.text(x + dx, y + dy, "{}{:.0f}k".format(province, np.round(salary/1000)), fontweight='bold', fontsize=size + 13, bbox={'facecolor':color, 'alpha':0.3, 'pad':0})
        last_label_args = (province, np.round(salary/1000))

    # Watermarks.
    ax.text(2053805, 1077845, "https://github.com/juwikuang/china_job_survey", fontweight='bold',color='#999999', fontsize=20, bbox={'facecolor':'#eeeeee', 'alpha':0.4, 'pad':0})
    ax.text(205805, 107845, "https://github.com/juwikuang/china_job_survey", fontweight='bold',color='#999999', fontsize=20, bbox={'facecolor':'#eeeeee', 'alpha':0.4, 'pad':0})
    # The title used to be passed through str.format with the last province and
    # its rounded salary. That is kept for titles containing placeholders, but
    # an empty data_city no longer fails on the unbound loop variables.
    title_text = title.format(*last_label_args) if last_label_args is not None else title
    ax.text(805805, 4007845, title_text, fontweight='bold',color='#111111', fontsize=25)
    ax.text(805805, 3807845, "（城市大小代表招聘数量，颜色代表工资，红色最高，黄色次之，蓝最少）", fontweight='bold',color='#111111', fontsize=13)
    plt.show()
191 |     
192 |     
def describe(data_city, career):
    """Print a one-paragraph salary summary for every row of ``data_city``.

    :param data_city: DataFrame read by position: column 0 is the place name,
        1 the mean salary, 2 and 4 the lower and upper bounds of the 95%
        range, 3 the median salary, 5 the headcount.
    :param career: career name inserted into each sentence.

    ``year`` and ``month`` come from the ``config`` module.
    """
    for _, row in data_city.iterrows():
        # Positional access through .iloc; row[i] on a labelled Series is deprecated.
        name, mean, low, median, high, headcount = (row.iloc[i] for i in range(6))
        # Both sentences use the configured year; the second one used to
        # hard-code 2019.
        print(f"{year}年{month}月{name}招收{career}{headcount}人。{year}年{month}月{name}{career}平均工资{mean:.0f}元，工资中位数{median:.0f}元，其中95%的人的工资介于{low:.0f}元到{high:.0f}元。\r\n")
197 | 


--------------------------------------------------------------------------------
/reports/202001/config.py:
--------------------------------------------------------------------------------
1 | import os
2 | 
# The reporting period is taken from the name of the current working
# directory, which is expected to be YYYYMM (e.g. "202001").
pwd=os.getcwd()
# os.path.basename copes with the platform's path separator; splitting on
# '\\' only worked for Windows-style paths.
year_month=os.path.basename(pwd)

year=int(year_month[:4])
month=int(year_month[4:])
8 | 


--------------------------------------------------------------------------------
/reports/202001/map_wrapper.py:
--------------------------------------------------------------------------------
  1 | import math
  2 | import numpy as np
  3 | import pandas as pd
  4 | from config import *
  5 | import matplotlib.pyplot as plt
  6 | from mpl_toolkits.basemap import Basemap
  7 | 
  8 | def draw_city_map(data_city,headcount_scale, title):
  9 | 
 10 | 
 11 |     
 12 |     data_location = pd.read_csv('../city_locations.csv')
 13 |     data_location=data_location.set_index('city')    
 14 |     
 15 |     #cities = []
 16 |     scale = 5
 17 | 
 18 |     locations = [(116.407526, 39.90403),(120, 30)]
 19 |     #fig=plt.figure(figsize=(18, 16), dpi= 80, facecolor='w', edgecolor='k')
 20 |     plt.rcParams['figure.figsize'] = [13, 13]
 21 |     #plt.figure(figsize = (10,5))
 22 |     fig, ax = plt.subplots()
 23 |     fig.title=title
 24 |     fig.figsize=(10,5)
 25 |     fig.dpi=80
 26 |     cn_map= Basemap(llcrnrlon=77, llcrnrlat=14, urcrnrlon=140, urcrnrlat=51, \
 27 |                projection='lcc', lat_1=33, lat_2=45, lon_0=100) # ‘lcc'将投影方式设置为兰伯特投影
 28 | 
 29 |     # load the shapefile, use the name 'states'
 30 |     cn_map.readshapefile(r'D:/data/basemap/gadm36_CHN_shp/gadm36_CHN_1', name='china', drawbounds=True, color='gray')
 31 |     cn_map.readshapefile(r'D:/data/basemap/gadm36_TWN_shp/gadm36_TWN_1', name='taiwan', drawbounds=True, color='gray')
 32 |     #geolocator = Nominatim(user_agent="my-application")
 33 | 
 34 | 
 35 | 
 36 |     salary_min=data_city['平均工资'].min()
 37 |     salary_max=data_city['平均工资'].max()
 38 |     salary_middle = (salary_min+salary_max)/2
 39 |     salary_scale=salary_max-salary_min
 40 | 
 41 |     for index, row in data_city.iterrows():
 42 |         city=row[0]
 43 |         
 44 |         longitude = data_location.loc[city,'longitude']
 45 |         latitude = data_location.loc[city,'latitude']
 46 |         salary=row[1]
 47 |         headcount=row[5]
 48 |         #color
 49 |         color_red=0
 50 |         color_green=0
 51 |         color_blue=0
 52 |         if salary>salary_middle:
 53 |             color_red = 255 #int((salary - salary_middle) / (salary_scale/2)*255)
 54 |             color_green = int((salary_max - salary) / (salary_scale/2)*255)
 55 |         else:
 56 |             color_blue = int((salary_middle - salary) / (salary_scale/2)*255)
 57 |             color_green = int((salary - salary_min) / (salary_scale/2)*255)
 58 |             color_red = int((salary - salary_min) / (salary_scale/2)*255)
 59 | 
 60 |         color = '#{:02x}{:02x}{:02x}'.format(color_red,color_green,color_blue)
 61 | 
 62 | 
 63 |         x, y = cn_map(longitude,latitude)
 64 |         cn_map.plot(x,y,marker='o',color=color,markersize=int(math.sqrt(headcount/headcount_scale))+8)
 65 |         #ax.annotate(city, (x,y), xytext=(5, 5), textcoords='offset points', fontsize=15)
 66 |         #"{}{:.0f}".format(city_cn, salary)
 67 |         #ax.text(x+5, y+5,city , fontweight='bold', fontsize=int(headcount/2500+12))
 68 |         fontsize=int(math.sqrt(headcount/headcount_scale))+13
 69 |         if city == '杭州':
 70 |             x=x-400000
 71 |             y=y+10000
 72 |         elif city=='广州':
 73 |             x=x-400000
 74 |             y=y+10000
 75 |         elif city=='合肥':
 76 |             x=x-300000
 77 |             y=y+10000
 78 |         elif city=='深圳':
 79 |             y=y-100000
 80 |         elif city=='南京':
 81 |             x=x-100000
 82 |         elif city=='天津':
 83 |             y=y-50000
 84 |         elif city=='上海':
 85 |             x=x+50000
 86 |         elif city=='武汉':
 87 |             y=y-50000
 88 |         elif city=='厦门':
 89 |             pass
 90 |         elif city=='福州':
 91 |             pass
 92 |         elif city=='苏州':
 93 |             y=y-100000
 94 |             pass
 95 |         elif city=='宁波':
 96 |             y=y-100000
 97 |             pass
 98 | 
 99 |         ax.text(x, y, "{}{:.0f}k".format(city, np.round(salary/1000)), fontweight='bold', fontsize=fontsize, bbox={'facecolor':color, 'alpha':0.3, 'pad':0})
100 |     ax.text(1100000, 1077845, "https://github.com/juwikuang/china_job_survey".format(city, np.round(salary/1000)), fontweight='bold',color='#999999', fontsize=20, bbox={'facecolor':'#eeeeee', 'alpha':0.4, 'pad':0})    
101 |     ax.text(205805, 107845, "https://github.com/juwikuang/china_job_survey".format(city, np.round(salary/1000)), fontweight='bold',color='#999999', fontsize=20, bbox={'facecolor':'#eeeeee', 'alpha':0.4, 'pad':0})    
102 |     ax.text(805805, 4007845, title.format(city, np.round(salary/1000)), fontweight='bold',color='#111111', fontsize=25)    
103 |     ax.text(805805, 3807845, "（城市大小代表招聘数量，颜色代表工资，红色最高，黄色次之，蓝最少）", fontweight='bold',color='#111111', fontsize=13)    
104 |     #cn_map.drawcoastlines() #绘制海岸线
105 |     #cn_map.drawcountries(linewidth=1.5) #绘制国家边界线
106 |     plt.show()
107 | 
def draw_province_map(data_city,headcount_scale, title, shapefile_dir=r'D:/data/basemap'):
    """Plot per-province hiring volume and salary on a Basemap map of China.

    Every row of ``data_city`` becomes one circular marker, placed at the
    coordinates looked up in ``../geo_data/provincial_capital_locations.csv``,
    plus a text label of the form ``<province><salary in thousands>k``.
    Marker and font size grow with ``sqrt(headcount / headcount_scale)``;
    the colour runs blue (lowest salary) -> yellow (middle) -> red (highest).

    Args:
        data_city: DataFrame read positionally per row: column 0 is the
            province name, column 1 the salary, column 5 the headcount.
            It must also expose a ``salary_mean`` column, whose min/max
            define the colour scale.
        headcount_scale: divisor applied to the headcount before the square
            root that sizes markers and labels.
        title: heading drawn at the top of the map. As before, it is passed
            through ``str.format(province, salary_in_thousands)`` using the
            values of the last row; with no rows it is drawn unformatted.
        shapefile_dir: directory holding the ``gadm36_CHN_shp`` and
            ``gadm36_TWN_shp`` folders. Defaults to the previously
            hard-coded location.

    Raises:
        KeyError: if a province in ``data_city`` is missing from the
            location CSV.
    """
    data_location = pd.read_csv('../geo_data/provincial_capital_locations.csv', encoding='utf-8')
    data_location = data_location.set_index('province')

    # Kept as a global rcParams change: later plots in the same session may
    # rely on the figure size this function has always set.
    plt.rcParams['figure.figsize'] = [13, 13]
    fig, ax = plt.subplots()
    fig.dpi = 80

    # 'lcc' selects the Lambert conformal conic projection.
    cn_map = Basemap(llcrnrlon=77, llcrnrlat=14, urcrnrlon=140, urcrnrlat=51,
                     projection='lcc', lat_1=33, lat_2=45, lon_0=100)

    # Province outlines for mainland China and Taiwan.
    cn_map.readshapefile(shapefile_dir + '/gadm36_CHN_shp/gadm36_CHN_1', name='china', drawbounds=True, color='gray')
    cn_map.readshapefile(shapefile_dir + '/gadm36_TWN_shp/gadm36_TWN_1', name='taiwan', drawbounds=True, color='gray')

    salary_min = data_city.salary_mean.min()
    salary_max = data_city.salary_mean.max()
    salary_middle = (salary_min + salary_max) / 2
    half_range = (salary_max - salary_min) / 2

    def salary_color(salary):
        """Map a salary onto the blue -> yellow -> red hex colour scale."""
        if half_range == 0:
            # Single row or identical salaries: the original divided by zero
            # here. Use the colour the scale assigns to its midpoint.
            return '#ffff00'
        if salary > salary_middle:
            red = 255
            green = int((salary_max - salary) / half_range * 255)
            blue = 0
        else:
            blue = int((salary_middle - salary) / half_range * 255)
            green = int((salary - salary_min) / half_range * 255)
            red = green
        return '#{:02x}{:02x}{:02x}'.format(red, green, blue)

    # Hand-tuned (dx, dy) label shifts, in map projection units, that keep
    # neighbouring labels from overlapping. The marker itself is not moved.
    label_offsets = {
        '浙江': (0, -100000),
        '安徽': (-300000, 10000),
        '江苏': (-150000, 0),
        '天津': (0, -50000),
        '上海': (50000, 0),
        '湖北': (0, -50000),
    }

    last_label_args = None
    for _, row in data_city.iterrows():
        # .iloc keeps the access positional; plain row[0] on a label-indexed
        # Series is deprecated in recent pandas.
        province = row.iloc[0]
        salary = row.iloc[1]
        headcount = row.iloc[5]

        longitude = data_location.loc[province, 'longitude']
        latitude = data_location.loc[province, 'latitude']

        color = salary_color(salary)
        size = int(math.sqrt(headcount / headcount_scale))
        salary_k = np.round(salary / 1000)

        x, y = cn_map(longitude, latitude)
        cn_map.plot(x, y, marker='o', color=color, markersize=size + 8)

        dx, dy = label_offsets.get(province, (0, 0))
        ax.text(x + dx, y + dy, "{}{:.0f}k".format(province, salary_k), fontweight='bold', fontsize=size + 13, bbox={'facecolor':color, 'alpha':0.3, 'pad':0})
        last_label_args = (province, salary_k)

    # Repository watermark, drawn twice.
    ax.text(2053805, 1077845, "https://github.com/juwikuang/china_job_survey", fontweight='bold',color='#999999', fontsize=20, bbox={'facecolor':'#eeeeee', 'alpha':0.4, 'pad':0})
    ax.text(205805, 107845, "https://github.com/juwikuang/china_job_survey", fontweight='bold',color='#999999', fontsize=20, bbox={'facecolor':'#eeeeee', 'alpha':0.4, 'pad':0})

    # The original formatted the title with the loop variables left over from
    # the last row, which raised NameError when data_city was empty.
    heading = title.format(*last_label_args) if last_label_args is not None else title
    ax.text(805805, 4007845, heading, fontweight='bold',color='#111111', fontsize=25)
    ax.text(805805, 3807845, "（城市大小代表招聘数量，颜色代表工资，红色最高，黄色次之，蓝最少）", fontweight='bold',color='#111111', fontsize=13)
    plt.show()
191 |     
192 |     
def describe(data_city, career):
    """Print one Chinese summary sentence pair per row of ``data_city``.

    Rows are read positionally: column 0 is the place name, column 1 the
    mean salary, column 2 and column 4 the lower and upper bounds of the
    range reported as covering 95% of salaries, column 3 the median salary
    and column 5 the headcount.

    ``year`` and ``month`` are module-level names expected to be in scope
    (presumably supplied by the report's config module -- confirm).

    Args:
        data_city: DataFrame with the positional layout described above.
        career: job title inserted into the printed text.
    """
    for _, row in data_city.iterrows():
        # .iloc keeps the access positional; plain row[0] on a label-indexed
        # Series is deprecated in recent pandas.
        place = row.iloc[0]
        mean_salary = row.iloc[1]
        low = row.iloc[2]
        median_salary = row.iloc[3]
        high = row.iloc[4]
        headcount = row.iloc[5]
        # The second sentence used to hard-code "2019年"; it now reports the
        # same year as the first sentence.
        print(f"{year}年{month}月{place}招收{career}{headcount}人。{year}年{month}月{place}{career}平均工资{mean_salary:.0f}元，工资中位数{median_salary:.0f}元，其中95%的人的工资介于{low:.0f}元到{high:.0f}元。\r\n")
197 | 


--------------------------------------------------------------------------------
/reports/202002/config.py:
--------------------------------------------------------------------------------
1 | import os
2 | 
# Each report lives in a directory named YYYYMM (e.g. ``202002``); the report
# year and month are derived from the name of the current working directory.
pwd = os.getcwd()
# os.path.basename splits on the platform's own separator(s). The previous
# split on '\\' only worked on Windows: on POSIX it returned the whole path
# and int() below raised ValueError.
year_month = os.path.basename(pwd)

year = int(year_month[:4])
month = int(year_month[4:])
8 | 


--------------------------------------------------------------------------------
/reports/202002/map_wrapper.py:
--------------------------------------------------------------------------------
  1 | import math
  2 | import numpy as np
  3 | import pandas as pd
  4 | from config import *
  5 | import matplotlib.pyplot as plt
  6 | from mpl_toolkits.basemap import Basemap
  7 | 
  8 | def draw_city_map(data_city,headcount_scale, title):
  9 | 
 10 | 
 11 |     
 12 |     data_location = pd.read_csv('../city_locations.csv')
 13 |     data_location=data_location.set_index('city')    
 14 |     
 15 |     #cities = []
 16 |     scale = 5
 17 | 
 18 |     locations = [(116.407526, 39.90403),(120, 30)]
 19 |     #fig=plt.figure(figsize=(18, 16), dpi= 80, facecolor='w', edgecolor='k')
 20 |     plt.rcParams['figure.figsize'] = [13, 13]
 21 |     #plt.figure(figsize = (10,5))
 22 |     fig, ax = plt.subplots()
 23 |     fig.title=title
 24 |     fig.figsize=(10,5)
 25 |     fig.dpi=80
 26 |     cn_map= Basemap(llcrnrlon=77, llcrnrlat=14, urcrnrlon=140, urcrnrlat=51, \
 27 |                projection='lcc', lat_1=33, lat_2=45, lon_0=100) # ‘lcc'将投影方式设置为兰伯特投影
 28 | 
 29 |     # load the shapefile, use the name 'states'
 30 |     cn_map.readshapefile(r'D:/data/basemap/gadm36_CHN_shp/gadm36_CHN_1', name='china', drawbounds=True, color='gray')
 31 |     cn_map.readshapefile(r'D:/data/basemap/gadm36_TWN_shp/gadm36_TWN_1', name='taiwan', drawbounds=True, color='gray')
 32 |     #geolocator = Nominatim(user_agent="my-application")
 33 | 
 34 | 
 35 | 
 36 |     salary_min=data_city['平均工资'].min()
 37 |     salary_max=data_city['平均工资'].max()
 38 |     salary_middle = (salary_min+salary_max)/2
 39 |     salary_scale=salary_max-salary_min
 40 | 
 41 |     for index, row in data_city.iterrows():
 42 |         city=row[0]
 43 |         
 44 |         longitude = data_location.loc[city,'longitude']
 45 |         latitude = data_location.loc[city,'latitude']
 46 |         salary=row[1]
 47 |         headcount=row[5]
 48 |         #color
 49 |         color_red=0
 50 |         color_green=0
 51 |         color_blue=0
 52 |         if salary>salary_middle:
 53 |             color_red = 255 #int((salary - salary_middle) / (salary_scale/2)*255)
 54 |             color_green = int((salary_max - salary) / (salary_scale/2)*255)
 55 |         else:
 56 |             color_blue = int((salary_middle - salary) / (salary_scale/2)*255)
 57 |             color_green = int((salary - salary_min) / (salary_scale/2)*255)
 58 |             color_red = int((salary - salary_min) / (salary_scale/2)*255)
 59 | 
 60 |         color = '#{:02x}{:02x}{:02x}'.format(color_red,color_green,color_blue)
 61 | 
 62 | 
 63 |         x, y = cn_map(longitude,latitude)
 64 |         cn_map.plot(x,y,marker='o',color=color,markersize=int(math.sqrt(headcount/headcount_scale))+8)
 65 |         #ax.annotate(city, (x,y), xytext=(5, 5), textcoords='offset points', fontsize=15)
 66 |         #"{}{:.0f}".format(city_cn, salary)
 67 |         #ax.text(x+5, y+5,city , fontweight='bold', fontsize=int(headcount/2500+12))
 68 |         fontsize=int(math.sqrt(headcount/headcount_scale))+13
 69 |         if city == '杭州':
 70 |             x=x-400000
 71 |             y=y+10000
 72 |         elif city=='广州':
 73 |             x=x-400000
 74 |             y=y+10000
 75 |         elif city=='合肥':
 76 |             x=x-300000
 77 |             y=y+10000
 78 |         elif city=='深圳':
 79 |             y=y-100000
 80 |         elif city=='南京':
 81 |             x=x-100000
 82 |         elif city=='天津':
 83 |             y=y-50000
 84 |         elif city=='上海':
 85 |             x=x+50000
 86 |         elif city=='武汉':
 87 |             y=y-50000
 88 |         elif city=='厦门':
 89 |             pass
 90 |         elif city=='福州':
 91 |             pass
 92 |         elif city=='苏州':
 93 |             y=y-100000
 94 |             pass
 95 |         elif city=='宁波':
 96 |             y=y-100000
 97 |             pass
 98 | 
 99 |         ax.text(x, y, "{}{:.0f}k".format(city, np.round(salary/1000)), fontweight='bold', fontsize=fontsize, bbox={'facecolor':color, 'alpha':0.3, 'pad':0})
100 |     ax.text(1100000, 1077845, "https://github.com/juwikuang/china_job_survey".format(city, np.round(salary/1000)), fontweight='bold',color='#999999', fontsize=20, bbox={'facecolor':'#eeeeee', 'alpha':0.4, 'pad':0})    
101 |     ax.text(205805, 107845, "https://github.com/juwikuang/china_job_survey".format(city, np.round(salary/1000)), fontweight='bold',color='#999999', fontsize=20, bbox={'facecolor':'#eeeeee', 'alpha':0.4, 'pad':0})    
102 |     ax.text(805805, 4007845, title.format(city, np.round(salary/1000)), fontweight='bold',color='#111111', fontsize=25)    
103 |     ax.text(805805, 3807845, "（城市大小代表招聘数量，颜色代表工资，红色最高，黄色次之，蓝最少）", fontweight='bold',color='#111111', fontsize=13)    
104 |     #cn_map.drawcoastlines() #绘制海岸线
105 |     #cn_map.drawcountries(linewidth=1.5) #绘制国家边界线
106 |     plt.show()
107 | 
def draw_province_map(data_city,headcount_scale, title, shapefile_dir=r'D:/data/basemap'):
    """Plot per-province hiring volume and salary on a Basemap map of China.

    Every row of ``data_city`` becomes one circular marker, placed at the
    coordinates looked up in ``../geo_data/provincial_capital_locations.csv``,
    plus a text label of the form ``<province><salary in thousands>k``.
    Marker and font size grow with ``sqrt(headcount / headcount_scale)``;
    the colour runs blue (lowest salary) -> yellow (middle) -> red (highest).

    Args:
        data_city: DataFrame read positionally per row: column 0 is the
            province name, column 1 the salary, column 5 the headcount.
            It must also expose a ``salary_mean`` column, whose min/max
            define the colour scale.
        headcount_scale: divisor applied to the headcount before the square
            root that sizes markers and labels.
        title: heading drawn at the top of the map. As before, it is passed
            through ``str.format(province, salary_in_thousands)`` using the
            values of the last row; with no rows it is drawn unformatted.
        shapefile_dir: directory holding the ``gadm36_CHN_shp`` and
            ``gadm36_TWN_shp`` folders. Defaults to the previously
            hard-coded location.

    Raises:
        KeyError: if a province in ``data_city`` is missing from the
            location CSV.
    """
    data_location = pd.read_csv('../geo_data/provincial_capital_locations.csv', encoding='utf-8')
    data_location = data_location.set_index('province')

    # Kept as a global rcParams change: later plots in the same session may
    # rely on the figure size this function has always set.
    plt.rcParams['figure.figsize'] = [13, 13]
    fig, ax = plt.subplots()
    fig.dpi = 80

    # 'lcc' selects the Lambert conformal conic projection.
    cn_map = Basemap(llcrnrlon=77, llcrnrlat=14, urcrnrlon=140, urcrnrlat=51,
                     projection='lcc', lat_1=33, lat_2=45, lon_0=100)

    # Province outlines for mainland China and Taiwan.
    cn_map.readshapefile(shapefile_dir + '/gadm36_CHN_shp/gadm36_CHN_1', name='china', drawbounds=True, color='gray')
    cn_map.readshapefile(shapefile_dir + '/gadm36_TWN_shp/gadm36_TWN_1', name='taiwan', drawbounds=True, color='gray')

    salary_min = data_city.salary_mean.min()
    salary_max = data_city.salary_mean.max()
    salary_middle = (salary_min + salary_max) / 2
    half_range = (salary_max - salary_min) / 2

    def salary_color(salary):
        """Map a salary onto the blue -> yellow -> red hex colour scale."""
        if half_range == 0:
            # Single row or identical salaries: the original divided by zero
            # here. Use the colour the scale assigns to its midpoint.
            return '#ffff00'
        if salary > salary_middle:
            red = 255
            green = int((salary_max - salary) / half_range * 255)
            blue = 0
        else:
            blue = int((salary_middle - salary) / half_range * 255)
            green = int((salary - salary_min) / half_range * 255)
            red = green
        return '#{:02x}{:02x}{:02x}'.format(red, green, blue)

    # Hand-tuned (dx, dy) label shifts, in map projection units, that keep
    # neighbouring labels from overlapping. The marker itself is not moved.
    label_offsets = {
        '浙江': (0, -100000),
        '安徽': (-300000, 10000),
        '江苏': (-150000, 0),
        '天津': (0, -50000),
        '上海': (50000, 0),
        '湖北': (0, -50000),
    }

    last_label_args = None
    for _, row in data_city.iterrows():
        # .iloc keeps the access positional; plain row[0] on a label-indexed
        # Series is deprecated in recent pandas.
        province = row.iloc[0]
        salary = row.iloc[1]
        headcount = row.iloc[5]

        longitude = data_location.loc[province, 'longitude']
        latitude = data_location.loc[province, 'latitude']

        color = salary_color(salary)
        size = int(math.sqrt(headcount / headcount_scale))
        salary_k = np.round(salary / 1000)

        x, y = cn_map(longitude, latitude)
        cn_map.plot(x, y, marker='o', color=color, markersize=size + 8)

        dx, dy = label_offsets.get(province, (0, 0))
        ax.text(x + dx, y + dy, "{}{:.0f}k".format(province, salary_k), fontweight='bold', fontsize=size + 13, bbox={'facecolor':color, 'alpha':0.3, 'pad':0})
        last_label_args = (province, salary_k)

    # Repository watermark, drawn twice.
    ax.text(2053805, 1077845, "https://github.com/juwikuang/china_job_survey", fontweight='bold',color='#999999', fontsize=20, bbox={'facecolor':'#eeeeee', 'alpha':0.4, 'pad':0})
    ax.text(205805, 107845, "https://github.com/juwikuang/china_job_survey", fontweight='bold',color='#999999', fontsize=20, bbox={'facecolor':'#eeeeee', 'alpha':0.4, 'pad':0})

    # The original formatted the title with the loop variables left over from
    # the last row, which raised NameError when data_city was empty.
    heading = title.format(*last_label_args) if last_label_args is not None else title
    ax.text(805805, 4007845, heading, fontweight='bold',color='#111111', fontsize=25)
    ax.text(805805, 3807845, "（城市大小代表招聘数量，颜色代表工资，红色最高，黄色次之，蓝最少）", fontweight='bold',color='#111111', fontsize=13)
    plt.show()
191 |     
192 |     
def describe(data_city, career):
    """Print one Chinese summary sentence pair per row of ``data_city``.

    Rows are read positionally: column 0 is the place name, column 1 the
    mean salary, column 2 and column 4 the lower and upper bounds of the
    range reported as covering 95% of salaries, column 3 the median salary
    and column 5 the headcount.

    ``year`` and ``month`` are module-level names expected to be in scope
    (presumably supplied by the report's config module -- confirm).

    Args:
        data_city: DataFrame with the positional layout described above.
        career: job title inserted into the printed text.
    """
    for _, row in data_city.iterrows():
        # .iloc keeps the access positional; plain row[0] on a label-indexed
        # Series is deprecated in recent pandas.
        place = row.iloc[0]
        mean_salary = row.iloc[1]
        low = row.iloc[2]
        median_salary = row.iloc[3]
        high = row.iloc[4]
        headcount = row.iloc[5]
        # The second sentence used to hard-code "2019年"; it now reports the
        # same year as the first sentence.
        print(f"{year}年{month}月{place}招收{career}{headcount}人。{year}年{month}月{place}{career}平均工资{mean_salary:.0f}元，工资中位数{median_salary:.0f}元，其中95%的人的工资介于{low:.0f}元到{high:.0f}元。\r\n")
197 | 


--------------------------------------------------------------------------------
/reports/202003/config.py:
--------------------------------------------------------------------------------
1 | import os
2 | 
# Each report lives in a directory named YYYYMM (e.g. ``202003``); the report
# year and month are derived from the name of the current working directory.
pwd = os.getcwd()
# os.path.basename splits on the platform's own separator(s). The previous
# split on '\\' only worked on Windows: on POSIX it returned the whole path
# and int() below raised ValueError.
year_month = os.path.basename(pwd)

year = int(year_month[:4])
month = int(year_month[4:])
8 | 


--------------------------------------------------------------------------------
/reports/202003/map_wrapper.py:
--------------------------------------------------------------------------------
  1 | import math
  2 | import numpy as np
  3 | import pandas as pd
  4 | from config import *
  5 | import matplotlib.pyplot as plt
  6 | from mpl_toolkits.basemap import Basemap
  7 | 
  8 | def draw_city_map(data_city,headcount_scale, title):
  9 | 
 10 | 
 11 |     
 12 |     data_location = pd.read_csv('../city_locations.csv')
 13 |     data_location=data_location.set_index('city')    
 14 |     
 15 |     #cities = []
 16 |     scale = 5
 17 | 
 18 |     locations = [(116.407526, 39.90403),(120, 30)]
 19 |     #fig=plt.figure(figsize=(18, 16), dpi= 80, facecolor='w', edgecolor='k')
 20 |     plt.rcParams['figure.figsize'] = [13, 13]
 21 |     #plt.figure(figsize = (10,5))
 22 |     fig, ax = plt.subplots()
 23 |     fig.title=title
 24 |     fig.figsize=(10,5)
 25 |     fig.dpi=80
 26 |     cn_map= Basemap(llcrnrlon=77, llcrnrlat=14, urcrnrlon=140, urcrnrlat=51, \
 27 |                projection='lcc', lat_1=33, lat_2=45, lon_0=100) # ‘lcc'将投影方式设置为兰伯特投影
 28 | 
 29 |     # load the shapefile, use the name 'states'
 30 |     cn_map.readshapefile(r'D:/data/basemap/gadm36_CHN_shp/gadm36_CHN_1', name='china', drawbounds=True, color='gray')
 31 |     cn_map.readshapefile(r'D:/data/basemap/gadm36_TWN_shp/gadm36_TWN_1', name='taiwan', drawbounds=True, color='gray')
 32 |     #geolocator = Nominatim(user_agent="my-application")
 33 | 
 34 | 
 35 | 
 36 |     salary_min=data_city['平均工资'].min()
 37 |     salary_max=data_city['平均工资'].max()
 38 |     salary_middle = (salary_min+salary_max)/2
 39 |     salary_scale=salary_max-salary_min
 40 | 
 41 |     for index, row in data_city.iterrows():
 42 |         city=row[0]
 43 |         
 44 |         longitude = data_location.loc[city,'longitude']
 45 |         latitude = data_location.loc[city,'latitude']
 46 |         salary=row[1]
 47 |         headcount=row[5]
 48 |         #color
 49 |         color_red=0
 50 |         color_green=0
 51 |         color_blue=0
 52 |         if salary>salary_middle:
 53 |             color_red = 255 #int((salary - salary_middle) / (salary_scale/2)*255)
 54 |             color_green = int((salary_max - salary) / (salary_scale/2)*255)
 55 |         else:
 56 |             color_blue = int((salary_middle - salary) / (salary_scale/2)*255)
 57 |             color_green = int((salary - salary_min) / (salary_scale/2)*255)
 58 |             color_red = int((salary - salary_min) / (salary_scale/2)*255)
 59 | 
 60 |         color = '#{:02x}{:02x}{:02x}'.format(color_red,color_green,color_blue)
 61 | 
 62 | 
 63 |         x, y = cn_map(longitude,latitude)
 64 |         cn_map.plot(x,y,marker='o',color=color,markersize=int(math.sqrt(headcount/headcount_scale))+8)
 65 |         #ax.annotate(city, (x,y), xytext=(5, 5), textcoords='offset points', fontsize=15)
 66 |         #"{}{:.0f}".format(city_cn, salary)
 67 |         #ax.text(x+5, y+5,city , fontweight='bold', fontsize=int(headcount/2500+12))
 68 |         fontsize=int(math.sqrt(headcount/headcount_scale))+13
 69 |         if city == '杭州':
 70 |             x=x-400000
 71 |             y=y+10000
 72 |         elif city=='广州':
 73 |             x=x-400000
 74 |             y=y+10000
 75 |         elif city=='合肥':
 76 |             x=x-300000
 77 |             y=y+10000
 78 |         elif city=='深圳':
 79 |             y=y-100000
 80 |         elif city=='南京':
 81 |             x=x-100000
 82 |         elif city=='天津':
 83 |             y=y-50000
 84 |         elif city=='上海':
 85 |             x=x+50000
 86 |         elif city=='武汉':
 87 |             y=y-50000
 88 |         elif city=='厦门':
 89 |             pass
 90 |         elif city=='福州':
 91 |             pass
 92 |         elif city=='苏州':
 93 |             y=y-100000
 94 |             pass
 95 |         elif city=='宁波':
 96 |             y=y-100000
 97 |             pass
 98 | 
 99 |         ax.text(x, y, "{}{:.0f}k".format(city, np.round(salary/1000)), fontweight='bold', fontsize=fontsize, bbox={'facecolor':color, 'alpha':0.3, 'pad':0})
100 |     ax.text(1100000, 1077845, "https://github.com/juwikuang/china_job_survey".format(city, np.round(salary/1000)), fontweight='bold',color='#999999', fontsize=20, bbox={'facecolor':'#eeeeee', 'alpha':0.4, 'pad':0})    
101 |     ax.text(205805, 107845, "https://github.com/juwikuang/china_job_survey".format(city, np.round(salary/1000)), fontweight='bold',color='#999999', fontsize=20, bbox={'facecolor':'#eeeeee', 'alpha':0.4, 'pad':0})    
102 |     ax.text(805805, 4007845, title.format(city, np.round(salary/1000)), fontweight='bold',color='#111111', fontsize=25)    
103 |     ax.text(805805, 3807845, "（城市大小代表招聘数量，颜色代表工资，红色最高，黄色次之，蓝最少）", fontweight='bold',color='#111111', fontsize=13)    
104 |     #cn_map.drawcoastlines() #绘制海岸线
105 |     #cn_map.drawcountries(linewidth=1.5) #绘制国家边界线
106 |     plt.show()
107 | 
def draw_province_map(data_city,headcount_scale, title, shapefile_dir=r'D:/data/basemap'):
    """Plot per-province hiring volume and salary on a Basemap map of China.

    Every row of ``data_city`` becomes one circular marker, placed at the
    coordinates looked up in ``../geo_data/provincial_capital_locations.csv``,
    plus a text label of the form ``<province><salary in thousands>k``.
    Marker and font size grow with ``sqrt(headcount / headcount_scale)``;
    the colour runs blue (lowest salary) -> yellow (middle) -> red (highest).

    Args:
        data_city: DataFrame read positionally per row: column 0 is the
            province name, column 1 the salary, column 5 the headcount.
            It must also expose a ``salary_mean`` column, whose min/max
            define the colour scale.
        headcount_scale: divisor applied to the headcount before the square
            root that sizes markers and labels.
        title: heading drawn at the top of the map. As before, it is passed
            through ``str.format(province, salary_in_thousands)`` using the
            values of the last row; with no rows it is drawn unformatted.
        shapefile_dir: directory holding the ``gadm36_CHN_shp`` and
            ``gadm36_TWN_shp`` folders. Defaults to the previously
            hard-coded location.

    Raises:
        KeyError: if a province in ``data_city`` is missing from the
            location CSV.
    """
    data_location = pd.read_csv('../geo_data/provincial_capital_locations.csv', encoding='utf-8')
    data_location = data_location.set_index('province')

    # Kept as a global rcParams change: later plots in the same session may
    # rely on the figure size this function has always set.
    plt.rcParams['figure.figsize'] = [13, 13]
    fig, ax = plt.subplots()
    fig.dpi = 80

    # 'lcc' selects the Lambert conformal conic projection.
    cn_map = Basemap(llcrnrlon=77, llcrnrlat=14, urcrnrlon=140, urcrnrlat=51,
                     projection='lcc', lat_1=33, lat_2=45, lon_0=100)

    # Province outlines for mainland China and Taiwan.
    cn_map.readshapefile(shapefile_dir + '/gadm36_CHN_shp/gadm36_CHN_1', name='china', drawbounds=True, color='gray')
    cn_map.readshapefile(shapefile_dir + '/gadm36_TWN_shp/gadm36_TWN_1', name='taiwan', drawbounds=True, color='gray')

    salary_min = data_city.salary_mean.min()
    salary_max = data_city.salary_mean.max()
    salary_middle = (salary_min + salary_max) / 2
    half_range = (salary_max - salary_min) / 2

    def salary_color(salary):
        """Map a salary onto the blue -> yellow -> red hex colour scale."""
        if half_range == 0:
            # Single row or identical salaries: the original divided by zero
            # here. Use the colour the scale assigns to its midpoint.
            return '#ffff00'
        if salary > salary_middle:
            red = 255
            green = int((salary_max - salary) / half_range * 255)
            blue = 0
        else:
            blue = int((salary_middle - salary) / half_range * 255)
            green = int((salary - salary_min) / half_range * 255)
            red = green
        return '#{:02x}{:02x}{:02x}'.format(red, green, blue)

    # Hand-tuned (dx, dy) label shifts, in map projection units, that keep
    # neighbouring labels from overlapping. The marker itself is not moved.
    label_offsets = {
        '浙江': (0, -100000),
        '安徽': (-300000, 10000),
        '江苏': (-150000, 0),
        '天津': (0, -50000),
        '上海': (50000, 0),
        '湖北': (0, -50000),
    }

    last_label_args = None
    for _, row in data_city.iterrows():
        # .iloc keeps the access positional; plain row[0] on a label-indexed
        # Series is deprecated in recent pandas.
        province = row.iloc[0]
        salary = row.iloc[1]
        headcount = row.iloc[5]

        longitude = data_location.loc[province, 'longitude']
        latitude = data_location.loc[province, 'latitude']

        color = salary_color(salary)
        size = int(math.sqrt(headcount / headcount_scale))
        salary_k = np.round(salary / 1000)

        x, y = cn_map(longitude, latitude)
        cn_map.plot(x, y, marker='o', color=color, markersize=size + 8)

        dx, dy = label_offsets.get(province, (0, 0))
        ax.text(x + dx, y + dy, "{}{:.0f}k".format(province, salary_k), fontweight='bold', fontsize=size + 13, bbox={'facecolor':color, 'alpha':0.3, 'pad':0})
        last_label_args = (province, salary_k)

    # Repository watermark, drawn twice.
    ax.text(2053805, 1077845, "https://github.com/juwikuang/china_job_survey", fontweight='bold',color='#999999', fontsize=20, bbox={'facecolor':'#eeeeee', 'alpha':0.4, 'pad':0})
    ax.text(205805, 107845, "https://github.com/juwikuang/china_job_survey", fontweight='bold',color='#999999', fontsize=20, bbox={'facecolor':'#eeeeee', 'alpha':0.4, 'pad':0})

    # The original formatted the title with the loop variables left over from
    # the last row, which raised NameError when data_city was empty.
    heading = title.format(*last_label_args) if last_label_args is not None else title
    ax.text(805805, 4007845, heading, fontweight='bold',color='#111111', fontsize=25)
    ax.text(805805, 3807845, "（城市大小代表招聘数量，颜色代表工资，红色最高，黄色次之，蓝最少）", fontweight='bold',color='#111111', fontsize=13)
    plt.show()
191 |     
192 |     
def describe(data_city, career):
    """Print one Chinese summary sentence pair per row of ``data_city``.

    Rows are read positionally: column 0 is the place name, column 1 the
    mean salary, column 2 and column 4 the lower and upper bounds of the
    range reported as covering 95% of salaries, column 3 the median salary
    and column 5 the headcount.

    ``year`` and ``month`` are module-level names expected to be in scope
    (presumably supplied by the report's config module -- confirm).

    Args:
        data_city: DataFrame with the positional layout described above.
        career: job title inserted into the printed text.
    """
    for _, row in data_city.iterrows():
        # .iloc keeps the access positional; plain row[0] on a label-indexed
        # Series is deprecated in recent pandas.
        place = row.iloc[0]
        mean_salary = row.iloc[1]
        low = row.iloc[2]
        median_salary = row.iloc[3]
        high = row.iloc[4]
        headcount = row.iloc[5]
        # The second sentence used to hard-code "2019年"; it now reports the
        # same year as the first sentence.
        print(f"{year}年{month}月{place}招收{career}{headcount}人。{year}年{month}月{place}{career}平均工资{mean_salary:.0f}元，工资中位数{median_salary:.0f}元，其中95%的人的工资介于{low:.0f}元到{high:.0f}元。\r\n")
197 | 


--------------------------------------------------------------------------------
/reports/202004/config.py:
--------------------------------------------------------------------------------
1 | import os
2 | 
# Each report lives in a directory named YYYYMM (e.g. ``202004``); the report
# year and month are derived from the name of the current working directory.
pwd = os.getcwd()
# os.path.basename splits on the platform's own separator(s). The previous
# split on '\\' only worked on Windows: on POSIX it returned the whole path
# and int() below raised ValueError.
year_month = os.path.basename(pwd)

year = int(year_month[:4])
month = int(year_month[4:])
8 | 


--------------------------------------------------------------------------------
/reports/202004/map_wrapper.py:
--------------------------------------------------------------------------------
  1 | import math
  2 | import numpy as np
  3 | import pandas as pd
  4 | from config import *
  5 | import matplotlib.pyplot as plt
  6 | from mpl_toolkits.basemap import Basemap
  7 | 
  8 | def draw_city_map(data_city,headcount_scale, title):
  9 | 
 10 | 
 11 |     
 12 |     data_location = pd.read_csv('../city_locations.csv')
 13 |     data_location=data_location.set_index('city')    
 14 |     
 15 |     #cities = []
 16 |     scale = 5
 17 | 
 18 |     locations = [(116.407526, 39.90403),(120, 30)]
 19 |     #fig=plt.figure(figsize=(18, 16), dpi= 80, facecolor='w', edgecolor='k')
 20 |     plt.rcParams['figure.figsize'] = [13, 13]
 21 |     #plt.figure(figsize = (10,5))
 22 |     fig, ax = plt.subplots()
 23 |     fig.title=title
 24 |     fig.figsize=(10,5)
 25 |     fig.dpi=80
 26 |     cn_map= Basemap(llcrnrlon=77, llcrnrlat=14, urcrnrlon=140, urcrnrlat=51, \
 27 |                projection='lcc', lat_1=33, lat_2=45, lon_0=100) # ‘lcc'将投影方式设置为兰伯特投影
 28 | 
 29 |     # load the shapefile, use the name 'states'
 30 |     cn_map.readshapefile(r'D:/data/basemap/gadm36_CHN_shp/gadm36_CHN_1', name='china', drawbounds=True, color='gray')
 31 |     cn_map.readshapefile(r'D:/data/basemap/gadm36_TWN_shp/gadm36_TWN_1', name='taiwan', drawbounds=True, color='gray')
 32 |     #geolocator = Nominatim(user_agent="my-application")
 33 | 
 34 | 
 35 | 
 36 |     salary_min=data_city['平均工资'].min()
 37 |     salary_max=data_city['平均工资'].max()
 38 |     salary_middle = (salary_min+salary_max)/2
 39 |     salary_scale=salary_max-salary_min
 40 | 
 41 |     for index, row in data_city.iterrows():
 42 |         city=row[0]
 43 |         
 44 |         longitude = data_location.loc[city,'longitude']
 45 |         latitude = data_location.loc[city,'latitude']
 46 |         salary=row[1]
 47 |         headcount=row[5]
 48 |         #color
 49 |         color_red=0
 50 |         color_green=0
 51 |         color_blue=0
 52 |         if salary>salary_middle:
 53 |             color_red = 255 #int((salary - salary_middle) / (salary_scale/2)*255)
 54 |             color_green = int((salary_max - salary) / (salary_scale/2)*255)
 55 |         else:
 56 |             color_blue = int((salary_middle - salary) / (salary_scale/2)*255)
 57 |             color_green = int((salary - salary_min) / (salary_scale/2)*255)
 58 |             color_red = int((salary - salary_min) / (salary_scale/2)*255)
 59 | 
 60 |         color = '#{:02x}{:02x}{:02x}'.format(color_red,color_green,color_blue)
 61 | 
 62 | 
 63 |         x, y = cn_map(longitude,latitude)
 64 |         cn_map.plot(x,y,marker='o',color=color,markersize=int(math.sqrt(headcount/headcount_scale))+8)
 65 |         #ax.annotate(city, (x,y), xytext=(5, 5), textcoords='offset points', fontsize=15)
 66 |         #"{}{:.0f}".format(city_cn, salary)
 67 |         #ax.text(x+5, y+5,city , fontweight='bold', fontsize=int(headcount/2500+12))
 68 |         fontsize=int(math.sqrt(headcount/headcount_scale))+13
 69 |         if city == '杭州':
 70 |             x=x-400000
 71 |             y=y+10000
 72 |         elif city=='广州':
 73 |             x=x-400000
 74 |             y=y+10000
 75 |         elif city=='合肥':
 76 |             x=x-300000
 77 |             y=y+10000
 78 |         elif city=='深圳':
 79 |             y=y-100000
 80 |         elif city=='南京':
 81 |             x=x-100000
 82 |         elif city=='天津':
 83 |             y=y-50000
 84 |         elif city=='上海':
 85 |             x=x+50000
 86 |         elif city=='武汉':
 87 |             y=y-50000
 88 |         elif city=='厦门':
 89 |             pass
 90 |         elif city=='福州':
 91 |             pass
 92 |         elif city=='苏州':
 93 |             y=y-100000
 94 |             pass
 95 |         elif city=='宁波':
 96 |             y=y-100000
 97 |             pass
 98 | 
 99 |         ax.text(x, y, "{}{:.0f}k".format(city, np.round(salary/1000)), fontweight='bold', fontsize=fontsize, bbox={'facecolor':color, 'alpha':0.3, 'pad':0})
100 |     ax.text(1100000, 1077845, "https://github.com/juwikuang/china_job_survey".format(city, np.round(salary/1000)), fontweight='bold',color='#999999', fontsize=20, bbox={'facecolor':'#eeeeee', 'alpha':0.4, 'pad':0})    
101 |     ax.text(205805, 107845, "https://github.com/juwikuang/china_job_survey".format(city, np.round(salary/1000)), fontweight='bold',color='#999999', fontsize=20, bbox={'facecolor':'#eeeeee', 'alpha':0.4, 'pad':0})    
102 |     ax.text(805805, 4007845, title.format(city, np.round(salary/1000)), fontweight='bold',color='#111111', fontsize=25)    
103 |     ax.text(805805, 3807845, "（城市大小代表招聘数量，颜色代表工资，红色最高，黄色次之，蓝最少）", fontweight='bold',color='#111111', fontsize=13)    
104 |     #cn_map.drawcoastlines() #绘制海岸线
105 |     #cn_map.drawcountries(linewidth=1.5) #绘制国家边界线
106 |     plt.show()
107 | 
def draw_province_map(data_city,headcount_scale, title):
    """Draw a salary / hiring-volume bubble map of China, one bubble per province.

    Each row of ``data_city`` becomes a round marker plus a text label of the
    form ``<province><salary/1000>k``, placed at the longitude/latitude found
    for that province in ``../geo_data/provincial_capital_locations.csv``.
    Marker and font size grow with ``sqrt(headcount / headcount_scale)``.
    Colour encodes salary relative to the range of ``salary_mean``: blue at
    the minimum, yellow at the midpoint, red at the maximum.

    Side effects: sets ``plt.rcParams['figure.figsize']`` and calls
    ``plt.show()``.

    Args:
        data_city: DataFrame with a ``salary_mean`` column. Every row is read
            by position: column 0 is the province name, column 1 the salary
            and column 5 the head count (presumably column 1 holds the same
            values as ``salary_mean`` -- confirm against the caller).
        headcount_scale: Divisor applied to the head count before the square
            root is taken for sizing.
        title: Heading drawn above the map. It is passed through
            ``str.format`` with the province and rounded salary (in
            thousands) of the last row, so ``{}`` placeholders are allowed.

    Raises:
        ValueError: If ``data_city`` has no rows.
    """
    if len(data_city) == 0:
        # The heading is formatted with values taken from the last row, so
        # fail early and clearly instead of hitting a NameError after the loop.
        raise ValueError("data_city must contain at least one row")

    data_location = pd.read_csv('../geo_data/provincial_capital_locations.csv', encoding='utf-8')
    data_location=data_location.set_index('province')

    # NOTE: this changes the process-wide default figure size.
    plt.rcParams['figure.figsize'] = [13, 13]
    fig, ax = plt.subplots()
    fig.dpi=80
    # 'lcc' selects the Lambert conformal projection.
    cn_map= Basemap(llcrnrlon=77, llcrnrlat=14, urcrnrlon=140, urcrnrlat=51, \
               projection='lcc', lat_1=33, lat_2=45, lon_0=100)

    # Boundary shapefiles; NOTE(review): these absolute paths are machine-specific.
    cn_map.readshapefile(r'D:/data/basemap/gadm36_CHN_shp/gadm36_CHN_1', name='china', drawbounds=True, color='gray')
    cn_map.readshapefile(r'D:/data/basemap/gadm36_TWN_shp/gadm36_TWN_1', name='taiwan', drawbounds=True, color='gray')

    salary_min=data_city.salary_mean.min()
    salary_max=data_city.salary_mean.max()
    salary_middle = (salary_min+salary_max)/2
    half_range = (salary_max-salary_min)/2

    # Hand-tuned (dx, dy) shifts, in map projection units, for selected labels.
    label_offsets = {
        '浙江': (0, -100000),
        '安徽': (-300000, 10000),
        '江苏': (-150000, 0),
        '天津': (0, -50000),
        '上海': (50000, 0),
        '湖北': (0, -50000),
    }

    for _, row in data_city.iterrows():
        # .iloc keeps these lookups positional; plain integer keys on a
        # label-indexed Series are deprecated in recent pandas.
        province=row.iloc[0]
        salary=row.iloc[1]
        headcount=row.iloc[5]

        longitude = data_location.loc[province,'longitude']
        latitude = data_location.loc[province,'latitude']

        if half_range == 0:
            # All salaries are equal (e.g. a single row): use the midpoint
            # colour instead of dividing by zero and crashing on int(nan).
            color_red, color_green, color_blue = 255, 255, 0
        elif salary>salary_middle:
            # Upper half: yellow -> red as salary approaches the maximum.
            color_red = 255
            color_green = int((salary_max - salary) / half_range*255)
            color_blue = 0
        else:
            # Lower half: blue -> yellow as salary approaches the midpoint.
            color_blue = int((salary_middle - salary) / half_range*255)
            color_green = int((salary - salary_min) / half_range*255)
            color_red = color_green

        color = '#{:02x}{:02x}{:02x}'.format(color_red,color_green,color_blue)

        size = int(math.sqrt(headcount/headcount_scale))
        x, y = cn_map(longitude,latitude)
        cn_map.plot(x,y,marker='o',color=color,markersize=size+8)

        dx, dy = label_offsets.get(province, (0, 0))
        ax.text(x+dx, y+dy, "{}{:.0f}k".format(province, np.round(salary/1000)), fontweight='bold', fontsize=size+13, bbox={'facecolor':color, 'alpha':0.3, 'pad':0})

    # Project URL watermark, drawn twice.
    for wx, wy in ((2053805, 1077845), (205805, 107845)):
        ax.text(wx, wy, "https://github.com/juwikuang/china_job_survey", fontweight='bold',color='#999999', fontsize=20, bbox={'facecolor':'#eeeeee', 'alpha':0.4, 'pad':0})
    # Heading: placeholders (if any) receive the last row's province and salary.
    ax.text(805805, 4007845, title.format(province, np.round(salary/1000)), fontweight='bold',color='#111111', fontsize=25)
    ax.text(805805, 3807845, "（城市大小代表招聘数量，颜色代表工资，红色最高，黄色次之，蓝最少）", fontweight='bold',color='#111111', fontsize=13)
    plt.show()
191 |     
192 |     
def describe(data_city, career):
    """Print a one-paragraph hiring and salary summary for every row.

    The reporting period is taken from the module-level names ``year`` and
    ``month``, which are not defined inside this function. Columns are read
    by position: 0 is the place name, 1 the average salary, 2 and 4 the lower
    and upper bounds of the 95% salary range, 3 the median salary and 5 the
    head count.

    Args:
        data_city: DataFrame with at least six columns in the order above.
        career: Job title inserted into the sentences.
    """
    for _, row in data_city.iterrows():
        # .iloc keeps these lookups positional; plain integer keys on a
        # label-indexed Series are deprecated in recent pandas.
        place, mean, low, median, high, headcount = (row.iloc[i] for i in range(6))
        # Both sentences use the configured year; the second one used to be
        # hard-coded to 2019 regardless of the report period.
        print(f"{year}年{month}月{place}招收{career}{headcount}人。{year}年{month}月{place}{career}平均工资{mean:.0f}元，工资中位数{median:.0f}元，其中95%的人的工资介于{low:.0f}元到{high:.0f}元。\r\n")
197 | 


--------------------------------------------------------------------------------
/reports/202005/config.py:
--------------------------------------------------------------------------------
"""Derive the report period from the name of the current working directory.

The directory name is expected to look like ``YYYYMM`` (this file lives in
``reports/202005``): the first four characters are parsed as the year and the
remainder as the month. ``int()`` raises ValueError for any other name.
"""
import os

pwd=os.getcwd()
# os.path.basename honours the platform's path separator(s); splitting on a
# literal backslash only worked on Windows and returned the whole path elsewhere.
year_month=os.path.basename(pwd)

year=int(year_month[:4])
month=int(year_month[4:])
8 | 


--------------------------------------------------------------------------------
/reports/202005/map_wrapper.py:
--------------------------------------------------------------------------------
  1 | import math
  2 | import numpy as np
  3 | import pandas as pd
  4 | from config import *
  5 | import matplotlib.pyplot as plt
  6 | from mpl_toolkits.basemap import Basemap
  7 | 
  8 | def draw_city_map(data_city,headcount_scale, title):
  9 | 
 10 | 
 11 |     
 12 |     data_location = pd.read_csv('../city_locations.csv')
 13 |     data_location=data_location.set_index('city')    
 14 |     
 15 |     #cities = []
 16 |     scale = 5
 17 | 
 18 |     locations = [(116.407526, 39.90403),(120, 30)]
 19 |     #fig=plt.figure(figsize=(18, 16), dpi= 80, facecolor='w', edgecolor='k')
 20 |     plt.rcParams['figure.figsize'] = [13, 13]
 21 |     #plt.figure(figsize = (10,5))
 22 |     fig, ax = plt.subplots()
 23 |     fig.title=title
 24 |     fig.figsize=(10,5)
 25 |     fig.dpi=80
 26 |     cn_map= Basemap(llcrnrlon=77, llcrnrlat=14, urcrnrlon=140, urcrnrlat=51, \
 27 |                projection='lcc', lat_1=33, lat_2=45, lon_0=100) # ‘lcc'将投影方式设置为兰伯特投影
 28 | 
 29 |     # load the shapefile, use the name 'states'
 30 |     cn_map.readshapefile(r'D:/data/basemap/gadm36_CHN_shp/gadm36_CHN_1', name='china', drawbounds=True, color='gray')
 31 |     cn_map.readshapefile(r'D:/data/basemap/gadm36_TWN_shp/gadm36_TWN_1', name='taiwan', drawbounds=True, color='gray')
 32 |     #geolocator = Nominatim(user_agent="my-application")
 33 | 
 34 | 
 35 | 
 36 |     salary_min=data_city['平均工资'].min()
 37 |     salary_max=data_city['平均工资'].max()
 38 |     salary_middle = (salary_min+salary_max)/2
 39 |     salary_scale=salary_max-salary_min
 40 | 
 41 |     for index, row in data_city.iterrows():
 42 |         city=row['city']
 43 |         
 44 |         longitude = data_location.loc[city,'longitude']
 45 |         latitude = data_location.loc[city,'latitude']
 46 |         salary=row['平均工资']
 47 |         headcount=row['招聘人数']
 48 |         #color
 49 |         color_red=0
 50 |         color_green=0
 51 |         color_blue=0
 52 |         if salary>salary_middle:
 53 |             color_red = 255 #int((salary - salary_middle) / (salary_scale/2)*255)
 54 |             color_green = int((salary_max - salary) / (salary_scale/2)*255)
 55 |         else:
 56 |             color_blue = int((salary_middle - salary) / (salary_scale/2)*255)
 57 |             color_green = int((salary - salary_min) / (salary_scale/2)*255)
 58 |             color_red = int((salary - salary_min) / (salary_scale/2)*255)
 59 | 
 60 |         color = '#{:02x}{:02x}{:02x}'.format(color_red,color_green,color_blue)
 61 | 
 62 | 
 63 |         x, y = cn_map(longitude,latitude)
 64 |         cn_map.plot(x,y,marker='o',color=color,markersize=int(math.sqrt(headcount/headcount_scale))+8)
 65 |         #ax.annotate(city, (x,y), xytext=(5, 5), textcoords='offset points', fontsize=15)
 66 |         #"{}{:.0f}".format(city_cn, salary)
 67 |         #ax.text(x+5, y+5,city , fontweight='bold', fontsize=int(headcount/2500+12))
 68 |         fontsize=int(math.sqrt(headcount/headcount_scale))+13
 69 |         if city == '杭州':
 70 |             x=x-400000
 71 |             y=y+10000
 72 |         elif city=='广州':
 73 |             x=x-400000
 74 |             y=y+10000
 75 |         elif city=='合肥':
 76 |             x=x-300000
 77 |             y=y+10000
 78 |         elif city=='深圳':
 79 |             y=y-100000
 80 |         elif city=='南京':
 81 |             x=x-100000
 82 |         elif city=='天津':
 83 |             y=y-50000
 84 |         elif city=='上海':
 85 |             x=x+50000
 86 |         elif city=='武汉':
 87 |             y=y-50000
 88 |         elif city=='厦门':
 89 |             pass
 90 |         elif city=='福州':
 91 |             pass
 92 |         elif city=='苏州':
 93 |             y=y-100000
 94 |             pass
 95 |         elif city=='宁波':
 96 |             y=y-100000
 97 |             pass
 98 | 
 99 |         ax.text(x, y, "{}{:.0f}k".format(city, np.round(salary/1000)), fontweight='bold', fontsize=fontsize, bbox={'facecolor':color, 'alpha':0.3, 'pad':0})
100 |     ax.text(1100000, 1077845, "https://github.com/juwikuang/china_job_survey".format(city, np.round(salary/1000)), fontweight='bold',color='#999999', fontsize=20, bbox={'facecolor':'#eeeeee', 'alpha':0.4, 'pad':0})    
101 |     ax.text(205805, 107845, "https://github.com/juwikuang/china_job_survey".format(city, np.round(salary/1000)), fontweight='bold',color='#999999', fontsize=20, bbox={'facecolor':'#eeeeee', 'alpha':0.4, 'pad':0})    
102 |     ax.text(805805, 4007845, title.format(city, np.round(salary/1000)), fontweight='bold',color='#111111', fontsize=25)    
103 |     ax.text(805805, 3807845, "（城市大小代表招聘数量，颜色代表工资，红色最高，黄色次之，蓝最少）", fontweight='bold',color='#111111', fontsize=13)    
104 |     #cn_map.drawcoastlines() #绘制海岸线
105 |     #cn_map.drawcountries(linewidth=1.5) #绘制国家边界线
106 |     plt.show()
107 | 
def draw_province_map(data_city,headcount_scale, title):
    """Draw a salary / hiring-volume bubble map of China, one bubble per province.

    Each row of ``data_city`` becomes a round marker plus a text label of the
    form ``<province><salary/1000>k``, placed at the longitude/latitude found
    for that province in ``../geo_data/provincial_capital_locations.csv``.
    Marker and font size grow with ``sqrt(headcount / headcount_scale)``.
    Colour encodes salary relative to the range of ``salary_mean``: blue at
    the minimum, yellow at the midpoint, red at the maximum.

    Side effects: sets ``plt.rcParams['figure.figsize']`` and calls
    ``plt.show()``.

    Args:
        data_city: DataFrame with ``province`` and ``salary_mean`` columns.
            The per-row salary and head count are read by position (columns
            1 and 5); presumably column 1 holds the same values as
            ``salary_mean`` -- confirm against the caller.
        headcount_scale: Divisor applied to the head count before the square
            root is taken for sizing.
        title: Heading drawn above the map. It is passed through
            ``str.format`` with the province and rounded salary (in
            thousands) of the last row, so ``{}`` placeholders are allowed.

    Raises:
        ValueError: If ``data_city`` has no rows.
    """
    if len(data_city) == 0:
        # The heading is formatted with values taken from the last row, so
        # fail early and clearly instead of hitting a NameError after the loop.
        raise ValueError("data_city must contain at least one row")

    data_location = pd.read_csv('../geo_data/provincial_capital_locations.csv', encoding='utf-8')
    data_location=data_location.set_index('province')

    # NOTE: this changes the process-wide default figure size.
    plt.rcParams['figure.figsize'] = [13, 13]
    fig, ax = plt.subplots()
    fig.dpi=80
    # 'lcc' selects the Lambert conformal projection.
    cn_map= Basemap(llcrnrlon=77, llcrnrlat=14, urcrnrlon=140, urcrnrlat=51, \
               projection='lcc', lat_1=33, lat_2=45, lon_0=100)

    # Boundary shapefiles; NOTE(review): these absolute paths are machine-specific.
    cn_map.readshapefile(r'D:/data/basemap/gadm36_CHN_shp/gadm36_CHN_1', name='china', drawbounds=True, color='gray')
    cn_map.readshapefile(r'D:/data/basemap/gadm36_TWN_shp/gadm36_TWN_1', name='taiwan', drawbounds=True, color='gray')

    salary_min=data_city.salary_mean.min()
    salary_max=data_city.salary_mean.max()
    salary_middle = (salary_min+salary_max)/2
    half_range = (salary_max-salary_min)/2

    # Hand-tuned (dx, dy) shifts, in map projection units, for selected labels.
    label_offsets = {
        '浙江': (0, -100000),
        '安徽': (-300000, 10000),
        '江苏': (-150000, 0),
        '天津': (0, -50000),
        '上海': (50000, 0),
        '湖北': (0, -50000),
    }

    for _, row in data_city.iterrows():
        province=row['province']
        # .iloc keeps these lookups positional; plain integer keys on a
        # label-indexed Series are deprecated in recent pandas.
        salary=row.iloc[1]
        headcount=row.iloc[5]

        longitude = data_location.loc[province,'longitude']
        latitude = data_location.loc[province,'latitude']

        if half_range == 0:
            # All salaries are equal (e.g. a single row): use the midpoint
            # colour instead of dividing by zero and crashing on int(nan).
            color_red, color_green, color_blue = 255, 255, 0
        elif salary>salary_middle:
            # Upper half: yellow -> red as salary approaches the maximum.
            color_red = 255
            color_green = int((salary_max - salary) / half_range*255)
            color_blue = 0
        else:
            # Lower half: blue -> yellow as salary approaches the midpoint.
            color_blue = int((salary_middle - salary) / half_range*255)
            color_green = int((salary - salary_min) / half_range*255)
            color_red = color_green

        color = '#{:02x}{:02x}{:02x}'.format(color_red,color_green,color_blue)

        size = int(math.sqrt(headcount/headcount_scale))
        x, y = cn_map(longitude,latitude)
        cn_map.plot(x,y,marker='o',color=color,markersize=size+8)

        dx, dy = label_offsets.get(province, (0, 0))
        ax.text(x+dx, y+dy, "{}{:.0f}k".format(province, np.round(salary/1000)), fontweight='bold', fontsize=size+13, bbox={'facecolor':color, 'alpha':0.3, 'pad':0})

    # Project URL watermark, drawn twice.
    for wx, wy in ((2053805, 1077845), (205805, 107845)):
        ax.text(wx, wy, "https://github.com/juwikuang/china_job_survey", fontweight='bold',color='#999999', fontsize=20, bbox={'facecolor':'#eeeeee', 'alpha':0.4, 'pad':0})
    # Heading: placeholders (if any) receive the last row's province and salary.
    ax.text(805805, 4007845, title.format(province, np.round(salary/1000)), fontweight='bold',color='#111111', fontsize=25)
    ax.text(805805, 3807845, "（城市大小代表招聘数量，颜色代表工资，红色最高，黄色次之，蓝最少）", fontweight='bold',color='#111111', fontsize=13)
    plt.show()
191 |     
192 |     
def describe(data_city, career):
    """Print a one-paragraph hiring and salary summary for every row.

    The reporting period is taken from the module-level names ``year`` and
    ``month`` (this file star-imports ``config``). Columns are read by
    position: 0 is the place name, 1 the average salary, 2 and 4 the lower
    and upper bounds of the 95% salary range, 3 the median salary and 5 the
    head count.

    Args:
        data_city: DataFrame with at least six columns in the order above.
        career: Job title inserted into the sentences.
    """
    for _, row in data_city.iterrows():
        # .iloc keeps these lookups positional; plain integer keys on a
        # label-indexed Series are deprecated in recent pandas.
        place, mean, low, median, high, headcount = (row.iloc[i] for i in range(6))
        # Both sentences use the configured year; the second one used to be
        # hard-coded to 2019 regardless of the report period.
        print(f"{year}年{month}月{place}招收{career}{headcount}人。{year}年{month}月{place}{career}平均工资{mean:.0f}元，工资中位数{median:.0f}元，其中95%的人的工资介于{low:.0f}元到{high:.0f}元。\r\n")
197 | 


--------------------------------------------------------------------------------
/reports/202006/config.py:
--------------------------------------------------------------------------------
"""Derive the report period from the name of the current working directory.

The directory name is expected to look like ``YYYYMM`` (this file lives in
``reports/202006``): the first four characters are parsed as the year and the
remainder as the month. ``int()`` raises ValueError for any other name.
"""
import os

pwd=os.getcwd()
# os.path.basename honours the platform's path separator(s); splitting on a
# literal backslash only worked on Windows and returned the whole path elsewhere.
year_month=os.path.basename(pwd)

year=int(year_month[:4])
month=int(year_month[4:])
8 | 


--------------------------------------------------------------------------------
/reports/202006/map_wrapper.py:
--------------------------------------------------------------------------------
  1 | import math
  2 | import numpy as np
  3 | import pandas as pd
  4 | from config import *
  5 | import matplotlib.pyplot as plt
  6 | from mpl_toolkits.basemap import Basemap
  7 | 
  8 | def draw_city_map(data_city,headcount_scale, title):
  9 | 
 10 | 
 11 |     
 12 |     data_location = pd.read_csv('../city_locations.csv')
 13 |     data_location=data_location.set_index('city')    
 14 |     
 15 |     #cities = []
 16 |     scale = 5
 17 | 
 18 |     locations = [(116.407526, 39.90403),(120, 30)]
 19 |     #fig=plt.figure(figsize=(18, 16), dpi= 80, facecolor='w', edgecolor='k')
 20 |     plt.rcParams['figure.figsize'] = [13, 13]
 21 |     #plt.figure(figsize = (10,5))
 22 |     fig, ax = plt.subplots()
 23 |     fig.title=title
 24 |     fig.figsize=(10,5)
 25 |     fig.dpi=80
 26 |     cn_map= Basemap(llcrnrlon=77, llcrnrlat=14, urcrnrlon=140, urcrnrlat=51, \
 27 |                projection='lcc', lat_1=33, lat_2=45, lon_0=100) # ‘lcc'将投影方式设置为兰伯特投影
 28 | 
 29 |     # load the shapefile, use the name 'states'
 30 |     cn_map.readshapefile(r'D:/data/basemap/gadm36_CHN_shp/gadm36_CHN_1', name='china', drawbounds=True, color='gray')
 31 |     cn_map.readshapefile(r'D:/data/basemap/gadm36_TWN_shp/gadm36_TWN_1', name='taiwan', drawbounds=True, color='gray')
 32 |     #geolocator = Nominatim(user_agent="my-application")
 33 | 
 34 | 
 35 | 
 36 |     salary_min=data_city['平均工资'].min()
 37 |     salary_max=data_city['平均工资'].max()
 38 |     salary_middle = (salary_min+salary_max)/2
 39 |     salary_scale=salary_max-salary_min
 40 | 
 41 |     for index, row in data_city.iterrows():
 42 |         city=row['city']
 43 |         
 44 |         longitude = data_location.loc[city,'longitude']
 45 |         latitude = data_location.loc[city,'latitude']
 46 |         salary=row['平均工资']
 47 |         headcount=row['招聘人数']
 48 |         #color
 49 |         color_red=0
 50 |         color_green=0
 51 |         color_blue=0
 52 |         if salary>salary_middle:
 53 |             color_red = 255 #int((salary - salary_middle) / (salary_scale/2)*255)
 54 |             color_green = int((salary_max - salary) / (salary_scale/2)*255)
 55 |         else:
 56 |             color_blue = int((salary_middle - salary) / (salary_scale/2)*255)
 57 |             color_green = int((salary - salary_min) / (salary_scale/2)*255)
 58 |             color_red = int((salary - salary_min) / (salary_scale/2)*255)
 59 | 
 60 |         color = '#{:02x}{:02x}{:02x}'.format(color_red,color_green,color_blue)
 61 | 
 62 | 
 63 |         x, y = cn_map(longitude,latitude)
 64 |         cn_map.plot(x,y,marker='o',color=color,markersize=int(math.sqrt(headcount/headcount_scale))+8)
 65 |         #ax.annotate(city, (x,y), xytext=(5, 5), textcoords='offset points', fontsize=15)
 66 |         #"{}{:.0f}".format(city_cn, salary)
 67 |         #ax.text(x+5, y+5,city , fontweight='bold', fontsize=int(headcount/2500+12))
 68 |         fontsize=int(math.sqrt(headcount/headcount_scale))+13
 69 |         if city == '杭州':
 70 |             x=x-400000
 71 |             y=y+10000
 72 |         elif city=='广州':
 73 |             x=x-400000
 74 |             y=y+10000
 75 |         elif city=='合肥':
 76 |             x=x-300000
 77 |             y=y+10000
 78 |         elif city=='深圳':
 79 |             y=y-100000
 80 |         elif city=='南京':
 81 |             x=x-100000
 82 |         elif city=='天津':
 83 |             y=y-50000
 84 |         elif city=='上海':
 85 |             x=x+50000
 86 |         elif city=='武汉':
 87 |             y=y-50000
 88 |         elif city=='厦门':
 89 |             pass
 90 |         elif city=='福州':
 91 |             pass
 92 |         elif city=='苏州':
 93 |             y=y-100000
 94 |             pass
 95 |         elif city=='宁波':
 96 |             y=y-100000
 97 |             pass
 98 | 
 99 |         ax.text(x, y, "{}{:.0f}k".format(city, np.round(salary/1000)), fontweight='bold', fontsize=fontsize, bbox={'facecolor':color, 'alpha':0.3, 'pad':0})
100 |     ax.text(1100000, 1077845, "https://github.com/juwikuang/china_job_survey".format(city, np.round(salary/1000)), fontweight='bold',color='#999999', fontsize=20, bbox={'facecolor':'#eeeeee', 'alpha':0.4, 'pad':0})    
101 |     ax.text(205805, 107845, "https://github.com/juwikuang/china_job_survey".format(city, np.round(salary/1000)), fontweight='bold',color='#999999', fontsize=20, bbox={'facecolor':'#eeeeee', 'alpha':0.4, 'pad':0})    
102 |     ax.text(805805, 4007845, title.format(city, np.round(salary/1000)), fontweight='bold',color='#111111', fontsize=25)    
103 |     ax.text(805805, 3807845, "（城市大小代表招聘数量，颜色代表工资，红色最高，黄色次之，蓝最少）", fontweight='bold',color='#111111', fontsize=13)    
104 |     #cn_map.drawcoastlines() #绘制海岸线
105 |     #cn_map.drawcountries(linewidth=1.5) #绘制国家边界线
106 |     plt.show()
107 | 
def draw_province_map(data_city,headcount_scale, title):
    """Draw a salary / hiring-volume bubble map of China, one bubble per province.

    Each row of ``data_city`` becomes a round marker plus a text label of the
    form ``<province><salary/1000>k``, placed at the longitude/latitude found
    for that province in ``../geo_data/provincial_capital_locations.csv``.
    Marker and font size grow with ``sqrt(headcount / headcount_scale)``.
    Colour encodes salary relative to the range of ``salary_mean``: blue at
    the minimum, yellow at the midpoint, red at the maximum.

    Side effects: sets ``plt.rcParams['figure.figsize']`` and calls
    ``plt.show()``.

    Args:
        data_city: DataFrame with ``province`` and ``salary_mean`` columns.
            The per-row salary and head count are read by position (columns
            1 and 5); presumably column 1 holds the same values as
            ``salary_mean`` -- confirm against the caller.
        headcount_scale: Divisor applied to the head count before the square
            root is taken for sizing.
        title: Heading drawn above the map. It is passed through
            ``str.format`` with the province and rounded salary (in
            thousands) of the last row, so ``{}`` placeholders are allowed.

    Raises:
        ValueError: If ``data_city`` has no rows.
    """
    if len(data_city) == 0:
        # The heading is formatted with values taken from the last row, so
        # fail early and clearly instead of hitting a NameError after the loop.
        raise ValueError("data_city must contain at least one row")

    data_location = pd.read_csv('../geo_data/provincial_capital_locations.csv', encoding='utf-8')
    data_location=data_location.set_index('province')

    # NOTE: this changes the process-wide default figure size.
    plt.rcParams['figure.figsize'] = [13, 13]
    fig, ax = plt.subplots()
    fig.dpi=80
    # 'lcc' selects the Lambert conformal projection.
    cn_map= Basemap(llcrnrlon=77, llcrnrlat=14, urcrnrlon=140, urcrnrlat=51, \
               projection='lcc', lat_1=33, lat_2=45, lon_0=100)

    # Boundary shapefiles; NOTE(review): these absolute paths are machine-specific.
    cn_map.readshapefile(r'D:/data/basemap/gadm36_CHN_shp/gadm36_CHN_1', name='china', drawbounds=True, color='gray')
    cn_map.readshapefile(r'D:/data/basemap/gadm36_TWN_shp/gadm36_TWN_1', name='taiwan', drawbounds=True, color='gray')

    salary_min=data_city.salary_mean.min()
    salary_max=data_city.salary_mean.max()
    salary_middle = (salary_min+salary_max)/2
    half_range = (salary_max-salary_min)/2

    # Hand-tuned (dx, dy) shifts, in map projection units, for selected labels.
    label_offsets = {
        '浙江': (0, -100000),
        '安徽': (-300000, 10000),
        '江苏': (-150000, 0),
        '天津': (0, -50000),
        '上海': (50000, 0),
        '湖北': (0, -50000),
    }

    for _, row in data_city.iterrows():
        province=row['province']
        # .iloc keeps these lookups positional; plain integer keys on a
        # label-indexed Series are deprecated in recent pandas.
        salary=row.iloc[1]
        headcount=row.iloc[5]

        longitude = data_location.loc[province,'longitude']
        latitude = data_location.loc[province,'latitude']

        if half_range == 0:
            # All salaries are equal (e.g. a single row): use the midpoint
            # colour instead of dividing by zero and crashing on int(nan).
            color_red, color_green, color_blue = 255, 255, 0
        elif salary>salary_middle:
            # Upper half: yellow -> red as salary approaches the maximum.
            color_red = 255
            color_green = int((salary_max - salary) / half_range*255)
            color_blue = 0
        else:
            # Lower half: blue -> yellow as salary approaches the midpoint.
            color_blue = int((salary_middle - salary) / half_range*255)
            color_green = int((salary - salary_min) / half_range*255)
            color_red = color_green

        color = '#{:02x}{:02x}{:02x}'.format(color_red,color_green,color_blue)

        size = int(math.sqrt(headcount/headcount_scale))
        x, y = cn_map(longitude,latitude)
        cn_map.plot(x,y,marker='o',color=color,markersize=size+8)

        dx, dy = label_offsets.get(province, (0, 0))
        ax.text(x+dx, y+dy, "{}{:.0f}k".format(province, np.round(salary/1000)), fontweight='bold', fontsize=size+13, bbox={'facecolor':color, 'alpha':0.3, 'pad':0})

    # Project URL watermark, drawn twice.
    for wx, wy in ((2053805, 1077845), (205805, 107845)):
        ax.text(wx, wy, "https://github.com/juwikuang/china_job_survey", fontweight='bold',color='#999999', fontsize=20, bbox={'facecolor':'#eeeeee', 'alpha':0.4, 'pad':0})
    # Heading: placeholders (if any) receive the last row's province and salary.
    ax.text(805805, 4007845, title.format(province, np.round(salary/1000)), fontweight='bold',color='#111111', fontsize=25)
    ax.text(805805, 3807845, "（城市大小代表招聘数量，颜色代表工资，红色最高，黄色次之，蓝最少）", fontweight='bold',color='#111111', fontsize=13)
    plt.show()
191 |     
192 |     
def describe(data_city, career):
    """Print a one-paragraph hiring and salary summary for every row.

    The reporting period is taken from the module-level names ``year`` and
    ``month`` (this file star-imports ``config``). Columns are read by
    position: 0 is the place name, 1 the average salary, 2 and 4 the lower
    and upper bounds of the 95% salary range, 3 the median salary and 5 the
    head count.

    Args:
        data_city: DataFrame with at least six columns in the order above.
        career: Job title inserted into the sentences.
    """
    for _, row in data_city.iterrows():
        # .iloc keeps these lookups positional; plain integer keys on a
        # label-indexed Series are deprecated in recent pandas.
        place, mean, low, median, high, headcount = (row.iloc[i] for i in range(6))
        # Both sentences use the configured year; the second one used to be
        # hard-coded to 2019 regardless of the report period.
        print(f"{year}年{month}月{place}招收{career}{headcount}人。{year}年{month}月{place}{career}平均工资{mean:.0f}元，工资中位数{median:.0f}元，其中95%的人的工资介于{low:.0f}元到{high:.0f}元。\r\n")
197 | 


--------------------------------------------------------------------------------
/reports/202007/config.py:
--------------------------------------------------------------------------------
"""Derive the report period from the name of the current working directory.

The directory name is expected to look like ``YYYYMM`` (this file lives in
``reports/202007``): the first four characters are parsed as the year and the
remainder as the month. ``int()`` raises ValueError for any other name.
"""
import os

pwd=os.getcwd()
# os.path.basename honours the platform's path separator(s); splitting on a
# literal backslash only worked on Windows and returned the whole path elsewhere.
year_month=os.path.basename(pwd)

year=int(year_month[:4])
month=int(year_month[4:])
8 | 


--------------------------------------------------------------------------------
/reports/202007/map_wrapper.py:
--------------------------------------------------------------------------------
  1 | import math
  2 | import numpy as np
  3 | import pandas as pd
  4 | from config import *
  5 | import matplotlib.pyplot as plt
  6 | from mpl_toolkits.basemap import Basemap
  7 | 
  8 | def draw_city_map(data_city,headcount_scale, title):
  9 | 
 10 | 
 11 |     
 12 |     data_location = pd.read_csv('../city_locations.csv')
 13 |     data_location=data_location.set_index('city')    
 14 |     
 15 |     #cities = []
 16 |     scale = 5
 17 | 
 18 |     locations = [(116.407526, 39.90403),(120, 30)]
 19 |     #fig=plt.figure(figsize=(18, 16), dpi= 80, facecolor='w', edgecolor='k')
 20 |     plt.rcParams['figure.figsize'] = [13, 13]
 21 |     #plt.figure(figsize = (10,5))
 22 |     fig, ax = plt.subplots()
 23 |     fig.title=title
 24 |     fig.figsize=(10,5)
 25 |     fig.dpi=80
 26 |     cn_map= Basemap(llcrnrlon=77, llcrnrlat=14, urcrnrlon=140, urcrnrlat=51, \
 27 |                projection='lcc', lat_1=33, lat_2=45, lon_0=100) # ‘lcc'将投影方式设置为兰伯特投影
 28 | 
 29 |     # load the shapefile, use the name 'states'
 30 |     cn_map.readshapefile(r'D:/data/basemap/gadm36_CHN_shp/gadm36_CHN_1', name='china', drawbounds=True, color='gray')
 31 |     cn_map.readshapefile(r'D:/data/basemap/gadm36_TWN_shp/gadm36_TWN_1', name='taiwan', drawbounds=True, color='gray')
 32 |     #geolocator = Nominatim(user_agent="my-application")
 33 | 
 34 | 
 35 | 
 36 |     salary_min=data_city['平均工资'].min()
 37 |     salary_max=data_city['平均工资'].max()
 38 |     salary_middle = (salary_min+salary_max)/2
 39 |     salary_scale=salary_max-salary_min
 40 | 
 41 |     for index, row in data_city.iterrows():
 42 |         city=row['city']
 43 |         
 44 |         longitude = data_location.loc[city,'longitude']
 45 |         latitude = data_location.loc[city,'latitude']
 46 |         salary=row['平均工资']
 47 |         headcount=row['招聘人数']
 48 |         #color
 49 |         color_red=0
 50 |         color_green=0
 51 |         color_blue=0
 52 |         if salary>salary_middle:
 53 |             color_red = 255 #int((salary - salary_middle) / (salary_scale/2)*255)
 54 |             color_green = int((salary_max - salary) / (salary_scale/2)*255)
 55 |         else:
 56 |             color_blue = int((salary_middle - salary) / (salary_scale/2)*255)
 57 |             color_green = int((salary - salary_min) / (salary_scale/2)*255)
 58 |             color_red = int((salary - salary_min) / (salary_scale/2)*255)
 59 | 
 60 |         color = '#{:02x}{:02x}{:02x}'.format(color_red,color_green,color_blue)
 61 | 
 62 | 
 63 |         x, y = cn_map(longitude,latitude)
 64 |         cn_map.plot(x,y,marker='o',color=color,markersize=int(math.sqrt(headcount/headcount_scale))+8)
 65 |         #ax.annotate(city, (x,y), xytext=(5, 5), textcoords='offset points', fontsize=15)
 66 |         #"{}{:.0f}".format(city_cn, salary)
 67 |         #ax.text(x+5, y+5,city , fontweight='bold', fontsize=int(headcount/2500+12))
 68 |         fontsize=int(math.sqrt(headcount/headcount_scale))+13
 69 |         if city == '杭州':
 70 |             x=x-400000
 71 |             y=y+10000
 72 |         elif city=='广州':
 73 |             x=x-400000
 74 |             y=y+10000
 75 |         elif city=='合肥':
 76 |             x=x-300000
 77 |             y=y+10000
 78 |         elif city=='深圳':
 79 |             y=y-100000
 80 |         elif city=='南京':
 81 |             x=x-100000
 82 |         elif city=='天津':
 83 |             y=y-50000
 84 |         elif city=='上海':
 85 |             x=x+50000
 86 |         elif city=='武汉':
 87 |             y=y-50000
 88 |         elif city=='厦门':
 89 |             pass
 90 |         elif city=='福州':
 91 |             pass
 92 |         elif city=='苏州':
 93 |             y=y-100000
 94 |             pass
 95 |         elif city=='宁波':
 96 |             y=y-100000
 97 |             pass
 98 | 
 99 |         ax.text(x, y, "{}{:.0f}k".format(city, np.round(salary/1000)), fontweight='bold', fontsize=fontsize, bbox={'facecolor':color, 'alpha':0.3, 'pad':0})
100 |     ax.text(1100000, 1077845, "https://github.com/juwikuang/china_job_survey".format(city, np.round(salary/1000)), fontweight='bold',color='#999999', fontsize=20, bbox={'facecolor':'#eeeeee', 'alpha':0.4, 'pad':0})    
101 |     ax.text(205805, 107845, "https://github.com/juwikuang/china_job_survey".format(city, np.round(salary/1000)), fontweight='bold',color='#999999', fontsize=20, bbox={'facecolor':'#eeeeee', 'alpha':0.4, 'pad':0})    
102 |     ax.text(805805, 4007845, title.format(city, np.round(salary/1000)), fontweight='bold',color='#111111', fontsize=25)    
103 |     ax.text(805805, 3807845, "（城市大小代表招聘数量，颜色代表工资，红色最高，黄色次之，蓝最少）", fontweight='bold',color='#111111', fontsize=13)    
104 |     #cn_map.drawcoastlines() #绘制海岸线
105 |     #cn_map.drawcountries(linewidth=1.5) #绘制国家边界线
106 |     plt.show()
107 | 
def draw_province_map(data_city,headcount_scale, title):
    """Draw a salary / hiring-volume bubble map of China, one bubble per province.

    Each row of ``data_city`` becomes a round marker plus a text label of the
    form ``<province><salary/1000>k``, placed at the longitude/latitude found
    for that province in ``../geo_data/provincial_capital_locations.csv``.
    Marker and font size grow with ``sqrt(headcount / headcount_scale)``.
    Colour encodes salary relative to the range of ``salary_mean``: blue at
    the minimum, yellow at the midpoint, red at the maximum.

    Side effects: sets ``plt.rcParams['figure.figsize']`` and calls
    ``plt.show()``.

    Args:
        data_city: DataFrame with ``province`` and ``salary_mean`` columns.
            The per-row salary and head count are read by position (columns
            1 and 5); presumably column 1 holds the same values as
            ``salary_mean`` -- confirm against the caller.
        headcount_scale: Divisor applied to the head count before the square
            root is taken for sizing.
        title: Heading drawn above the map. It is passed through
            ``str.format`` with the province and rounded salary (in
            thousands) of the last row, so ``{}`` placeholders are allowed.

    Raises:
        ValueError: If ``data_city`` has no rows.
    """
    if len(data_city) == 0:
        # The heading is formatted with values taken from the last row, so
        # fail early and clearly instead of hitting a NameError after the loop.
        raise ValueError("data_city must contain at least one row")

    data_location = pd.read_csv('../geo_data/provincial_capital_locations.csv', encoding='utf-8')
    data_location=data_location.set_index('province')

    # NOTE: this changes the process-wide default figure size.
    plt.rcParams['figure.figsize'] = [13, 13]
    fig, ax = plt.subplots()
    fig.dpi=80
    # 'lcc' selects the Lambert conformal projection.
    cn_map= Basemap(llcrnrlon=77, llcrnrlat=14, urcrnrlon=140, urcrnrlat=51, \
               projection='lcc', lat_1=33, lat_2=45, lon_0=100)

    # Boundary shapefiles; NOTE(review): these absolute paths are machine-specific.
    cn_map.readshapefile(r'D:/data/basemap/gadm36_CHN_shp/gadm36_CHN_1', name='china', drawbounds=True, color='gray')
    cn_map.readshapefile(r'D:/data/basemap/gadm36_TWN_shp/gadm36_TWN_1', name='taiwan', drawbounds=True, color='gray')

    salary_min=data_city.salary_mean.min()
    salary_max=data_city.salary_mean.max()
    salary_middle = (salary_min+salary_max)/2
    half_range = (salary_max-salary_min)/2

    # Hand-tuned (dx, dy) shifts, in map projection units, for selected labels.
    label_offsets = {
        '浙江': (0, -100000),
        '安徽': (-300000, 10000),
        '江苏': (-150000, 0),
        '天津': (0, -50000),
        '上海': (50000, 0),
        '湖北': (0, -50000),
    }

    for _, row in data_city.iterrows():
        province=row['province']
        # .iloc keeps these lookups positional; plain integer keys on a
        # label-indexed Series are deprecated in recent pandas.
        salary=row.iloc[1]
        headcount=row.iloc[5]

        longitude = data_location.loc[province,'longitude']
        latitude = data_location.loc[province,'latitude']

        if half_range == 0:
            # All salaries are equal (e.g. a single row): use the midpoint
            # colour instead of dividing by zero and crashing on int(nan).
            color_red, color_green, color_blue = 255, 255, 0
        elif salary>salary_middle:
            # Upper half: yellow -> red as salary approaches the maximum.
            color_red = 255
            color_green = int((salary_max - salary) / half_range*255)
            color_blue = 0
        else:
            # Lower half: blue -> yellow as salary approaches the midpoint.
            color_blue = int((salary_middle - salary) / half_range*255)
            color_green = int((salary - salary_min) / half_range*255)
            color_red = color_green

        color = '#{:02x}{:02x}{:02x}'.format(color_red,color_green,color_blue)

        size = int(math.sqrt(headcount/headcount_scale))
        x, y = cn_map(longitude,latitude)
        cn_map.plot(x,y,marker='o',color=color,markersize=size+8)

        dx, dy = label_offsets.get(province, (0, 0))
        ax.text(x+dx, y+dy, "{}{:.0f}k".format(province, np.round(salary/1000)), fontweight='bold', fontsize=size+13, bbox={'facecolor':color, 'alpha':0.3, 'pad':0})

    # Project URL watermark, drawn twice.
    for wx, wy in ((2053805, 1077845), (205805, 107845)):
        ax.text(wx, wy, "https://github.com/juwikuang/china_job_survey", fontweight='bold',color='#999999', fontsize=20, bbox={'facecolor':'#eeeeee', 'alpha':0.4, 'pad':0})
    # Heading: placeholders (if any) receive the last row's province and salary.
    ax.text(805805, 4007845, title.format(province, np.round(salary/1000)), fontweight='bold',color='#111111', fontsize=25)
    ax.text(805805, 3807845, "（城市大小代表招聘数量，颜色代表工资，红色最高，黄色次之，蓝最少）", fontweight='bold',color='#111111', fontsize=13)
    plt.show()
191 |     
192 |     
def describe(data_city, career, report_year=None, report_month=None):
    """Print a one-sentence hiring and salary summary for every row of ``data_city``.

    Args:
        data_city: DataFrame read by position: column 0 is the place name,
            1 the average salary, 2 and 4 the bounds quoted as the 95% salary
            range, 3 the median salary and 5 the headcount.
        career: Job title inserted into each sentence.
        report_year: Year to print. Defaults to the module-level ``year``
            (brought in by ``from config import *``).
        report_month: Month to print. Defaults to the module-level ``month``.
    """
    if report_year is None:
        report_year = year
    if report_month is None:
        report_month = month

    for _, row in data_city.iterrows():
        # Explicit positional access: plain ``row[0]`` on a label-indexed
        # Series is deprecated in recent pandas releases.
        place, mean, low, median, high, headcount = (row.iloc[i] for i in range(6))
        # The second clause used to hard-code "2019" regardless of the report
        # period; both clauses now use the same year.
        print(f"{report_year}年{report_month}月{place}招收{career}{headcount}人。{report_year}年{report_month}月{place}{career}平均工资{mean:.0f}元，工资中位数{median:.0f}元，其中95%的人的工资介于{low:.0f}元到{high:.0f}元。\r\n")
197 | 


--------------------------------------------------------------------------------
/reports/202008/config.py:
--------------------------------------------------------------------------------
1 | import os
2 | 
# The report period is taken from the name of the working directory, which is
# expected to look like YYYYMM (the slicing below reads 4 digits of year and
# the rest as month).
pwd = os.getcwd()
# os.path.basename honours the platform's path separators; splitting on '\\'
# only isolated the folder name on Windows.
year_month = os.path.basename(pwd)

year = int(year_month[:4])
month = int(year_month[4:])
8 | 


--------------------------------------------------------------------------------
/reports/202008/map_wrapper.py:
--------------------------------------------------------------------------------
  1 | import math
  2 | import numpy as np
  3 | import pandas as pd
  4 | from config import *
  5 | import matplotlib.pyplot as plt
  6 | from mpl_toolkits.basemap import Basemap
  7 | 
  8 | def draw_city_map(data_city,headcount_scale, title):
  9 | 
 10 | 
 11 |     
 12 |     data_location = pd.read_csv('../city_locations.csv')
 13 |     data_location=data_location.set_index('city')    
 14 |     
 15 |     #cities = []
 16 |     scale = 5
 17 | 
 18 |     locations = [(116.407526, 39.90403),(120, 30)]
 19 |     #fig=plt.figure(figsize=(18, 16), dpi= 80, facecolor='w', edgecolor='k')
 20 |     plt.rcParams['figure.figsize'] = [13, 13]
 21 |     #plt.figure(figsize = (10,5))
 22 |     fig, ax = plt.subplots()
 23 |     fig.title=title
 24 |     fig.figsize=(10,5)
 25 |     fig.dpi=80
 26 |     cn_map= Basemap(llcrnrlon=77, llcrnrlat=14, urcrnrlon=140, urcrnrlat=51, \
 27 |                projection='lcc', lat_1=33, lat_2=45, lon_0=100) # ‘lcc'将投影方式设置为兰伯特投影
 28 | 
 29 |     # load the shapefile, use the name 'states'
 30 |     cn_map.readshapefile(r'D:/data/basemap/gadm36_CHN_shp/gadm36_CHN_1', name='china', drawbounds=True, color='gray')
 31 |     cn_map.readshapefile(r'D:/data/basemap/gadm36_TWN_shp/gadm36_TWN_1', name='taiwan', drawbounds=True, color='gray')
 32 |     #geolocator = Nominatim(user_agent="my-application")
 33 | 
 34 | 
 35 | 
 36 |     salary_min=data_city['平均工资'].min()
 37 |     salary_max=data_city['平均工资'].max()
 38 |     salary_middle = (salary_min+salary_max)/2
 39 |     salary_scale=salary_max-salary_min
 40 | 
 41 |     for index, row in data_city.iterrows():
 42 |         city=row['city']
 43 |         
 44 |         longitude = data_location.loc[city,'longitude']
 45 |         latitude = data_location.loc[city,'latitude']
 46 |         salary=row['平均工资']
 47 |         headcount=row['招聘人数']
 48 |         #color
 49 |         color_red=0
 50 |         color_green=0
 51 |         color_blue=0
 52 |         if salary>salary_middle:
 53 |             color_red = 255 #int((salary - salary_middle) / (salary_scale/2)*255)
 54 |             color_green = int((salary_max - salary) / (salary_scale/2)*255)
 55 |         else:
 56 |             color_blue = int((salary_middle - salary) / (salary_scale/2)*255)
 57 |             color_green = int((salary - salary_min) / (salary_scale/2)*255)
 58 |             color_red = int((salary - salary_min) / (salary_scale/2)*255)
 59 | 
 60 |         color = '#{:02x}{:02x}{:02x}'.format(color_red,color_green,color_blue)
 61 | 
 62 | 
 63 |         x, y = cn_map(longitude,latitude)
 64 |         cn_map.plot(x,y,marker='o',color=color,markersize=int(math.sqrt(headcount/headcount_scale))+8)
 65 |         #ax.annotate(city, (x,y), xytext=(5, 5), textcoords='offset points', fontsize=15)
 66 |         #"{}{:.0f}".format(city_cn, salary)
 67 |         #ax.text(x+5, y+5,city , fontweight='bold', fontsize=int(headcount/2500+12))
 68 |         fontsize=int(math.sqrt(headcount/headcount_scale))+13
 69 |         if city == '杭州':
 70 |             x=x-400000
 71 |             y=y+10000
 72 |         elif city=='广州':
 73 |             x=x-400000
 74 |             y=y+10000
 75 |         elif city=='合肥':
 76 |             x=x-300000
 77 |             y=y+10000
 78 |         elif city=='深圳':
 79 |             y=y-100000
 80 |         elif city=='南京':
 81 |             x=x-100000
 82 |         elif city=='天津':
 83 |             y=y-50000
 84 |         elif city=='上海':
 85 |             x=x+50000
 86 |         elif city=='武汉':
 87 |             y=y-50000
 88 |         elif city=='厦门':
 89 |             pass
 90 |         elif city=='福州':
 91 |             pass
 92 |         elif city=='苏州':
 93 |             y=y-100000
 94 |             pass
 95 |         elif city=='宁波':
 96 |             y=y-100000
 97 |             pass
 98 | 
 99 |         ax.text(x, y, "{}{:.0f}k".format(city, np.round(salary/1000)), fontweight='bold', fontsize=fontsize, bbox={'facecolor':color, 'alpha':0.3, 'pad':0})
100 |     ax.text(1100000, 1077845, "https://github.com/juwikuang/china_job_survey".format(city, np.round(salary/1000)), fontweight='bold',color='#999999', fontsize=20, bbox={'facecolor':'#eeeeee', 'alpha':0.4, 'pad':0})    
101 |     ax.text(205805, 107845, "https://github.com/juwikuang/china_job_survey".format(city, np.round(salary/1000)), fontweight='bold',color='#999999', fontsize=20, bbox={'facecolor':'#eeeeee', 'alpha':0.4, 'pad':0})    
102 |     ax.text(805805, 4007845, title.format(city, np.round(salary/1000)), fontweight='bold',color='#111111', fontsize=25)    
103 |     ax.text(805805, 3807845, "（城市大小代表招聘数量，颜色代表工资，红色最高，黄色次之，蓝最少）", fontweight='bold',color='#111111', fontsize=13)    
104 |     #cn_map.drawcoastlines() #绘制海岸线
105 |     #cn_map.drawcountries(linewidth=1.5) #绘制国家边界线
106 |     plt.show()
107 | 
def draw_province_map(data_city, headcount_scale, title, shapefile_root='D:/data/basemap'):
    """Plot per-province hiring volume and average salary on a Basemap map of China.

    One round marker and one text label are drawn per row of ``data_city``,
    placed at the coordinates listed for that province in
    ``../geo_data/provincial_capital_locations.csv``. Marker and font size
    grow with sqrt(headcount / headcount_scale). The colour runs from blue
    (lowest average salary) through yellow (midpoint) to red (highest).

    Args:
        data_city: DataFrame with a 'province' column and a ``salary_mean``
            column. The salary and headcount of each row are read by position
            (columns 1 and 5).
        headcount_scale: Divisor applied to the headcount before the square
            root is taken; larger values give smaller markers and labels.
        title: Heading drawn on the map. When at least one row was drawn it is
            passed through ``str.format`` with the last row's province and
            rounded salary (in thousands), exactly as before.
        shapefile_root: Directory holding the ``gadm36_CHN_shp`` and
            ``gadm36_TWN_shp`` folders. Defaults to the previously hard-coded
            location.

    Raises:
        KeyError: If a province is missing from the locations file.
    """
    data_location = pd.read_csv('../geo_data/provincial_capital_locations.csv', encoding='utf-8').set_index('province')

    plt.rcParams['figure.figsize'] = [13, 13]
    fig, ax = plt.subplots()
    fig.dpi = 80
    # 'lcc' selects the Lambert conformal conic projection.
    cn_map = Basemap(llcrnrlon=77, llcrnrlat=14, urcrnrlon=140, urcrnrlat=51,
                     projection='lcc', lat_1=33, lat_2=45, lon_0=100)

    # Province outlines for mainland China and Taiwan.
    cn_map.readshapefile('{}/gadm36_CHN_shp/gadm36_CHN_1'.format(shapefile_root), name='china', drawbounds=True, color='gray')
    cn_map.readshapefile('{}/gadm36_TWN_shp/gadm36_TWN_1'.format(shapefile_root), name='taiwan', drawbounds=True, color='gray')

    salary_min = data_city.salary_mean.min()
    salary_max = data_city.salary_mean.max()
    salary_middle = (salary_min + salary_max) / 2
    half_range = (salary_max - salary_min) / 2

    def salary_color(salary):
        """Map a salary onto the blue -> yellow -> red ramp as '#rrggbb'."""
        if not half_range > 0:
            # Single row or all salaries equal: the ramp is undefined (this
            # used to divide by zero), so use the midpoint colour.
            return '#ffff00'
        if salary > salary_middle:
            red = 255
            green = int((salary_max - salary) / half_range * 255)
            blue = 0
        else:
            blue = int((salary_middle - salary) / half_range * 255)
            red = green = int((salary - salary_min) / half_range * 255)
        return '#{:02x}{:02x}{:02x}'.format(red, green, blue)

    # Manual (dx, dy) label shifts in map projection units that keep
    # neighbouring province labels from overlapping.
    label_offsets = {
        '浙江': (0, -100000),
        '安徽': (-300000, 10000),
        '江苏': (-150000, 0),
        '天津': (0, -50000),
        '上海': (50000, 0),
        '湖北': (0, -50000),
    }

    last_label = None
    for _, row in data_city.iterrows():
        province = row['province']
        longitude = data_location.loc[province, 'longitude']
        latitude = data_location.loc[province, 'latitude']
        # Positional reads kept from the original, written with .iloc because
        # plain integer indexing on a label-indexed Series is deprecated.
        # NOTE(review): column 1 is presumably salary_mean — confirm.
        salary = row.iloc[1]
        headcount = row.iloc[5]

        color = salary_color(salary)
        size = int(math.sqrt(headcount / headcount_scale))
        salary_k = np.round(salary / 1000)

        x, y = cn_map(longitude, latitude)
        cn_map.plot(x, y, marker='o', color=color, markersize=size + 8)

        dx, dy = label_offsets.get(province, (0, 0))
        ax.text(x + dx, y + dy, "{}{:.0f}k".format(province, salary_k), fontweight='bold', fontsize=size + 13, bbox={'facecolor': color, 'alpha': 0.3, 'pad': 0})
        last_label = (province, salary_k)

    # With no rows there is nothing to format the title with; previously this
    # raised NameError on the unbound loop variables.
    heading = title.format(*last_label) if last_label is not None else title

    watermark = "https://github.com/juwikuang/china_job_survey"
    watermark_box = {'facecolor': '#eeeeee', 'alpha': 0.4, 'pad': 0}
    ax.text(2053805, 1077845, watermark, fontweight='bold', color='#999999', fontsize=20, bbox=watermark_box)
    ax.text(205805, 107845, watermark, fontweight='bold', color='#999999', fontsize=20, bbox=watermark_box)
    ax.text(805805, 4007845, heading, fontweight='bold', color='#111111', fontsize=25)
    ax.text(805805, 3807845, "（城市大小代表招聘数量，颜色代表工资，红色最高，黄色次之，蓝最少）", fontweight='bold', color='#111111', fontsize=13)
    plt.show()
191 |     
192 |     
def describe(data_city, career, report_year=None, report_month=None):
    """Print a one-sentence hiring and salary summary for every row of ``data_city``.

    Args:
        data_city: DataFrame read by position: column 0 is the place name,
            1 the average salary, 2 and 4 the bounds quoted as the 95% salary
            range, 3 the median salary and 5 the headcount.
        career: Job title inserted into each sentence.
        report_year: Year to print. Defaults to the module-level ``year``
            (brought in by ``from config import *``).
        report_month: Month to print. Defaults to the module-level ``month``.
    """
    if report_year is None:
        report_year = year
    if report_month is None:
        report_month = month

    for _, row in data_city.iterrows():
        # Explicit positional access: plain ``row[0]`` on a label-indexed
        # Series is deprecated in recent pandas releases.
        place, mean, low, median, high, headcount = (row.iloc[i] for i in range(6))
        # The second clause used to hard-code "2019" regardless of the report
        # period; both clauses now use the same year.
        print(f"{report_year}年{report_month}月{place}招收{career}{headcount}人。{report_year}年{report_month}月{place}{career}平均工资{mean:.0f}元，工资中位数{median:.0f}元，其中95%的人的工资介于{low:.0f}元到{high:.0f}元。\r\n")
197 | 


--------------------------------------------------------------------------------
/reports/202009/config.py:
--------------------------------------------------------------------------------
1 | import os
2 | 
# The report period is taken from the name of the working directory, which is
# expected to look like YYYYMM (the slicing below reads 4 digits of year and
# the rest as month).
pwd = os.getcwd()
# os.path.basename honours the platform's path separators; splitting on '\\'
# only isolated the folder name on Windows.
year_month = os.path.basename(pwd)

year = int(year_month[:4])
month = int(year_month[4:])
8 | 


--------------------------------------------------------------------------------
/reports/202009/map_wrapper.py:
--------------------------------------------------------------------------------
  1 | import math
  2 | import numpy as np
  3 | import pandas as pd
  4 | from config import *
  5 | import matplotlib.pyplot as plt
  6 | from mpl_toolkits.basemap import Basemap
  7 | 
  8 | def draw_city_map(data_city,headcount_scale, title):
  9 | 
 10 | 
 11 |     
 12 |     data_location = pd.read_csv('../city_locations.csv')
 13 |     data_location=data_location.set_index('city')    
 14 |     
 15 |     #cities = []
 16 |     scale = 5
 17 | 
 18 |     locations = [(116.407526, 39.90403),(120, 30)]
 19 |     #fig=plt.figure(figsize=(18, 16), dpi= 80, facecolor='w', edgecolor='k')
 20 |     plt.rcParams['figure.figsize'] = [13, 13]
 21 |     #plt.figure(figsize = (10,5))
 22 |     fig, ax = plt.subplots()
 23 |     fig.title=title
 24 |     fig.figsize=(10,5)
 25 |     fig.dpi=80
 26 |     cn_map= Basemap(llcrnrlon=77, llcrnrlat=14, urcrnrlon=140, urcrnrlat=51, \
 27 |                projection='lcc', lat_1=33, lat_2=45, lon_0=100) # ‘lcc'将投影方式设置为兰伯特投影
 28 | 
 29 |     # load the shapefile, use the name 'states'
 30 |     cn_map.readshapefile(r'D:/data/basemap/gadm36_CHN_shp/gadm36_CHN_1', name='china', drawbounds=True, color='gray')
 31 |     cn_map.readshapefile(r'D:/data/basemap/gadm36_TWN_shp/gadm36_TWN_1', name='taiwan', drawbounds=True, color='gray')
 32 |     #geolocator = Nominatim(user_agent="my-application")
 33 | 
 34 | 
 35 | 
 36 |     salary_min=data_city['平均工资'].min()
 37 |     salary_max=data_city['平均工资'].max()
 38 |     salary_middle = (salary_min+salary_max)/2
 39 |     salary_scale=salary_max-salary_min
 40 | 
 41 |     for index, row in data_city.iterrows():
 42 |         city=row['city']
 43 |         
 44 |         longitude = data_location.loc[city,'longitude']
 45 |         latitude = data_location.loc[city,'latitude']
 46 |         salary=row['平均工资']
 47 |         headcount=row['招聘人数']
 48 |         #color
 49 |         color_red=0
 50 |         color_green=0
 51 |         color_blue=0
 52 |         if salary>salary_middle:
 53 |             color_red = 255 #int((salary - salary_middle) / (salary_scale/2)*255)
 54 |             color_green = int((salary_max - salary) / (salary_scale/2)*255)
 55 |         else:
 56 |             color_blue = int((salary_middle - salary) / (salary_scale/2)*255)
 57 |             color_green = int((salary - salary_min) / (salary_scale/2)*255)
 58 |             color_red = int((salary - salary_min) / (salary_scale/2)*255)
 59 | 
 60 |         color = '#{:02x}{:02x}{:02x}'.format(color_red,color_green,color_blue)
 61 | 
 62 | 
 63 |         x, y = cn_map(longitude,latitude)
 64 |         cn_map.plot(x,y,marker='o',color=color,markersize=int(math.sqrt(headcount/headcount_scale))+8)
 65 |         #ax.annotate(city, (x,y), xytext=(5, 5), textcoords='offset points', fontsize=15)
 66 |         #"{}{:.0f}".format(city_cn, salary)
 67 |         #ax.text(x+5, y+5,city , fontweight='bold', fontsize=int(headcount/2500+12))
 68 |         fontsize=int(math.sqrt(headcount/headcount_scale))+13
 69 |         if city == '杭州':
 70 |             x=x-400000
 71 |             y=y+10000
 72 |         elif city=='广州':
 73 |             x=x-400000
 74 |             y=y+10000
 75 |         elif city=='合肥':
 76 |             x=x-300000
 77 |             y=y+10000
 78 |         elif city=='深圳':
 79 |             y=y-100000
 80 |         elif city=='南京':
 81 |             x=x-100000
 82 |         elif city=='天津':
 83 |             y=y-50000
 84 |         elif city=='上海':
 85 |             x=x+50000
 86 |         elif city=='武汉':
 87 |             y=y-50000
 88 |         elif city=='厦门':
 89 |             pass
 90 |         elif city=='福州':
 91 |             pass
 92 |         elif city=='苏州':
 93 |             y=y-100000
 94 |             pass
 95 |         elif city=='宁波':
 96 |             y=y-100000
 97 |             pass
 98 | 
 99 |         ax.text(x, y, "{}{:.0f}k".format(city, np.round(salary/1000)), fontweight='bold', fontsize=fontsize, bbox={'facecolor':color, 'alpha':0.3, 'pad':0})
100 |     ax.text(1100000, 1077845, "https://github.com/juwikuang/china_job_survey".format(city, np.round(salary/1000)), fontweight='bold',color='#999999', fontsize=20, bbox={'facecolor':'#eeeeee', 'alpha':0.4, 'pad':0})    
101 |     ax.text(205805, 107845, "https://github.com/juwikuang/china_job_survey".format(city, np.round(salary/1000)), fontweight='bold',color='#999999', fontsize=20, bbox={'facecolor':'#eeeeee', 'alpha':0.4, 'pad':0})    
102 |     ax.text(805805, 4007845, title.format(city, np.round(salary/1000)), fontweight='bold',color='#111111', fontsize=25)    
103 |     ax.text(805805, 3807845, "（城市大小代表招聘数量，颜色代表工资，红色最高，黄色次之，蓝最少）", fontweight='bold',color='#111111', fontsize=13)    
104 |     #cn_map.drawcoastlines() #绘制海岸线
105 |     #cn_map.drawcountries(linewidth=1.5) #绘制国家边界线
106 |     plt.show()
107 | 
def draw_province_map(data_city, headcount_scale, title, shapefile_root='D:/data/basemap'):
    """Plot per-province hiring volume and average salary on a Basemap map of China.

    One round marker and one text label are drawn per row of ``data_city``,
    placed at the coordinates listed for that province in
    ``../geo_data/provincial_capital_locations.csv``. Marker and font size
    grow with sqrt(headcount / headcount_scale). The colour runs from blue
    (lowest average salary) through yellow (midpoint) to red (highest).

    Args:
        data_city: DataFrame with a 'province' column and a ``salary_mean``
            column. The salary and headcount of each row are read by position
            (columns 1 and 5).
        headcount_scale: Divisor applied to the headcount before the square
            root is taken; larger values give smaller markers and labels.
        title: Heading drawn on the map. When at least one row was drawn it is
            passed through ``str.format`` with the last row's province and
            rounded salary (in thousands), exactly as before.
        shapefile_root: Directory holding the ``gadm36_CHN_shp`` and
            ``gadm36_TWN_shp`` folders. Defaults to the previously hard-coded
            location.

    Raises:
        KeyError: If a province is missing from the locations file.
    """
    data_location = pd.read_csv('../geo_data/provincial_capital_locations.csv', encoding='utf-8').set_index('province')

    plt.rcParams['figure.figsize'] = [13, 13]
    fig, ax = plt.subplots()
    fig.dpi = 80
    # 'lcc' selects the Lambert conformal conic projection.
    cn_map = Basemap(llcrnrlon=77, llcrnrlat=14, urcrnrlon=140, urcrnrlat=51,
                     projection='lcc', lat_1=33, lat_2=45, lon_0=100)

    # Province outlines for mainland China and Taiwan.
    cn_map.readshapefile('{}/gadm36_CHN_shp/gadm36_CHN_1'.format(shapefile_root), name='china', drawbounds=True, color='gray')
    cn_map.readshapefile('{}/gadm36_TWN_shp/gadm36_TWN_1'.format(shapefile_root), name='taiwan', drawbounds=True, color='gray')

    salary_min = data_city.salary_mean.min()
    salary_max = data_city.salary_mean.max()
    salary_middle = (salary_min + salary_max) / 2
    half_range = (salary_max - salary_min) / 2

    def salary_color(salary):
        """Map a salary onto the blue -> yellow -> red ramp as '#rrggbb'."""
        if not half_range > 0:
            # Single row or all salaries equal: the ramp is undefined (this
            # used to divide by zero), so use the midpoint colour.
            return '#ffff00'
        if salary > salary_middle:
            red = 255
            green = int((salary_max - salary) / half_range * 255)
            blue = 0
        else:
            blue = int((salary_middle - salary) / half_range * 255)
            red = green = int((salary - salary_min) / half_range * 255)
        return '#{:02x}{:02x}{:02x}'.format(red, green, blue)

    # Manual (dx, dy) label shifts in map projection units that keep
    # neighbouring province labels from overlapping.
    label_offsets = {
        '浙江': (0, -100000),
        '安徽': (-300000, 10000),
        '江苏': (-150000, 0),
        '天津': (0, -50000),
        '上海': (50000, 0),
        '湖北': (0, -50000),
    }

    last_label = None
    for _, row in data_city.iterrows():
        province = row['province']
        longitude = data_location.loc[province, 'longitude']
        latitude = data_location.loc[province, 'latitude']
        # Positional reads kept from the original, written with .iloc because
        # plain integer indexing on a label-indexed Series is deprecated.
        # NOTE(review): column 1 is presumably salary_mean — confirm.
        salary = row.iloc[1]
        headcount = row.iloc[5]

        color = salary_color(salary)
        size = int(math.sqrt(headcount / headcount_scale))
        salary_k = np.round(salary / 1000)

        x, y = cn_map(longitude, latitude)
        cn_map.plot(x, y, marker='o', color=color, markersize=size + 8)

        dx, dy = label_offsets.get(province, (0, 0))
        ax.text(x + dx, y + dy, "{}{:.0f}k".format(province, salary_k), fontweight='bold', fontsize=size + 13, bbox={'facecolor': color, 'alpha': 0.3, 'pad': 0})
        last_label = (province, salary_k)

    # With no rows there is nothing to format the title with; previously this
    # raised NameError on the unbound loop variables.
    heading = title.format(*last_label) if last_label is not None else title

    watermark = "https://github.com/juwikuang/china_job_survey"
    watermark_box = {'facecolor': '#eeeeee', 'alpha': 0.4, 'pad': 0}
    ax.text(2053805, 1077845, watermark, fontweight='bold', color='#999999', fontsize=20, bbox=watermark_box)
    ax.text(205805, 107845, watermark, fontweight='bold', color='#999999', fontsize=20, bbox=watermark_box)
    ax.text(805805, 4007845, heading, fontweight='bold', color='#111111', fontsize=25)
    ax.text(805805, 3807845, "（城市大小代表招聘数量，颜色代表工资，红色最高，黄色次之，蓝最少）", fontweight='bold', color='#111111', fontsize=13)
    plt.show()
191 |     
192 |     
def describe(data_city, career, report_year=None, report_month=None):
    """Print a one-sentence hiring and salary summary for every row of ``data_city``.

    Args:
        data_city: DataFrame read by position: column 0 is the place name,
            1 the average salary, 2 and 4 the bounds quoted as the 95% salary
            range, 3 the median salary and 5 the headcount.
        career: Job title inserted into each sentence.
        report_year: Year to print. Defaults to the module-level ``year``
            (brought in by ``from config import *``).
        report_month: Month to print. Defaults to the module-level ``month``.
    """
    if report_year is None:
        report_year = year
    if report_month is None:
        report_month = month

    for _, row in data_city.iterrows():
        # Explicit positional access: plain ``row[0]`` on a label-indexed
        # Series is deprecated in recent pandas releases.
        place, mean, low, median, high, headcount = (row.iloc[i] for i in range(6))
        # The second clause used to hard-code "2019" regardless of the report
        # period; both clauses now use the same year.
        print(f"{report_year}年{report_month}月{place}招收{career}{headcount}人。{report_year}年{report_month}月{place}{career}平均工资{mean:.0f}元，工资中位数{median:.0f}元，其中95%的人的工资介于{low:.0f}元到{high:.0f}元。\r\n")
197 | 


--------------------------------------------------------------------------------
/reports/202010/config.py:
--------------------------------------------------------------------------------
1 | import os
2 | 
# The report period is taken from the name of the working directory, which is
# expected to look like YYYYMM (the slicing below reads 4 digits of year and
# the rest as month).
pwd = os.getcwd()
# os.path.basename honours the platform's path separators; splitting on '\\'
# only isolated the folder name on Windows.
year_month = os.path.basename(pwd)

year = int(year_month[:4])
month = int(year_month[4:])
8 | 


--------------------------------------------------------------------------------
/reports/202010/map_wrapper.py:
--------------------------------------------------------------------------------
  1 | import math
  2 | import numpy as np
  3 | import pandas as pd
  4 | from config import *
  5 | import matplotlib.pyplot as plt
  6 | from mpl_toolkits.basemap import Basemap
  7 | 
  8 | def draw_city_map(data_city,headcount_scale, title):
  9 | 
 10 | 
 11 |     
 12 |     data_location = pd.read_csv('../city_locations.csv')
 13 |     data_location=data_location.set_index('city')    
 14 |     
 15 |     #cities = []
 16 |     scale = 5
 17 | 
 18 |     locations = [(116.407526, 39.90403),(120, 30)]
 19 |     #fig=plt.figure(figsize=(18, 16), dpi= 80, facecolor='w', edgecolor='k')
 20 |     plt.rcParams['figure.figsize'] = [13, 13]
 21 |     #plt.figure(figsize = (10,5))
 22 |     fig, ax = plt.subplots()
 23 |     fig.title=title
 24 |     fig.figsize=(10,5)
 25 |     fig.dpi=80
 26 |     cn_map= Basemap(llcrnrlon=77, llcrnrlat=14, urcrnrlon=140, urcrnrlat=51, \
 27 |                projection='lcc', lat_1=33, lat_2=45, lon_0=100) # ‘lcc'将投影方式设置为兰伯特投影
 28 | 
 29 |     # load the shapefile, use the name 'states'
 30 |     cn_map.readshapefile(r'D:/data/basemap/gadm36_CHN_shp/gadm36_CHN_1', name='china', drawbounds=True, color='gray')
 31 |     cn_map.readshapefile(r'D:/data/basemap/gadm36_TWN_shp/gadm36_TWN_1', name='taiwan', drawbounds=True, color='gray')
 32 |     #geolocator = Nominatim(user_agent="my-application")
 33 | 
 34 | 
 35 | 
 36 |     salary_min=data_city['平均工资'].min()
 37 |     salary_max=data_city['平均工资'].max()
 38 |     salary_middle = (salary_min+salary_max)/2
 39 |     salary_scale=salary_max-salary_min
 40 | 
 41 |     for index, row in data_city.iterrows():
 42 |         city=row['city']
 43 |         
 44 |         longitude = data_location.loc[city,'longitude']
 45 |         latitude = data_location.loc[city,'latitude']
 46 |         salary=row['平均工资']
 47 |         headcount=row['招聘人数']
 48 |         #color
 49 |         color_red=0
 50 |         color_green=0
 51 |         color_blue=0
 52 |         if salary>salary_middle:
 53 |             color_red = 255 #int((salary - salary_middle) / (salary_scale/2)*255)
 54 |             color_green = int((salary_max - salary) / (salary_scale/2)*255)
 55 |         else:
 56 |             color_blue = int((salary_middle - salary) / (salary_scale/2)*255)
 57 |             color_green = int((salary - salary_min) / (salary_scale/2)*255)
 58 |             color_red = int((salary - salary_min) / (salary_scale/2)*255)
 59 | 
 60 |         color = '#{:02x}{:02x}{:02x}'.format(color_red,color_green,color_blue)
 61 | 
 62 | 
 63 |         x, y = cn_map(longitude,latitude)
 64 |         cn_map.plot(x,y,marker='o',color=color,markersize=int(math.sqrt(headcount/headcount_scale))+8)
 65 |         #ax.annotate(city, (x,y), xytext=(5, 5), textcoords='offset points', fontsize=15)
 66 |         #"{}{:.0f}".format(city_cn, salary)
 67 |         #ax.text(x+5, y+5,city , fontweight='bold', fontsize=int(headcount/2500+12))
 68 |         fontsize=int(math.sqrt(headcount/headcount_scale))+13
 69 |         if city == '杭州':
 70 |             x=x-400000
 71 |             y=y+10000
 72 |         elif city=='广州':
 73 |             x=x-400000
 74 |             y=y+10000
 75 |         elif city=='合肥':
 76 |             x=x-300000
 77 |             y=y+10000
 78 |         elif city=='深圳':
 79 |             y=y-100000
 80 |         elif city=='南京':
 81 |             x=x-100000
 82 |         elif city=='天津':
 83 |             y=y-50000
 84 |         elif city=='上海':
 85 |             x=x+50000
 86 |         elif city=='武汉':
 87 |             y=y-50000
 88 |         elif city=='厦门':
 89 |             pass
 90 |         elif city=='福州':
 91 |             pass
 92 |         elif city=='苏州':
 93 |             y=y-100000
 94 |             pass
 95 |         elif city=='宁波':
 96 |             y=y-100000
 97 |             pass
 98 | 
 99 |         ax.text(x, y, "{}{:.0f}k".format(city, np.round(salary/1000)), fontweight='bold', fontsize=fontsize, bbox={'facecolor':color, 'alpha':0.3, 'pad':0})
100 |     ax.text(1100000, 1077845, "https://github.com/juwikuang/china_job_survey".format(city, np.round(salary/1000)), fontweight='bold',color='#999999', fontsize=20, bbox={'facecolor':'#eeeeee', 'alpha':0.4, 'pad':0})    
101 |     ax.text(205805, 107845, "https://github.com/juwikuang/china_job_survey".format(city, np.round(salary/1000)), fontweight='bold',color='#999999', fontsize=20, bbox={'facecolor':'#eeeeee', 'alpha':0.4, 'pad':0})    
102 |     ax.text(805805, 4007845, title.format(city, np.round(salary/1000)), fontweight='bold',color='#111111', fontsize=25)    
103 |     ax.text(805805, 3807845, "（城市大小代表招聘数量，颜色代表工资，红色最高，黄色次之，蓝最少）", fontweight='bold',color='#111111', fontsize=13)    
104 |     #cn_map.drawcoastlines() #绘制海岸线
105 |     #cn_map.drawcountries(linewidth=1.5) #绘制国家边界线
106 |     plt.show()
107 | 
def draw_province_map(data_city, headcount_scale, title, shapefile_root='D:/data/basemap'):
    """Plot per-province hiring volume and average salary on a Basemap map of China.

    One round marker and one text label are drawn per row of ``data_city``,
    placed at the coordinates listed for that province in
    ``../geo_data/provincial_capital_locations.csv``. Marker and font size
    grow with sqrt(headcount / headcount_scale). The colour runs from blue
    (lowest average salary) through yellow (midpoint) to red (highest).

    Args:
        data_city: DataFrame with a 'province' column and a ``salary_mean``
            column. The salary and headcount of each row are read by position
            (columns 1 and 5).
        headcount_scale: Divisor applied to the headcount before the square
            root is taken; larger values give smaller markers and labels.
        title: Heading drawn on the map. When at least one row was drawn it is
            passed through ``str.format`` with the last row's province and
            rounded salary (in thousands), exactly as before.
        shapefile_root: Directory holding the ``gadm36_CHN_shp`` and
            ``gadm36_TWN_shp`` folders. Defaults to the previously hard-coded
            location.

    Raises:
        KeyError: If a province is missing from the locations file.
    """
    data_location = pd.read_csv('../geo_data/provincial_capital_locations.csv', encoding='utf-8').set_index('province')

    plt.rcParams['figure.figsize'] = [13, 13]
    fig, ax = plt.subplots()
    fig.dpi = 80
    # 'lcc' selects the Lambert conformal conic projection.
    cn_map = Basemap(llcrnrlon=77, llcrnrlat=14, urcrnrlon=140, urcrnrlat=51,
                     projection='lcc', lat_1=33, lat_2=45, lon_0=100)

    # Province outlines for mainland China and Taiwan.
    cn_map.readshapefile('{}/gadm36_CHN_shp/gadm36_CHN_1'.format(shapefile_root), name='china', drawbounds=True, color='gray')
    cn_map.readshapefile('{}/gadm36_TWN_shp/gadm36_TWN_1'.format(shapefile_root), name='taiwan', drawbounds=True, color='gray')

    salary_min = data_city.salary_mean.min()
    salary_max = data_city.salary_mean.max()
    salary_middle = (salary_min + salary_max) / 2
    half_range = (salary_max - salary_min) / 2

    def salary_color(salary):
        """Map a salary onto the blue -> yellow -> red ramp as '#rrggbb'."""
        if not half_range > 0:
            # Single row or all salaries equal: the ramp is undefined (this
            # used to divide by zero), so use the midpoint colour.
            return '#ffff00'
        if salary > salary_middle:
            red = 255
            green = int((salary_max - salary) / half_range * 255)
            blue = 0
        else:
            blue = int((salary_middle - salary) / half_range * 255)
            red = green = int((salary - salary_min) / half_range * 255)
        return '#{:02x}{:02x}{:02x}'.format(red, green, blue)

    # Manual (dx, dy) label shifts in map projection units that keep
    # neighbouring province labels from overlapping.
    label_offsets = {
        '浙江': (0, -100000),
        '安徽': (-300000, 10000),
        '江苏': (-150000, 0),
        '天津': (0, -50000),
        '上海': (50000, 0),
        '湖北': (0, -50000),
    }

    last_label = None
    for _, row in data_city.iterrows():
        province = row['province']
        longitude = data_location.loc[province, 'longitude']
        latitude = data_location.loc[province, 'latitude']
        # Positional reads kept from the original, written with .iloc because
        # plain integer indexing on a label-indexed Series is deprecated.
        # NOTE(review): column 1 is presumably salary_mean — confirm.
        salary = row.iloc[1]
        headcount = row.iloc[5]

        color = salary_color(salary)
        size = int(math.sqrt(headcount / headcount_scale))
        salary_k = np.round(salary / 1000)

        x, y = cn_map(longitude, latitude)
        cn_map.plot(x, y, marker='o', color=color, markersize=size + 8)

        dx, dy = label_offsets.get(province, (0, 0))
        ax.text(x + dx, y + dy, "{}{:.0f}k".format(province, salary_k), fontweight='bold', fontsize=size + 13, bbox={'facecolor': color, 'alpha': 0.3, 'pad': 0})
        last_label = (province, salary_k)

    # With no rows there is nothing to format the title with; previously this
    # raised NameError on the unbound loop variables.
    heading = title.format(*last_label) if last_label is not None else title

    watermark = "https://github.com/juwikuang/china_job_survey"
    watermark_box = {'facecolor': '#eeeeee', 'alpha': 0.4, 'pad': 0}
    ax.text(2053805, 1077845, watermark, fontweight='bold', color='#999999', fontsize=20, bbox=watermark_box)
    ax.text(205805, 107845, watermark, fontweight='bold', color='#999999', fontsize=20, bbox=watermark_box)
    ax.text(805805, 4007845, heading, fontweight='bold', color='#111111', fontsize=25)
    ax.text(805805, 3807845, "（城市大小代表招聘数量，颜色代表工资，红色最高，黄色次之，蓝最少）", fontweight='bold', color='#111111', fontsize=13)
    plt.show()
191 |     
192 |     
def describe(data_city, career, report_year=None, report_month=None):
    """Print a one-sentence hiring and salary summary for every row of ``data_city``.

    Args:
        data_city: DataFrame read by position: column 0 is the place name,
            1 the average salary, 2 and 4 the bounds quoted as the 95% salary
            range, 3 the median salary and 5 the headcount.
        career: Job title inserted into each sentence.
        report_year: Year to print. Defaults to the module-level ``year``
            (brought in by ``from config import *``).
        report_month: Month to print. Defaults to the module-level ``month``.
    """
    if report_year is None:
        report_year = year
    if report_month is None:
        report_month = month

    for _, row in data_city.iterrows():
        # Explicit positional access: plain ``row[0]`` on a label-indexed
        # Series is deprecated in recent pandas releases.
        place, mean, low, median, high, headcount = (row.iloc[i] for i in range(6))
        # The second clause used to hard-code "2019" regardless of the report
        # period; both clauses now use the same year.
        print(f"{report_year}年{report_month}月{place}招收{career}{headcount}人。{report_year}年{report_month}月{place}{career}平均工资{mean:.0f}元，工资中位数{median:.0f}元，其中95%的人的工资介于{low:.0f}元到{high:.0f}元。\r\n")
197 | 


--------------------------------------------------------------------------------
/reports/202010/provinces_basemap.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": 1,
  6 |    "metadata": {},
  7 |    "outputs": [
  8 |     {
  9 |      "ename": "ModuleNotFoundError",
 10 |      "evalue": "No module named 'mpl_toolkits.basemap'",
 11 |      "output_type": "error",
 12 |      "traceback": [
 13 |       "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
 14 |       "\u001b[1;31mModuleNotFoundError\u001b[0m                       Traceback (most recent call last)",
 15 |       "\u001b[1;32m<ipython-input-1-4644dc56fc29>\u001b[0m in \u001b[0;36m<module>\u001b[1;34m\u001b[0m\n\u001b[0;32m      1\u001b[0m \u001b[1;32mfrom\u001b[0m \u001b[0mconfig\u001b[0m \u001b[1;32mimport\u001b[0m \u001b[1;33m*\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m----> 2\u001b[1;33m \u001b[1;32mfrom\u001b[0m \u001b[0mmap_wrapper\u001b[0m \u001b[1;32mimport\u001b[0m \u001b[1;33m*\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m",
 16 |       "\u001b[1;32mD:\\projects\\51job_survey\\51job_survey_py\\reports\\202010\\map_wrapper.py\u001b[0m in \u001b[0;36m<module>\u001b[1;34m\u001b[0m\n\u001b[0;32m      4\u001b[0m \u001b[1;32mfrom\u001b[0m \u001b[0mconfig\u001b[0m \u001b[1;32mimport\u001b[0m \u001b[1;33m*\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m      5\u001b[0m \u001b[1;32mimport\u001b[0m \u001b[0mmatplotlib\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mpyplot\u001b[0m \u001b[1;32mas\u001b[0m \u001b[0mplt\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m----> 6\u001b[1;33m \u001b[1;32mfrom\u001b[0m \u001b[0mmpl_toolkits\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mbasemap\u001b[0m \u001b[1;32mimport\u001b[0m \u001b[0mBasemap\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m      7\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m      8\u001b[0m \u001b[1;32mdef\u001b[0m \u001b[0mdraw_city_map\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mdata_city\u001b[0m\u001b[1;33m,\u001b[0m\u001b[0mheadcount_scale\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mtitle\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
 17 |       "\u001b[1;31mModuleNotFoundError\u001b[0m: No module named 'mpl_toolkits.basemap'"
 18 |      ]
 19 |     }
 20 |    ],
 21 |    "source": [
 22 |     "from config import *\n",
 23 |     "from map_wrapper import *"
 24 |    ]
 25 |   },
 26 |   {
 27 |    "cell_type": "code",
 28 |    "execution_count": null,
 29 |    "metadata": {},
 30 |    "outputs": [],
 31 |    "source": [
 32 |     "print(f'{year}年{month}月')"
 33 |    ]
 34 |   },
 35 |   {
 36 |    "cell_type": "code",
 37 |    "execution_count": null,
 38 |    "metadata": {},
 39 |    "outputs": [],
 40 |    "source": [
 41 |     "import pandas as pd\n",
 42 |     "import sys\n",
 43 |     "sys.path.append('../../py')\n",
 44 |     "import db\n",
 45 |     "import weighted\n",
 46 |     "import inspect\n",
 47 |     "import matplotlib.pyplot as plt\n",
 48 |     "plt.rcParams['font.sans-serif']=['SimHei'] #用来正常显示中文标签\n",
 49 |     "plt.rcParams['axes.unicode_minus']=False #用来正常显示负号\n",
 50 |     "%matplotlib inline\n",
 51 |     "from mpl_toolkits.basemap import Basemap\n",
 52 |     "import seaborn as sns\n",
 53 |     "import scipy.stats as stats\n",
 54 |     "import numpy as np\n",
 55 |     "import math\n",
 56 |     "from matplotlib.font_manager import _rebuild\n",
 57 |     "\n",
 58 |     "_rebuild() #reload一下"
 59 |    ]
 60 |   },
 61 |   {
 62 |    "cell_type": "code",
 63 |    "execution_count": null,
 64 |    "metadata": {},
 65 |    "outputs": [],
 66 |    "source": [
 67 |     "conn=db.get_conn()\n",
 68 |     "data_original=pd.read_sql(sql=f\"select * from _{year}{month:02} where monthly_salary>0 and monthly_salary<80000\", con=conn)\n",
 69 |     "\n",
 70 |     "data=data_original[~data_original.job_id.isin(error_job_ids)]\n",
 71 |     "\n",
 72 |     "del data['publish_date']\n",
 73 |     "del data['published_on_weekend']\n",
 74 |     "del data['title']\n",
 75 |     "#del data['company_title']\n",
 76 |     "#del data['company_description']\n",
 77 |     "del data['job_description']\n",
 78 |     "del data['job_id']\n",
 79 |     "\n",
 80 |     "\n"
 81 |    ]
 82 |   },
 83 |   {
 84 |    "cell_type": "code",
 85 |    "execution_count": null,
 86 |    "metadata": {},
 87 |    "outputs": [],
 88 |    "source": [
 89 |     "\n",
 90 |     "join_counts=[conn.execute(f\"select COUNT(1) from _{year}{month:02}\").fetchall()[0][0]]\n",
 91 |     "percents=[]\n",
 92 |     "for i in range(1,month-6+1):\n",
 93 |     "    sql=f\"select COUNT(1) from _{year}{month:02} a join _{year}{month-i:02} b on a.job_id = b.job_id\"\n",
 94 |     "    #print(sql)\n",
 95 |     "    count=conn.execute(sql).fetchall()[0][0]\n",
 96 |     "\n",
 97 |     "    join_counts.append(count)\n",
 98 |     "    subtract = join_counts[i-1]-join_counts[i]\n",
 99 |     "    percents.append(subtract*1.0/join_counts[i])\n",
100 |     "\n",
101 |     "percents.append(join_counts[-1]/join_counts[0])"
102 |    ]
103 |   },
104 |   {
105 |    "cell_type": "code",
106 |    "execution_count": null,
107 |    "metadata": {},
108 |    "outputs": [],
109 |    "source": [
110 |     "join_counts"
111 |    ]
112 |   },
113 |   {
114 |    "cell_type": "code",
115 |    "execution_count": null,
116 |    "metadata": {},
117 |    "outputs": [],
118 |    "source": [
119 |     "percents"
120 |    ]
121 |   },
122 |   {
123 |    "cell_type": "code",
124 |    "execution_count": null,
125 |    "metadata": {},
126 |    "outputs": [],
127 |    "source": [
128 |     "#plt.pie(percents, labels=['1','2','3','4','5','6','7','7+'])\n",
129 |     "#plt.show()"
130 |    ]
131 |   },
132 |   {
133 |    "cell_type": "code",
134 |    "execution_count": null,
135 |    "metadata": {},
136 |    "outputs": [],
137 |    "source": [
138 |     "data.shape[0]"
139 |    ]
140 |   },
141 |   {
142 |    "cell_type": "code",
143 |    "execution_count": null,
144 |    "metadata": {},
145 |    "outputs": [],
146 |    "source": [
147 |     "conn.close()"
148 |    ]
149 |   },
150 |   {
151 |    "cell_type": "code",
152 |    "execution_count": null,
153 |    "metadata": {},
154 |    "outputs": [],
155 |    "source": [
156 |     "#Common Functions\n",
157 |     "def get_sub_stats_by_col(data, col):\n",
158 |     "    categories=data[col].unique()\n",
159 |     "    salary_mean=[]\n",
160 |     "    salary_95_min=[]\n",
161 |     "    salary_95_max=[]\n",
162 |     "    salary_median=[]\n",
163 |     "\n",
164 |     "    count=[]\n",
165 |     "    \n",
166 |     "    categorys_out=[]\n",
167 |     "    for category in categories:\n",
168 |     "        #print(feature)\n",
169 |     "        idata=data[data[col]==category]\n",
170 |     "        headcount=idata.headcount.sum()\n",
171 |     "        values = idata.monthly_salary.values\n",
172 |     "        weights = idata.headcount.values\n",
173 |     "        #print(str(headcount))\n",
174 |     "        if headcount==0:\n",
175 |     "            continue\n",
176 |     "        \n",
177 |     "        salary_mean.append(np.average(values, weights=weights))\n",
178 |     "        \n",
179 |     "\n",
180 |     "        q = weighted.weighted_quantile(values,[0.025,0.5,0.975],weights)\n",
181 |     "        salary_95_min.append(q[0])\n",
182 |     "        salary_median.append(q[1])\n",
183 |     "        salary_95_max.append(q[2])\n",
184 |     "        count.append(idata.headcount.sum())\n",
185 |     "        categorys_out.append(category)\n",
186 |     "    sub_data=pd.DataFrame()\n",
187 |     "    sub_data[col]=[c for c in categorys_out]\n",
188 |     "    sub_data['salary_mean']=salary_mean\n",
189 |     "    sub_data['salary_95_min']=salary_95_min\n",
190 |     "    sub_data['salary_median']=salary_median\n",
191 |     "    sub_data['salary_95_max']=salary_95_max\n",
192 |     "    sub_data['head_count']=count\n",
193 |     "    sub_data['percentage']=count/np.sum(count)\n",
194 |     "    sub_data=sub_data.sort_values(by='salary_mean', ascending=False)\n",
195 |     "\n",
196 |     "    return sub_data\n",
197 |     "\n",
198 |     "data_format={\"percentage\":\"{:.2%}\",\"salary_mean\":\"{:.0f}\",\"salary_median\":\"{:.0f}\",\"salary_95_min\":\"{:.0f}\",\"salary_95_max\":\"{:.0f}\"}\n",
199 |     "\n"
200 |    ]
201 |   },
202 |   {
203 |    "cell_type": "code",
204 |    "execution_count": null,
205 |    "metadata": {},
206 |    "outputs": [],
207 |    "source": [
208 |     "data_career=get_sub_stats_by_col(data,'career')\n",
209 |     "data_career.style.format(data_format)"
210 |    ]
211 |   },
212 |   {
213 |    "cell_type": "markdown",
214 |    "metadata": {},
215 |    "source": [
216 |     "# 程序员工资"
217 |    ]
218 |   },
219 |   {
220 |    "cell_type": "code",
221 |    "execution_count": null,
222 |    "metadata": {},
223 |    "outputs": [],
224 |    "source": [
225 |     "data_city=get_sub_stats_by_col(data,'province')\n",
226 |     "#data_city.city=data_city.city.map(translate_dict)\n",
227 |     "data_city.style.hide_index().format(data_format)"
228 |    ]
229 |   },
230 |   {
231 |    "cell_type": "code",
232 |    "execution_count": null,
233 |    "metadata": {},
234 |    "outputs": [],
235 |    "source": []
236 |   },
237 |   {
238 |    "cell_type": "code",
239 |    "execution_count": null,
240 |    "metadata": {},
241 |    "outputs": [],
242 |    "source": [
243 |     "describe(data_city,'程序员')"
244 |    ]
245 |   },
246 |   {
247 |    "cell_type": "code",
248 |    "execution_count": null,
249 |    "metadata": {},
250 |    "outputs": [],
251 |    "source": []
252 |   },
253 |   {
254 |    "cell_type": "code",
255 |    "execution_count": null,
256 |    "metadata": {},
257 |    "outputs": [],
258 |    "source": [
259 |     "draw_province_map(data_city,2000,'2019年5月中国大陆各省程序员工资')"
260 |    ]
261 |   }
262 |  ],
263 |  "metadata": {
264 |   "kernelspec": {
265 |    "display_name": "Python 3",
266 |    "language": "python",
267 |    "name": "python3"
268 |   },
269 |   "language_info": {
270 |    "codemirror_mode": {
271 |     "name": "ipython",
272 |     "version": 3
273 |    },
274 |    "file_extension": ".py",
275 |    "mimetype": "text/x-python",
276 |    "name": "python",
277 |    "nbconvert_exporter": "python",
278 |    "pygments_lexer": "ipython3",
279 |    "version": "3.8.3"
280 |   }
281 |  },
282 |  "nbformat": 4,
283 |  "nbformat_minor": 4
284 | }
285 | 


--------------------------------------------------------------------------------
/reports/202011/config.py:
--------------------------------------------------------------------------------
1 | import os
2 | 
# The reporting period is taken from the name of the current working
# directory, whose first four characters are parsed as the year and the rest
# as the month (e.g. a directory named "202011").
pwd=os.getcwd()
# os.path.basename splits on the separator of the running platform, so this
# also works where paths use '/' instead of '\\'.
year_month=os.path.basename(pwd)

year=int(year_month[:4])
month=int(year_month[4:])
8 | 


--------------------------------------------------------------------------------
/reports/202011/map_wrapper.py:
--------------------------------------------------------------------------------
  1 | import math
  2 | import numpy as np
  3 | import pandas as pd
  4 | from config import *
  5 | import matplotlib.pyplot as plt
  6 | from mpl_toolkits.basemap import Basemap
  7 | 
  8 | def draw_city_map(data_city,headcount_scale, title):
  9 | 
 10 | 
 11 |     
 12 |     data_location = pd.read_csv('../city_locations.csv')
 13 |     data_location=data_location.set_index('city')    
 14 |     
 15 |     #cities = []
 16 |     scale = 5
 17 | 
 18 |     locations = [(116.407526, 39.90403),(120, 30)]
 19 |     #fig=plt.figure(figsize=(18, 16), dpi= 80, facecolor='w', edgecolor='k')
 20 |     plt.rcParams['figure.figsize'] = [13, 13]
 21 |     #plt.figure(figsize = (10,5))
 22 |     fig, ax = plt.subplots()
 23 |     fig.title=title
 24 |     fig.figsize=(10,5)
 25 |     fig.dpi=80
 26 |     cn_map= Basemap(llcrnrlon=77, llcrnrlat=14, urcrnrlon=140, urcrnrlat=51, \
 27 |                projection='lcc', lat_1=33, lat_2=45, lon_0=100) # ‘lcc'将投影方式设置为兰伯特投影
 28 | 
 29 |     # load the shapefile, use the name 'states'
 30 |     cn_map.readshapefile(r'D:/data/basemap/gadm36_CHN_shp/gadm36_CHN_1', name='china', drawbounds=True, color='gray')
 31 |     cn_map.readshapefile(r'D:/data/basemap/gadm36_TWN_shp/gadm36_TWN_1', name='taiwan', drawbounds=True, color='gray')
 32 |     #geolocator = Nominatim(user_agent="my-application")
 33 | 
 34 | 
 35 | 
 36 |     salary_min=data_city['平均工资'].min()
 37 |     salary_max=data_city['平均工资'].max()
 38 |     salary_middle = (salary_min+salary_max)/2
 39 |     salary_scale=salary_max-salary_min
 40 | 
 41 |     for index, row in data_city.iterrows():
 42 |         city=row['city']
 43 |         
 44 |         longitude = data_location.loc[city,'longitude']
 45 |         latitude = data_location.loc[city,'latitude']
 46 |         salary=row['平均工资']
 47 |         headcount=row['招聘人数']
 48 |         #color
 49 |         color_red=0
 50 |         color_green=0
 51 |         color_blue=0
 52 |         if salary>salary_middle:
 53 |             color_red = 255 #int((salary - salary_middle) / (salary_scale/2)*255)
 54 |             color_green = int((salary_max - salary) / (salary_scale/2)*255)
 55 |         else:
 56 |             color_blue = int((salary_middle - salary) / (salary_scale/2)*255)
 57 |             color_green = int((salary - salary_min) / (salary_scale/2)*255)
 58 |             color_red = int((salary - salary_min) / (salary_scale/2)*255)
 59 | 
 60 |         color = '#{:02x}{:02x}{:02x}'.format(color_red,color_green,color_blue)
 61 | 
 62 | 
 63 |         x, y = cn_map(longitude,latitude)
 64 |         cn_map.plot(x,y,marker='o',color=color,markersize=int(math.sqrt(headcount/headcount_scale))+8)
 65 |         #ax.annotate(city, (x,y), xytext=(5, 5), textcoords='offset points', fontsize=15)
 66 |         #"{}{:.0f}".format(city_cn, salary)
 67 |         #ax.text(x+5, y+5,city , fontweight='bold', fontsize=int(headcount/2500+12))
 68 |         fontsize=int(math.sqrt(headcount/headcount_scale))+13
 69 |         if city == '杭州':
 70 |             x=x-400000
 71 |             y=y+10000
 72 |         elif city=='广州':
 73 |             x=x-400000
 74 |             y=y+10000
 75 |         elif city=='合肥':
 76 |             x=x-300000
 77 |             y=y+10000
 78 |         elif city=='深圳':
 79 |             y=y-100000
 80 |         elif city=='南京':
 81 |             x=x-100000
 82 |         elif city=='天津':
 83 |             y=y-50000
 84 |         elif city=='上海':
 85 |             x=x+50000
 86 |         elif city=='武汉':
 87 |             y=y-50000
 88 |         elif city=='厦门':
 89 |             pass
 90 |         elif city=='福州':
 91 |             pass
 92 |         elif city=='苏州':
 93 |             y=y-100000
 94 |             pass
 95 |         elif city=='宁波':
 96 |             y=y-100000
 97 |             pass
 98 | 
 99 |         ax.text(x, y, "{}{:.0f}k".format(city, np.round(salary/1000)), fontweight='bold', fontsize=fontsize, bbox={'facecolor':color, 'alpha':0.3, 'pad':0})
100 |     ax.text(1100000, 1077845, "https://github.com/juwikuang/china_job_survey".format(city, np.round(salary/1000)), fontweight='bold',color='#999999', fontsize=20, bbox={'facecolor':'#eeeeee', 'alpha':0.4, 'pad':0})    
101 |     ax.text(205805, 107845, "https://github.com/juwikuang/china_job_survey".format(city, np.round(salary/1000)), fontweight='bold',color='#999999', fontsize=20, bbox={'facecolor':'#eeeeee', 'alpha':0.4, 'pad':0})    
102 |     ax.text(805805, 4007845, title.format(city, np.round(salary/1000)), fontweight='bold',color='#111111', fontsize=25)    
103 |     ax.text(805805, 3807845, "（城市大小代表招聘数量，颜色代表工资，红色最高，黄色次之，蓝最少）", fontweight='bold',color='#111111', fontsize=13)    
104 |     #cn_map.drawcoastlines() #绘制海岸线
105 |     #cn_map.drawcountries(linewidth=1.5) #绘制国家边界线
106 |     plt.show()
107 | 
def draw_province_map(data_city,headcount_scale, title,
                      locations_csv='../geo_data/provincial_capital_locations.csv',
                      shapefile_dir='D:/data/basemap'):
    """Draw per-province salary and hiring volume on a map of China.

    Every row of ``data_city`` becomes a round marker at the coordinates
    listed for that province in the location CSV, plus a text label
    ``<province><salary/1000>k``. Marker size and font size grow with
    ``sqrt(headcount / headcount_scale)``. The colour runs blue -> yellow ->
    red from the lowest to the highest ``salary_mean`` in ``data_city``.

    Args:
        data_city: DataFrame with ``province`` and ``salary_mean`` columns.
            The salary is read from the second column and the headcount from
            the sixth column, by position.
        headcount_scale: divisor applied to the headcount before the square
            root that sizes markers and labels.
        title: heading drawn on the map. If it contains ``str.format``
            placeholders they are filled with the last row's province and
            salary (in thousands), as before.
        locations_csv: CSV with ``province``, ``longitude`` and ``latitude``
            columns. Defaults to the previously hard-coded path.
        shapefile_dir: directory containing the ``gadm36_CHN_shp`` and
            ``gadm36_TWN_shp`` shapefile folders. Defaults to the previously
            hard-coded location.

    Returns:
        None. The figure is displayed with ``plt.show()``.

    Raises:
        KeyError: if a province in ``data_city`` is missing from the CSV.
    """
    data_location = pd.read_csv(locations_csv, encoding='utf-8').set_index('province')

    # Kept as a global rcParams change because later cells may rely on it.
    plt.rcParams['figure.figsize'] = [13, 13]
    fig, ax = plt.subplots()
    fig.dpi=80
    # 'lcc' selects the Lambert conformal projection.
    cn_map= Basemap(llcrnrlon=77, llcrnrlat=14, urcrnrlon=140, urcrnrlat=51,
               projection='lcc', lat_1=33, lat_2=45, lon_0=100)

    # Province outlines for mainland China and Taiwan.
    cn_map.readshapefile('{}/gadm36_CHN_shp/gadm36_CHN_1'.format(shapefile_dir), name='china', drawbounds=True, color='gray')
    cn_map.readshapefile('{}/gadm36_TWN_shp/gadm36_TWN_1'.format(shapefile_dir), name='taiwan', drawbounds=True, color='gray')

    salary_min=data_city.salary_mean.min()
    salary_max=data_city.salary_mean.max()
    salary_middle = (salary_min+salary_max)/2
    half_scale = (salary_max-salary_min)/2

    def salary_color(salary):
        """Map a salary to a '#rrggbb' colour on the blue-yellow-red ramp."""
        if half_scale == 0:
            # All salaries are equal: use the midpoint colour instead of
            # dividing by zero.
            return '#ffff00'
        if salary>salary_middle:
            red = 255
            green = int((salary_max - salary) / half_scale*255)
            blue = 0
        else:
            blue = int((salary_middle - salary) / half_scale*255)
            green = int((salary - salary_min) / half_scale*255)
            red = green
        return '#{:02x}{:02x}{:02x}'.format(red,green,blue)

    # Manual (dx, dy) label shifts, in map projection units, that keep
    # neighbouring labels from overlapping.
    label_offsets = {
        '浙江': (0, -100000),
        '安徽': (-300000, 10000),
        '江苏': (-150000, 0),
        '天津': (0, -50000),
        '上海': (50000, 0),
        '湖北': (0, -50000),
    }

    last_drawn = None
    for index, row in data_city.iterrows():
        province=row['province']
        longitude = data_location.loc[province,'longitude']
        latitude = data_location.loc[province,'latitude']
        # .iloc keeps these reads positional; a bare integer key on a
        # label-indexed Series is deprecated in recent pandas releases.
        salary=row.iloc[1]
        headcount=row.iloc[5]
        color = salary_color(salary)
        size = int(math.sqrt(headcount/headcount_scale))

        x, y = cn_map(longitude,latitude)
        cn_map.plot(x,y,marker='o',color=color,markersize=size+8)

        dx, dy = label_offsets.get(province, (0, 0))
        ax.text(x+dx, y+dy, "{}{:.0f}k".format(province, np.round(salary/1000)), fontweight='bold', fontsize=size+13, bbox={'facecolor':color, 'alpha':0.3, 'pad':0})
        last_drawn = (province, salary)

    # Watermarks.
    watermark = "https://github.com/juwikuang/china_job_survey"
    ax.text(2053805, 1077845, watermark, fontweight='bold',color='#999999', fontsize=20, bbox={'facecolor':'#eeeeee', 'alpha':0.4, 'pad':0})
    ax.text(205805, 107845, watermark, fontweight='bold',color='#999999', fontsize=20, bbox={'facecolor':'#eeeeee', 'alpha':0.4, 'pad':0})

    # With no rows there is nothing to substitute, so the title is drawn
    # verbatim instead of failing on undefined loop variables.
    if last_drawn is None:
        title_text = title
    else:
        title_text = title.format(last_drawn[0], np.round(last_drawn[1]/1000))
    ax.text(805805, 4007845, title_text, fontweight='bold',color='#111111', fontsize=25)
    ax.text(805805, 3807845, "（城市大小代表招聘数量，颜色代表工资，红色最高，黄色次之，蓝最少）", fontweight='bold',color='#111111', fontsize=13)
    plt.show()
191 |     
192 |     
def describe(data_city, career):
    """Print one summary sentence pair per row of ``data_city``.

    Columns are read by position, in this order: name (city/province),
    mean salary, lower bound of the 95% salary interval, median salary,
    upper bound of the 95% salary interval, headcount.

    Args:
        data_city: DataFrame whose first six columns follow the order above.
        career: job-category label inserted into the printed text.

    Returns:
        None. The text is written to stdout, one paragraph per row.

    Note:
        ``year`` and ``month`` are module-level names (star-imported from
        ``config``).
    """
    for index, row in data_city.iterrows():
        # .iloc keeps the access positional; a bare integer key on a
        # label-indexed Series is deprecated in recent pandas releases.
        name = row.iloc[0]
        salary_mean = row.iloc[1]
        salary_low = row.iloc[2]
        salary_median = row.iloc[3]
        salary_high = row.iloc[4]
        headcount = row.iloc[5]
        # Both sentences use the configured year; the second one used to be
        # hard-coded to 2019 and was wrong for every later report.
        print(f"{year}年{month}月{name}招收{career}{headcount}人。{year}年{month}月{name}{career}平均工资{salary_mean:.0f}元，工资中位数{salary_median:.0f}元，其中95%的人的工资介于{salary_low:.0f}元到{salary_high:.0f}元。\r\n")
197 | 


--------------------------------------------------------------------------------
/reports/202011/provinces_basemap.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": 1,
  6 |    "metadata": {},
  7 |    "outputs": [
  8 |     {
  9 |      "ename": "ModuleNotFoundError",
 10 |      "evalue": "No module named 'mpl_toolkits.basemap'",
 11 |      "output_type": "error",
 12 |      "traceback": [
 13 |       "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
 14 |       "\u001b[1;31mModuleNotFoundError\u001b[0m                       Traceback (most recent call last)",
 15 |       "\u001b[1;32m<ipython-input-1-4644dc56fc29>\u001b[0m in \u001b[0;36m<module>\u001b[1;34m\u001b[0m\n\u001b[0;32m      1\u001b[0m \u001b[1;32mfrom\u001b[0m \u001b[0mconfig\u001b[0m \u001b[1;32mimport\u001b[0m \u001b[1;33m*\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m----> 2\u001b[1;33m \u001b[1;32mfrom\u001b[0m \u001b[0mmap_wrapper\u001b[0m \u001b[1;32mimport\u001b[0m \u001b[1;33m*\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m",
 16 |       "\u001b[1;32mD:\\projects\\51job_survey\\51job_survey_py\\reports\\202010\\map_wrapper.py\u001b[0m in \u001b[0;36m<module>\u001b[1;34m\u001b[0m\n\u001b[0;32m      4\u001b[0m \u001b[1;32mfrom\u001b[0m \u001b[0mconfig\u001b[0m \u001b[1;32mimport\u001b[0m \u001b[1;33m*\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m      5\u001b[0m \u001b[1;32mimport\u001b[0m \u001b[0mmatplotlib\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mpyplot\u001b[0m \u001b[1;32mas\u001b[0m \u001b[0mplt\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m----> 6\u001b[1;33m \u001b[1;32mfrom\u001b[0m \u001b[0mmpl_toolkits\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mbasemap\u001b[0m \u001b[1;32mimport\u001b[0m \u001b[0mBasemap\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m      7\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m      8\u001b[0m \u001b[1;32mdef\u001b[0m \u001b[0mdraw_city_map\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mdata_city\u001b[0m\u001b[1;33m,\u001b[0m\u001b[0mheadcount_scale\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mtitle\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
 17 |       "\u001b[1;31mModuleNotFoundError\u001b[0m: No module named 'mpl_toolkits.basemap'"
 18 |      ]
 19 |     }
 20 |    ],
 21 |    "source": [
 22 |     "from config import *\n",
 23 |     "from map_wrapper import *"
 24 |    ]
 25 |   },
 26 |   {
 27 |    "cell_type": "code",
 28 |    "execution_count": null,
 29 |    "metadata": {},
 30 |    "outputs": [],
 31 |    "source": [
 32 |     "print(f'{year}年{month}月')"
 33 |    ]
 34 |   },
 35 |   {
 36 |    "cell_type": "code",
 37 |    "execution_count": null,
 38 |    "metadata": {},
 39 |    "outputs": [],
 40 |    "source": [
 41 |     "import pandas as pd\n",
 42 |     "import sys\n",
 43 |     "sys.path.append('../../py')\n",
 44 |     "import db\n",
 45 |     "import weighted\n",
 46 |     "import inspect\n",
 47 |     "import matplotlib.pyplot as plt\n",
 48 |     "plt.rcParams['font.sans-serif']=['SimHei'] #用来正常显示中文标签\n",
 49 |     "plt.rcParams['axes.unicode_minus']=False #用来正常显示负号\n",
 50 |     "%matplotlib inline\n",
 51 |     "from mpl_toolkits.basemap import Basemap\n",
 52 |     "import seaborn as sns\n",
 53 |     "import scipy.stats as stats\n",
 54 |     "import numpy as np\n",
 55 |     "import math\n",
 56 |     "from matplotlib.font_manager import _rebuild\n",
 57 |     "\n",
 58 |     "_rebuild() #reload一下"
 59 |    ]
 60 |   },
 61 |   {
 62 |    "cell_type": "code",
 63 |    "execution_count": null,
 64 |    "metadata": {},
 65 |    "outputs": [],
 66 |    "source": [
 67 |     "conn=db.get_conn()\n",
 68 |     "data_original=pd.read_sql(sql=f\"select * from _{year}{month:02} where monthly_salary>0 and monthly_salary<80000\", con=conn)\n",
 69 |     "\n",
 70 |     "data=data_original[~data_original.job_id.isin(error_job_ids)]\n",
 71 |     "\n",
 72 |     "del data['publish_date']\n",
 73 |     "del data['published_on_weekend']\n",
 74 |     "del data['title']\n",
 75 |     "#del data['company_title']\n",
 76 |     "#del data['company_description']\n",
 77 |     "del data['job_description']\n",
 78 |     "del data['job_id']\n",
 79 |     "\n",
 80 |     "\n"
 81 |    ]
 82 |   },
 83 |   {
 84 |    "cell_type": "code",
 85 |    "execution_count": null,
 86 |    "metadata": {},
 87 |    "outputs": [],
 88 |    "source": [
 89 |     "\n",
 90 |     "join_counts=[conn.execute(f\"select COUNT(1) from _{year}{month:02}\").fetchall()[0][0]]\n",
 91 |     "percents=[]\n",
 92 |     "for i in range(1,month-6+1):\n",
 93 |     "    sql=f\"select COUNT(1) from _{year}{month:02} a join _{year}{month-i:02} b on a.job_id = b.job_id\"\n",
 94 |     "    #print(sql)\n",
 95 |     "    count=conn.execute(sql).fetchall()[0][0]\n",
 96 |     "\n",
 97 |     "    join_counts.append(count)\n",
 98 |     "    subtract = join_counts[i-1]-join_counts[i]\n",
 99 |     "    percents.append(subtract*1.0/join_counts[i])\n",
100 |     "\n",
101 |     "percents.append(join_counts[-1]/join_counts[0])"
102 |    ]
103 |   },
104 |   {
105 |    "cell_type": "code",
106 |    "execution_count": null,
107 |    "metadata": {},
108 |    "outputs": [],
109 |    "source": [
110 |     "join_counts"
111 |    ]
112 |   },
113 |   {
114 |    "cell_type": "code",
115 |    "execution_count": null,
116 |    "metadata": {},
117 |    "outputs": [],
118 |    "source": [
119 |     "percents"
120 |    ]
121 |   },
122 |   {
123 |    "cell_type": "code",
124 |    "execution_count": null,
125 |    "metadata": {},
126 |    "outputs": [],
127 |    "source": [
128 |     "#plt.pie(percents, labels=['1','2','3','4','5','6','7','7+'])\n",
129 |     "#plt.show()"
130 |    ]
131 |   },
132 |   {
133 |    "cell_type": "code",
134 |    "execution_count": null,
135 |    "metadata": {},
136 |    "outputs": [],
137 |    "source": [
138 |     "data.shape[0]"
139 |    ]
140 |   },
141 |   {
142 |    "cell_type": "code",
143 |    "execution_count": null,
144 |    "metadata": {},
145 |    "outputs": [],
146 |    "source": [
147 |     "conn.close()"
148 |    ]
149 |   },
150 |   {
151 |    "cell_type": "code",
152 |    "execution_count": null,
153 |    "metadata": {},
154 |    "outputs": [],
155 |    "source": [
156 |     "#Common Functions\n",
157 |     "def get_sub_stats_by_col(data, col):\n",
158 |     "    categories=data[col].unique()\n",
159 |     "    salary_mean=[]\n",
160 |     "    salary_95_min=[]\n",
161 |     "    salary_95_max=[]\n",
162 |     "    salary_median=[]\n",
163 |     "\n",
164 |     "    count=[]\n",
165 |     "    \n",
166 |     "    categorys_out=[]\n",
167 |     "    for category in categories:\n",
168 |     "        #print(feature)\n",
169 |     "        idata=data[data[col]==category]\n",
170 |     "        headcount=idata.headcount.sum()\n",
171 |     "        values = idata.monthly_salary.values\n",
172 |     "        weights = idata.headcount.values\n",
173 |     "        #print(str(headcount))\n",
174 |     "        if headcount==0:\n",
175 |     "            continue\n",
176 |     "        \n",
177 |     "        salary_mean.append(np.average(values, weights=weights))\n",
178 |     "        \n",
179 |     "\n",
180 |     "        q = weighted.weighted_quantile(values,[0.025,0.5,0.975],weights)\n",
181 |     "        salary_95_min.append(q[0])\n",
182 |     "        salary_median.append(q[1])\n",
183 |     "        salary_95_max.append(q[2])\n",
184 |     "        count.append(idata.headcount.sum())\n",
185 |     "        categorys_out.append(category)\n",
186 |     "    sub_data=pd.DataFrame()\n",
187 |     "    sub_data[col]=[c for c in categorys_out]\n",
188 |     "    sub_data['salary_mean']=salary_mean\n",
189 |     "    sub_data['salary_95_min']=salary_95_min\n",
190 |     "    sub_data['salary_median']=salary_median\n",
191 |     "    sub_data['salary_95_max']=salary_95_max\n",
192 |     "    sub_data['head_count']=count\n",
193 |     "    sub_data['percentage']=count/np.sum(count)\n",
194 |     "    sub_data=sub_data.sort_values(by='salary_mean', ascending=False)\n",
195 |     "\n",
196 |     "    return sub_data\n",
197 |     "\n",
198 |     "data_format={\"percentage\":\"{:.2%}\",\"salary_mean\":\"{:.0f}\",\"salary_median\":\"{:.0f}\",\"salary_95_min\":\"{:.0f}\",\"salary_95_max\":\"{:.0f}\"}\n",
199 |     "\n"
200 |    ]
201 |   },
202 |   {
203 |    "cell_type": "code",
204 |    "execution_count": null,
205 |    "metadata": {},
206 |    "outputs": [],
207 |    "source": [
208 |     "data_career=get_sub_stats_by_col(data,'career')\n",
209 |     "data_career.style.format(data_format)"
210 |    ]
211 |   },
212 |   {
213 |    "cell_type": "markdown",
214 |    "metadata": {},
215 |    "source": [
216 |     "# 程序员工资"
217 |    ]
218 |   },
219 |   {
220 |    "cell_type": "code",
221 |    "execution_count": null,
222 |    "metadata": {},
223 |    "outputs": [],
224 |    "source": [
225 |     "data_city=get_sub_stats_by_col(data,'province')\n",
226 |     "#data_city.city=data_city.city.map(translate_dict)\n",
227 |     "data_city.style.hide_index().format(data_format)"
228 |    ]
229 |   },
230 |   {
231 |    "cell_type": "code",
232 |    "execution_count": null,
233 |    "metadata": {},
234 |    "outputs": [],
235 |    "source": []
236 |   },
237 |   {
238 |    "cell_type": "code",
239 |    "execution_count": null,
240 |    "metadata": {},
241 |    "outputs": [],
242 |    "source": [
243 |     "describe(data_city,'程序员')"
244 |    ]
245 |   },
246 |   {
247 |    "cell_type": "code",
248 |    "execution_count": null,
249 |    "metadata": {},
250 |    "outputs": [],
251 |    "source": []
252 |   },
253 |   {
254 |    "cell_type": "code",
255 |    "execution_count": null,
256 |    "metadata": {},
257 |    "outputs": [],
258 |    "source": [
259 |     "draw_province_map(data_city,2000,'2019年5月中国大陆各省程序员工资')"
260 |    ]
261 |   }
262 |  ],
263 |  "metadata": {
264 |   "kernelspec": {
265 |    "display_name": "Python 3",
266 |    "language": "python",
267 |    "name": "python3"
268 |   },
269 |   "language_info": {
270 |    "codemirror_mode": {
271 |     "name": "ipython",
272 |     "version": 3
273 |    },
274 |    "file_extension": ".py",
275 |    "mimetype": "text/x-python",
276 |    "name": "python",
277 |    "nbconvert_exporter": "python",
278 |    "pygments_lexer": "ipython3",
279 |    "version": "3.8.3"
280 |   }
281 |  },
282 |  "nbformat": 4,
283 |  "nbformat_minor": 4
284 | }
285 | 


--------------------------------------------------------------------------------
/reports/202012/config.py:
--------------------------------------------------------------------------------
1 | import os
2 | 
# The reporting period is taken from the name of the current working
# directory, whose first four characters are parsed as the year and the rest
# as the month (e.g. a directory named "202012").
pwd=os.getcwd()
# os.path.basename splits on the separator of the running platform, so this
# also works where paths use '/' instead of '\\'.
year_month=os.path.basename(pwd)

year=int(year_month[:4])
month=int(year_month[4:])
8 | 


--------------------------------------------------------------------------------
/reports/202012/map_wrapper.py:
--------------------------------------------------------------------------------
  1 | import math
  2 | import numpy as np
  3 | import pandas as pd
  4 | from config import *
  5 | import matplotlib.pyplot as plt
  6 | from mpl_toolkits.basemap import Basemap
  7 | 
  8 | def draw_city_map(data_city,headcount_scale, title):
  9 | 
 10 | 
 11 |     
 12 |     data_location = pd.read_csv('../city_locations.csv')
 13 |     data_location=data_location.set_index('city')    
 14 |     
 15 |     #cities = []
 16 |     scale = 5
 17 | 
 18 |     locations = [(116.407526, 39.90403),(120, 30)]
 19 |     #fig=plt.figure(figsize=(18, 16), dpi= 80, facecolor='w', edgecolor='k')
 20 |     plt.rcParams['figure.figsize'] = [13, 13]
 21 |     #plt.figure(figsize = (10,5))
 22 |     fig, ax = plt.subplots()
 23 |     fig.title=title
 24 |     fig.figsize=(10,5)
 25 |     fig.dpi=80
 26 |     cn_map= Basemap(llcrnrlon=77, llcrnrlat=14, urcrnrlon=140, urcrnrlat=51, \
 27 |                projection='lcc', lat_1=33, lat_2=45, lon_0=100) # ‘lcc'将投影方式设置为兰伯特投影
 28 | 
 29 |     # load the shapefile, use the name 'states'
 30 |     cn_map.readshapefile(r'D:/data/basemap/gadm36_CHN_shp/gadm36_CHN_1', name='china', drawbounds=True, color='gray')
 31 |     cn_map.readshapefile(r'D:/data/basemap/gadm36_TWN_shp/gadm36_TWN_1', name='taiwan', drawbounds=True, color='gray')
 32 |     #geolocator = Nominatim(user_agent="my-application")
 33 | 
 34 | 
 35 | 
 36 |     salary_min=data_city['平均工资'].min()
 37 |     salary_max=data_city['平均工资'].max()
 38 |     salary_middle = (salary_min+salary_max)/2
 39 |     salary_scale=salary_max-salary_min
 40 | 
 41 |     for index, row in data_city.iterrows():
 42 |         city=row['city']
 43 |         
 44 |         longitude = data_location.loc[city,'longitude']
 45 |         latitude = data_location.loc[city,'latitude']
 46 |         salary=row['平均工资']
 47 |         headcount=row['招聘人数']
 48 |         #color
 49 |         color_red=0
 50 |         color_green=0
 51 |         color_blue=0
 52 |         if salary>salary_middle:
 53 |             color_red = 255 #int((salary - salary_middle) / (salary_scale/2)*255)
 54 |             color_green = int((salary_max - salary) / (salary_scale/2)*255)
 55 |         else:
 56 |             color_blue = int((salary_middle - salary) / (salary_scale/2)*255)
 57 |             color_green = int((salary - salary_min) / (salary_scale/2)*255)
 58 |             color_red = int((salary - salary_min) / (salary_scale/2)*255)
 59 | 
 60 |         color = '#{:02x}{:02x}{:02x}'.format(color_red,color_green,color_blue)
 61 | 
 62 | 
 63 |         x, y = cn_map(longitude,latitude)
 64 |         cn_map.plot(x,y,marker='o',color=color,markersize=int(math.sqrt(headcount/headcount_scale))+8)
 65 |         #ax.annotate(city, (x,y), xytext=(5, 5), textcoords='offset points', fontsize=15)
 66 |         #"{}{:.0f}".format(city_cn, salary)
 67 |         #ax.text(x+5, y+5,city , fontweight='bold', fontsize=int(headcount/2500+12))
 68 |         fontsize=int(math.sqrt(headcount/headcount_scale))+13
 69 |         if city == '杭州':
 70 |             x=x-400000
 71 |             y=y+10000
 72 |         elif city=='广州':
 73 |             x=x-400000
 74 |             y=y+10000
 75 |         elif city=='合肥':
 76 |             x=x-300000
 77 |             y=y+10000
 78 |         elif city=='深圳':
 79 |             y=y-100000
 80 |         elif city=='南京':
 81 |             x=x-100000
 82 |         elif city=='天津':
 83 |             y=y-50000
 84 |         elif city=='上海':
 85 |             x=x+50000
 86 |         elif city=='武汉':
 87 |             y=y-50000
 88 |         elif city=='厦门':
 89 |             pass
 90 |         elif city=='福州':
 91 |             pass
 92 |         elif city=='苏州':
 93 |             y=y-100000
 94 |             pass
 95 |         elif city=='宁波':
 96 |             y=y-100000
 97 |             pass
 98 | 
 99 |         ax.text(x, y, "{}{:.0f}k".format(city, np.round(salary/1000)), fontweight='bold', fontsize=fontsize, bbox={'facecolor':color, 'alpha':0.3, 'pad':0})
100 |     ax.text(1100000, 1077845, "https://github.com/juwikuang/china_job_survey".format(city, np.round(salary/1000)), fontweight='bold',color='#999999', fontsize=20, bbox={'facecolor':'#eeeeee', 'alpha':0.4, 'pad':0})    
101 |     ax.text(205805, 107845, "https://github.com/juwikuang/china_job_survey".format(city, np.round(salary/1000)), fontweight='bold',color='#999999', fontsize=20, bbox={'facecolor':'#eeeeee', 'alpha':0.4, 'pad':0})    
102 |     ax.text(805805, 4007845, title.format(city, np.round(salary/1000)), fontweight='bold',color='#111111', fontsize=25)    
103 |     ax.text(805805, 3807845, "（城市大小代表招聘数量，颜色代表工资，红色最高，黄色次之，蓝最少）", fontweight='bold',color='#111111', fontsize=13)    
104 |     #cn_map.drawcoastlines() #绘制海岸线
105 |     #cn_map.drawcountries(linewidth=1.5) #绘制国家边界线
106 |     plt.show()
107 | 
108 | def draw_province_map(data_city,headcount_scale, title):
109 |     
110 |     data_location = pd.read_csv('../geo_data/provincial_capital_locations.csv', encoding='utf-8')
111 |     data_location=data_location.set_index('province')
112 | 
113 |     #cities = []
114 |     scale = 5
115 | 
116 |     locations = [(116.407526, 39.90403),(120, 30)]
117 |     #fig=plt.figure(figsize=(18, 16), dpi= 80, facecolor='w', edgecolor='k')
118 |     plt.rcParams['figure.figsize'] = [13, 13]
119 |     #plt.figure(figsize = (10,5))
120 |     fig, ax = plt.subplots()
121 |     fig.title=title
122 |     fig.figsize=(10,5)
123 |     fig.dpi=80
124 |     cn_map= Basemap(llcrnrlon=77, llcrnrlat=14, urcrnrlon=140, urcrnrlat=51, \
125 |                projection='lcc', lat_1=33, lat_2=45, lon_0=100) # ‘lcc'将投影方式设置为兰伯特投影
126 | 
127 |     # load the shapefile, use the name 'states'
128 |     cn_map.readshapefile(r'D:/data/basemap/gadm36_CHN_shp/gadm36_CHN_1', name='china', drawbounds=True, color='gray')
129 |     cn_map.readshapefile(r'D:/data/basemap/gadm36_TWN_shp/gadm36_TWN_1', name='taiwan', drawbounds=True, color='gray')
130 |     #geolocator = Nominatim(user_agent="my-application")
131 | 
132 | 
133 | 
134 |     salary_min=data_city.salary_mean.min()
135 |     salary_max=data_city.salary_mean.max()
136 |     salary_middle = (salary_min+salary_max)/2
137 |     salary_scale=salary_max-salary_min
138 | 
139 |     for index, row in data_city.iterrows():
140 |         province=row['province']
141 |         
142 |         longitude = data_location.loc[province,'longitude']
143 |         latitude = data_location.loc[province,'latitude']
144 |         salary=row[1]
145 |         headcount=row[5]
146 |         #color
147 |         color_red=0
148 |         color_green=0
149 |         color_blue=0
150 |         if salary>salary_middle:
151 |             color_red = 255 #int((salary - salary_middle) / (salary_scale/2)*255)
152 |             color_green = int((salary_max - salary) / (salary_scale/2)*255)
153 |         else:
154 |             color_blue = int((salary_middle - salary) / (salary_scale/2)*255)
155 |             color_green = int((salary - salary_min) / (salary_scale/2)*255)
156 |             color_red = int((salary - salary_min) / (salary_scale/2)*255)
157 | 
158 |         color = '#{:02x}{:02x}{:02x}'.format(color_red,color_green,color_blue)
159 | 
160 | 
161 |         x, y = cn_map(longitude,latitude)
162 |         cn_map.plot(x,y,marker='o',color=color,markersize=int(math.sqrt(headcount/headcount_scale))+8)
163 |         #ax.annotate(city, (x,y), xytext=(5, 5), textcoords='offset points', fontsize=15)
164 |         #"{}{:.0f}".format(city_cn, salary)
165 |         #ax.text(x+5, y+5,city , fontweight='bold', fontsize=int(headcount/2500+12))
166 |         fontsize=int(math.sqrt(headcount/headcount_scale))+13
167 |         if province == '浙江':
168 |             #x=x-400000
169 |             y=y-100000
170 | 
171 |         elif province=='安徽':
172 |             x=x-300000
173 |             y=y+10000
174 |         elif province=='江苏':
175 |             x=x-150000
176 |         elif province=='天津':
177 |             y=y-50000
178 |         elif province=='上海':
179 |             x=x+50000
180 |         elif province=='湖北':
181 |             y=y-50000
182 | 
183 |         ax.text(x, y, "{}{:.0f}k".format(province, np.round(salary/1000)), fontweight='bold', fontsize=fontsize, bbox={'facecolor':color, 'alpha':0.3, 'pad':0})
184 |     ax.text(2053805, 1077845, "https://github.com/juwikuang/china_job_survey".format(province, np.round(salary/1000)), fontweight='bold',color='#999999', fontsize=20, bbox={'facecolor':'#eeeeee', 'alpha':0.4, 'pad':0})    
185 |     ax.text(205805, 107845, "https://github.com/juwikuang/china_job_survey".format(province, np.round(salary/1000)), fontweight='bold',color='#999999', fontsize=20, bbox={'facecolor':'#eeeeee', 'alpha':0.4, 'pad':0})    
186 |     ax.text(805805, 4007845, title.format(province, np.round(salary/1000)), fontweight='bold',color='#111111', fontsize=25)    
187 |     ax.text(805805, 3807845, "（城市大小代表招聘数量，颜色代表工资，红色最高，黄色次之，蓝最少）", fontweight='bold',color='#111111', fontsize=13)    
188 |     #cn_map.drawcoastlines() #绘制海岸线
189 |     #cn_map.drawcountries(linewidth=1.5) #绘制国家边界线
190 |     plt.show()
191 |     
192 |     
def describe(data_city, career, report_year=None, report_month=None):
    """Print one summary sentence (in Chinese) per row of ``data_city``.

    Columns are read by position: 0 = place name, 1 = mean salary,
    2 and 4 = the bounds quoted as the 95% salary range, 3 = median salary,
    5 = head count.

    Args:
        data_city: DataFrame with at least six columns laid out as above.
        career: job name inserted into each sentence.
        report_year: year to print; defaults to the module-level ``year``.
        report_month: month to print; defaults to the module-level ``month``.
    """
    shown_year = year if report_year is None else report_year
    shown_month = month if report_month is None else report_month

    for _, row in data_city.iterrows():
        place = row.iloc[0]
        # The second sentence used to hard-code "2019年" regardless of the
        # report period; it now uses the same year as the first sentence.
        print(f"{shown_year}年{shown_month}月{place}招收{career}{row.iloc[5]}人。{shown_year}年{shown_month}月{place}{career}平均工资{row.iloc[1]:.0f}元，工资中位数{row.iloc[3]:.0f}元，其中95%的人的工资介于{row.iloc[2]:.0f}元到{row.iloc[4]:.0f}元。\r\n")
197 | 


--------------------------------------------------------------------------------
/reports/202101/config.py:
--------------------------------------------------------------------------------
import os

# The reporting period is taken from the name of the current working
# directory, whose last component is expected to look like YYYYMM
# (for example "202101"). A directory name that does not start with four
# digits followed by a numeric month makes int() raise ValueError.
pwd = os.getcwd()

# os.path.basename splits on the platform's own separator (and also on "/"
# under Windows). Splitting on "\\" by hand only worked on Windows: on POSIX
# systems the whole path came back as a single element and int() failed.
year_month = os.path.basename(os.path.normpath(pwd))

year = int(year_month[:4])
month = int(year_month[4:])
8 | 


--------------------------------------------------------------------------------
/reports/202101/map_wrapper.py:
--------------------------------------------------------------------------------
  1 | import math
  2 | import numpy as np
  3 | import pandas as pd
  4 | from config import *
  5 | import matplotlib.pyplot as plt
  6 | from mpl_toolkits.basemap import Basemap
  7 | 
  8 | def draw_city_map(data_city,headcount_scale, title):
  9 | 
 10 | 
 11 |     
 12 |     data_location = pd.read_csv('../city_locations.csv')
 13 |     data_location=data_location.set_index('city')    
 14 |     
 15 |     #cities = []
 16 |     scale = 5
 17 | 
 18 |     locations = [(116.407526, 39.90403),(120, 30)]
 19 |     #fig=plt.figure(figsize=(18, 16), dpi= 80, facecolor='w', edgecolor='k')
 20 |     plt.rcParams['figure.figsize'] = [13, 13]
 21 |     #plt.figure(figsize = (10,5))
 22 |     fig, ax = plt.subplots()
 23 |     fig.title=title
 24 |     fig.figsize=(10,5)
 25 |     fig.dpi=80
 26 |     cn_map= Basemap(llcrnrlon=77, llcrnrlat=14, urcrnrlon=140, urcrnrlat=51, \
 27 |                projection='lcc', lat_1=33, lat_2=45, lon_0=100) # ‘lcc'将投影方式设置为兰伯特投影
 28 | 
 29 |     # load the shapefile, use the name 'states'
 30 |     cn_map.readshapefile(r'D:/data/basemap/gadm36_CHN_shp/gadm36_CHN_1', name='china', drawbounds=True, color='gray')
 31 |     cn_map.readshapefile(r'D:/data/basemap/gadm36_TWN_shp/gadm36_TWN_1', name='taiwan', drawbounds=True, color='gray')
 32 |     #geolocator = Nominatim(user_agent="my-application")
 33 | 
 34 | 
 35 | 
 36 |     salary_min=data_city['平均工资'].min()
 37 |     salary_max=data_city['平均工资'].max()
 38 |     salary_middle = (salary_min+salary_max)/2
 39 |     salary_scale=salary_max-salary_min
 40 | 
 41 |     for index, row in data_city.iterrows():
 42 |         city=row['city']
 43 |         
 44 |         longitude = data_location.loc[city,'longitude']
 45 |         latitude = data_location.loc[city,'latitude']
 46 |         salary=row['平均工资']
 47 |         headcount=row['招聘人数']
 48 |         #color
 49 |         color_red=0
 50 |         color_green=0
 51 |         color_blue=0
 52 |         if salary>salary_middle:
 53 |             color_red = 255 #int((salary - salary_middle) / (salary_scale/2)*255)
 54 |             color_green = int((salary_max - salary) / (salary_scale/2)*255)
 55 |         else:
 56 |             color_blue = int((salary_middle - salary) / (salary_scale/2)*255)
 57 |             color_green = int((salary - salary_min) / (salary_scale/2)*255)
 58 |             color_red = int((salary - salary_min) / (salary_scale/2)*255)
 59 | 
 60 |         color = '#{:02x}{:02x}{:02x}'.format(color_red,color_green,color_blue)
 61 | 
 62 | 
 63 |         x, y = cn_map(longitude,latitude)
 64 |         cn_map.plot(x,y,marker='o',color=color,markersize=int(math.sqrt(headcount/headcount_scale))+8)
 65 |         #ax.annotate(city, (x,y), xytext=(5, 5), textcoords='offset points', fontsize=15)
 66 |         #"{}{:.0f}".format(city_cn, salary)
 67 |         #ax.text(x+5, y+5,city , fontweight='bold', fontsize=int(headcount/2500+12))
 68 |         fontsize=int(math.sqrt(headcount/headcount_scale))+13
 69 |         if city == '杭州':
 70 |             x=x-400000
 71 |             y=y+10000
 72 |         elif city=='广州':
 73 |             x=x-400000
 74 |             y=y+10000
 75 |         elif city=='合肥':
 76 |             x=x-300000
 77 |             y=y+10000
 78 |         elif city=='深圳':
 79 |             y=y-100000
 80 |         elif city=='南京':
 81 |             x=x-100000
 82 |         elif city=='天津':
 83 |             y=y-50000
 84 |         elif city=='上海':
 85 |             x=x+50000
 86 |         elif city=='武汉':
 87 |             y=y-50000
 88 |         elif city=='厦门':
 89 |             pass
 90 |         elif city=='福州':
 91 |             pass
 92 |         elif city=='苏州':
 93 |             y=y-100000
 94 |             pass
 95 |         elif city=='宁波':
 96 |             y=y-100000
 97 |             pass
 98 | 
 99 |         ax.text(x, y, "{}{:.0f}k".format(city, np.round(salary/1000)), fontweight='bold', fontsize=fontsize, bbox={'facecolor':color, 'alpha':0.3, 'pad':0})
100 |     ax.text(1100000, 1077845, "https://github.com/juwikuang/china_job_survey".format(city, np.round(salary/1000)), fontweight='bold',color='#999999', fontsize=20, bbox={'facecolor':'#eeeeee', 'alpha':0.4, 'pad':0})    
101 |     ax.text(205805, 107845, "https://github.com/juwikuang/china_job_survey".format(city, np.round(salary/1000)), fontweight='bold',color='#999999', fontsize=20, bbox={'facecolor':'#eeeeee', 'alpha':0.4, 'pad':0})    
102 |     ax.text(805805, 4007845, title.format(city, np.round(salary/1000)), fontweight='bold',color='#111111', fontsize=25)    
103 |     ax.text(805805, 3807845, "（城市大小代表招聘数量，颜色代表工资，红色最高，黄色次之，蓝最少）", fontweight='bold',color='#111111', fontsize=13)    
104 |     #cn_map.drawcoastlines() #绘制海岸线
105 |     #cn_map.drawcountries(linewidth=1.5) #绘制国家边界线
106 |     plt.show()
107 | 
108 | def draw_province_map(data_city,headcount_scale, title):
109 |     
110 |     data_location = pd.read_csv('../geo_data/provincial_capital_locations.csv', encoding='utf-8')
111 |     data_location=data_location.set_index('province')
112 | 
113 |     #cities = []
114 |     scale = 5
115 | 
116 |     locations = [(116.407526, 39.90403),(120, 30)]
117 |     #fig=plt.figure(figsize=(18, 16), dpi= 80, facecolor='w', edgecolor='k')
118 |     plt.rcParams['figure.figsize'] = [13, 13]
119 |     #plt.figure(figsize = (10,5))
120 |     fig, ax = plt.subplots()
121 |     fig.title=title
122 |     fig.figsize=(10,5)
123 |     fig.dpi=80
124 |     cn_map= Basemap(llcrnrlon=77, llcrnrlat=14, urcrnrlon=140, urcrnrlat=51, \
125 |                projection='lcc', lat_1=33, lat_2=45, lon_0=100) # ‘lcc'将投影方式设置为兰伯特投影
126 | 
127 |     # load the shapefile, use the name 'states'
128 |     cn_map.readshapefile(r'D:/data/basemap/gadm36_CHN_shp/gadm36_CHN_1', name='china', drawbounds=True, color='gray')
129 |     cn_map.readshapefile(r'D:/data/basemap/gadm36_TWN_shp/gadm36_TWN_1', name='taiwan', drawbounds=True, color='gray')
130 |     #geolocator = Nominatim(user_agent="my-application")
131 | 
132 | 
133 | 
134 |     salary_min=data_city.salary_mean.min()
135 |     salary_max=data_city.salary_mean.max()
136 |     salary_middle = (salary_min+salary_max)/2
137 |     salary_scale=salary_max-salary_min
138 | 
139 |     for index, row in data_city.iterrows():
140 |         province=row['province']
141 |         
142 |         longitude = data_location.loc[province,'longitude']
143 |         latitude = data_location.loc[province,'latitude']
144 |         salary=row[1]
145 |         headcount=row[5]
146 |         #color
147 |         color_red=0
148 |         color_green=0
149 |         color_blue=0
150 |         if salary>salary_middle:
151 |             color_red = 255 #int((salary - salary_middle) / (salary_scale/2)*255)
152 |             color_green = int((salary_max - salary) / (salary_scale/2)*255)
153 |         else:
154 |             color_blue = int((salary_middle - salary) / (salary_scale/2)*255)
155 |             color_green = int((salary - salary_min) / (salary_scale/2)*255)
156 |             color_red = int((salary - salary_min) / (salary_scale/2)*255)
157 | 
158 |         color = '#{:02x}{:02x}{:02x}'.format(color_red,color_green,color_blue)
159 | 
160 | 
161 |         x, y = cn_map(longitude,latitude)
162 |         cn_map.plot(x,y,marker='o',color=color,markersize=int(math.sqrt(headcount/headcount_scale))+8)
163 |         #ax.annotate(city, (x,y), xytext=(5, 5), textcoords='offset points', fontsize=15)
164 |         #"{}{:.0f}".format(city_cn, salary)
165 |         #ax.text(x+5, y+5,city , fontweight='bold', fontsize=int(headcount/2500+12))
166 |         fontsize=int(math.sqrt(headcount/headcount_scale))+13
167 |         if province == '浙江':
168 |             #x=x-400000
169 |             y=y-100000
170 | 
171 |         elif province=='安徽':
172 |             x=x-300000
173 |             y=y+10000
174 |         elif province=='江苏':
175 |             x=x-150000
176 |         elif province=='天津':
177 |             y=y-50000
178 |         elif province=='上海':
179 |             x=x+50000
180 |         elif province=='湖北':
181 |             y=y-50000
182 | 
183 |         ax.text(x, y, "{}{:.0f}k".format(province, np.round(salary/1000)), fontweight='bold', fontsize=fontsize, bbox={'facecolor':color, 'alpha':0.3, 'pad':0})
184 |     ax.text(2053805, 1077845, "https://github.com/juwikuang/china_job_survey".format(province, np.round(salary/1000)), fontweight='bold',color='#999999', fontsize=20, bbox={'facecolor':'#eeeeee', 'alpha':0.4, 'pad':0})    
185 |     ax.text(205805, 107845, "https://github.com/juwikuang/china_job_survey".format(province, np.round(salary/1000)), fontweight='bold',color='#999999', fontsize=20, bbox={'facecolor':'#eeeeee', 'alpha':0.4, 'pad':0})    
186 |     ax.text(805805, 4007845, title.format(province, np.round(salary/1000)), fontweight='bold',color='#111111', fontsize=25)    
187 |     ax.text(805805, 3807845, "（城市大小代表招聘数量，颜色代表工资，红色最高，黄色次之，蓝最少）", fontweight='bold',color='#111111', fontsize=13)    
188 |     #cn_map.drawcoastlines() #绘制海岸线
189 |     #cn_map.drawcountries(linewidth=1.5) #绘制国家边界线
190 |     plt.show()
191 |     
192 |     
def describe(data_city, career, report_year=None, report_month=None):
    """Print one summary sentence (in Chinese) per row of ``data_city``.

    Columns are read by position: 0 = place name, 1 = mean salary,
    2 and 4 = the bounds quoted as the 95% salary range, 3 = median salary,
    5 = head count.

    Args:
        data_city: DataFrame with at least six columns laid out as above.
        career: job name inserted into each sentence.
        report_year: year to print; defaults to the module-level ``year``.
        report_month: month to print; defaults to the module-level ``month``.
    """
    shown_year = year if report_year is None else report_year
    shown_month = month if report_month is None else report_month

    for _, row in data_city.iterrows():
        place = row.iloc[0]
        # The second sentence used to hard-code "2019年" regardless of the
        # report period; it now uses the same year as the first sentence.
        print(f"{shown_year}年{shown_month}月{place}招收{career}{row.iloc[5]}人。{shown_year}年{shown_month}月{place}{career}平均工资{row.iloc[1]:.0f}元，工资中位数{row.iloc[3]:.0f}元，其中95%的人的工资介于{row.iloc[2]:.0f}元到{row.iloc[4]:.0f}元。\r\n")
197 | 


--------------------------------------------------------------------------------
/reports/city_locations.csv:
--------------------------------------------------------------------------------
 1 | ﻿city,longitude,latitude
 2 | 北京,116.407526,39.90403
 3 | 上海,121.473701,31.230416
 4 | 深圳,114.07,22.62
 5 | 杭州,120.19,30.26
 6 | 广州,113.23,23.16
 7 | 南京,118.78,32.04
 8 | 成都,104.06,30.67
 9 | 东莞,113.75,23.04
10 | 西安,108.95,34.27
11 | 武汉,114.31,30.52
12 | 天津,117.200983,39.084158
13 | 长沙,113,28.21
14 | 宁波,121.56,29.86
15 | 福州,119.3,26.08
16 | 大连,121.62,38.92
17 | 重庆,106.551556,29.563009
18 | 青岛,120.33,36.07
19 | 济南,117,36.65
20 | 合肥,117.27,31.86
21 | 长春,125.35,43.88
22 | 昆明,102.73,25.04
23 | 郑州,113.65,34.76
24 | 沈阳,123.38,41.8
25 | 哈尔滨,126.63,45.75
26 | 厦门,118.06,24.44
27 | 苏州,120.62,31.32


--------------------------------------------------------------------------------
/reports/geo_data/province_city.csv:
--------------------------------------------------------------------------------
  1 | ﻿
  2 | 
  3 | 
  4 | 
  5 | 
  6 | 
  7 | 
  8 | 
  9 | 
 10 | 
 11 | 
 12 | 
 13 | 
 14 | 
 15 | province,city
 16 | 北京,北京
 17 | 上海,上海
 18 | 天津,天津
 19 | 重庆,重庆
 20 | 深圳,深圳
 21 | 河北,保定
 22 | 河北,沧州
 23 | 河北,承德
 24 | 河北,邯郸
 25 | 河北,衡水
 26 | 河北,廊坊
 27 | 河北,秦皇岛
 28 | 河北,石家庄
 29 | 河北,唐山
 30 | 河北,邢台
 31 | 河北,张家口
 32 | 山西,长治
 33 | 山西,大同
 34 | 山西,晋城
 35 | 山西,晋中
 36 | 山西,临汾
 37 | 山西,吕梁
 38 | 山西,朔州
 39 | 山西,太原
 40 | 山西,忻州
 41 | 山西,阳泉
 42 | 山西,运城
 43 | 内蒙古,阿拉善盟
 44 | 内蒙古,巴彦淖尔
 45 | 内蒙古,包头
 46 | 内蒙古,赤峰
 47 | 内蒙古,鄂尔多斯
 48 | 内蒙古,呼和浩特
 49 | 内蒙古,呼伦贝尔
 50 | 内蒙古,通辽
 51 | 内蒙古,乌海
 52 | 内蒙古,乌兰察布
 53 | 内蒙古,锡林郭勒盟
 54 | 内蒙古,兴安盟
 55 | 辽宁,鞍山
 56 | 辽宁,本溪
 57 | 辽宁,朝阳
 58 | 辽宁,大连
 59 | 辽宁,丹东
 60 | 辽宁,抚顺
 61 | 辽宁,阜新
 62 | 辽宁,葫芦岛
 63 | 辽宁,锦州
 64 | 辽宁,辽阳
 65 | 辽宁,盘锦
 66 | 辽宁,沈阳
 67 | 辽宁,铁岭
 68 | 辽宁,营口
 69 | 吉林,白城
 70 | 吉林,白山
 71 | 吉林,长春
 72 | 吉林,吉林
 73 | 吉林,辽源
 74 | 吉林,四平
 75 | 吉林,松原
 76 | 吉林,通化
 77 | 吉林,延边
 78 | 黑龙江,大庆
 79 | 黑龙江,大兴安岭
 80 | 黑龙江,哈尔滨
 81 | 黑龙江,鹤岗
 82 | 黑龙江,黑河
 83 | 黑龙江,鸡西
 84 | 黑龙江,佳木斯
 85 | 黑龙江,牡丹江
 86 | 黑龙江,七台河
 87 | 黑龙江,齐齐哈尔
 88 | 黑龙江,双鸭山
 89 | 黑龙江,绥化
 90 | 黑龙江,伊春
 91 | 江苏,常州
 92 | 江苏,淮安
 93 | 江苏,连云港
 94 | 江苏,南京
 95 | 江苏,南通
 96 | 江苏,苏州
 97 | 江苏,宿迁
 98 | 江苏,泰州
 99 | 江苏,无锡
100 | 江苏,徐州
101 | 江苏,盐城
102 | 江苏,扬州
103 | 江苏,镇江
104 | 浙江,杭州
105 | 浙江,湖州
106 | 浙江,嘉兴
107 | 浙江,金华
108 | 浙江,丽水
109 | 浙江,宁波
110 | 浙江,衢州
111 | 浙江,绍兴
112 | 浙江,台州
113 | 浙江,温州
114 | 浙江,舟山
115 | 安徽,安庆
116 | 安徽,蚌埠
117 | 安徽,亳州
118 | 安徽,巢湖
119 | 安徽,池州
120 | 安徽,滁州
121 | 安徽,阜阳
122 | 安徽,合肥
123 | 安徽,淮北
124 | 安徽,淮南
125 | 安徽,黄山
126 | 安徽,六安
127 | 安徽,马鞍山
128 | 安徽,宿州
129 | 安徽,铜陵
130 | 安徽,芜湖
131 | 安徽,宣城
132 | 福建,福州
133 | 福建,龙岩
134 | 福建,南平
135 | 福建,宁德
136 | 福建,莆田
137 | 福建,泉州
138 | 福建,三明
139 | 福建,厦门
140 | 福建,漳州
141 | 江西,抚州
142 | 江西,赣州
143 | 江西,吉安
144 | 江西,景德镇
145 | 江西,九江
146 | 江西,南昌
147 | 江西,萍乡
148 | 江西,上饶
149 | 江西,新余
150 | 江西,宜春
151 | 江西,鹰潭
152 | 山东,滨州
153 | 山东,德州
154 | 山东,东营
155 | 山东,菏泽
156 | 山东,济南
157 | 山东,济宁
158 | 山东,莱芜
159 | 山东,聊城
160 | 山东,临沂
161 | 山东,青岛
162 | 山东,日照
163 | 山东,泰安
164 | 山东,威海
165 | 山东,潍坊
166 | 山东,烟台
167 | 山东,枣庄
168 | 山东,淄博
169 | 河南,安阳
170 | 河南,鹤壁
171 | 河南,焦作
172 | 河南,开封
173 | 河南,洛阳
174 | 河南,漯河
175 | 河南,南阳
176 | 河南,平顶山
177 | 河南,濮阳
178 | 河南,三门峡
179 | 河南,商丘
180 | 河南,新乡
181 | 河南,信阳
182 | 河南,许昌
183 | 河南,郑州
184 | 河南,周口
185 | 河南,驻马店
186 | 湖北,鄂州
187 | 湖北,恩施
188 | 湖北,黄冈
189 | 湖北,黄石
190 | 湖北,荆门
191 | 湖北,荆州
192 | 湖北,十堰
193 | 湖北,随州
194 | 湖北,武汉
195 | 湖北,咸宁
196 | 湖北,襄樊
197 | 湖北,孝感
198 | 湖北,宜昌
199 | 湖南,长沙
200 | 湖南,常德
201 | 湖南,郴州
202 | 湖南,衡阳
203 | 湖南,怀化
204 | 湖南,娄底
205 | 湖南,邵阳
206 | 湖南,湘潭
207 | 湖南,湘西
208 | 湖南,益阳
209 | 湖南,永州
210 | 湖南,岳阳
211 | 湖南,张家界
212 | 湖南,株洲
213 | 广东,潮州
214 | 广东,东莞
215 | 广东,佛山
216 | 广东,广州
217 | 广东,河源
218 | 广东,惠州
219 | 广东,江门
220 | 广东,揭阳
221 | 广东,茂名
222 | 广东,梅州
223 | 广东,清远
224 | 广东,汕头
225 | 广东,汕尾
226 | 广东,韶关
227 | 广东,深圳
228 | 广东,阳江
229 | 广东,云浮
230 | 广东,湛江
231 | 广东,肇庆
232 | 广东,中山
233 | 广东,珠海
234 | 广西,百色
235 | 广西,北海
236 | 广西,崇左
237 | 广西,防城港
238 | 广西,贵港
239 | 广西,桂林
240 | 广西,河池
241 | 广西,贺州
242 | 广西,来宾
243 | 广西,柳州
244 | 广西,南宁
245 | 广西,钦州
246 | 广西,梧州
247 | 广西,玉林
248 | 海南,海口
249 | 海南,三亚
250 | 海南,直辖县级行政区划
251 | 四川,阿坝
252 | 四川,巴中
253 | 四川,成都
254 | 四川,达州
255 | 四川,德阳
256 | 四川,甘孜
257 | 四川,广安
258 | 四川,广元
259 | 四川,乐山
260 | 四川,凉山
261 | 四川,泸州
262 | 四川,眉山
263 | 四川,绵阳
264 | 四川,内江
265 | 四川,南充
266 | 四川,攀枝花
267 | 四川,遂宁
268 | 四川,雅安
269 | 四川,宜宾
270 | 四川,资阳
271 | 四川,自贡
272 | 贵州,安顺
273 | 贵州,毕节
274 | 贵州,贵阳
275 | 贵州,六盘水
276 | 贵州,黔东南
277 | 贵州,黔南
278 | 贵州,黔西南
279 | 贵州,铜仁
280 | 贵州,遵义
281 | 云南,保山
282 | 云南,楚雄
283 | 云南,大理
284 | 云南,德宏
285 | 云南,迪庆
286 | 云南,红河
287 | 云南,昆明
288 | 云南,丽江
289 | 云南,临沧
290 | 云南,怒江
291 | 云南,普洱
292 | 云南,曲靖
293 | 云南,文山
294 | 云南,西双版纳
295 | 云南,玉溪
296 | 云南,昭通
297 | 西藏,阿里
298 | 西藏,昌都
299 | 西藏,拉萨
300 | 西藏,林芝
301 | 西藏,那曲
302 | 西藏,日喀则
303 | 西藏,山南
304 | 陕西,安康
305 | 陕西,宝鸡
306 | 陕西,汉中
307 | 陕西,商洛
308 | 陕西,铜川
309 | 陕西,渭南
310 | 陕西,西安
311 | 陕西,咸阳
312 | 陕西,延安
313 | 陕西,榆林
314 | 甘肃,白银
315 | 甘肃,定西
316 | 甘肃,甘南
317 | 甘肃,嘉峪关
318 | 甘肃,金昌
319 | 甘肃,酒泉
320 | 甘肃,兰州
321 | 甘肃,临夏
322 | 甘肃,陇南
323 | 甘肃,平凉
324 | 甘肃,庆阳
325 | 甘肃,天水
326 | 甘肃,武威
327 | 甘肃,张掖
328 | 青海,果洛
329 | 青海,海北
330 | 青海,海东
331 | 青海,海南
332 | 青海,海西
333 | 青海,黄南
334 | 青海,西宁
335 | 青海,玉树
336 | 宁夏,固原
337 | 宁夏,石嘴山
338 | 宁夏,吴忠
339 | 宁夏,银川
340 | 宁夏,中卫
341 | 新疆,阿克苏
342 | 新疆,阿勒泰
343 | 新疆,巴音郭楞
344 | 新疆,博尔塔拉
345 | 新疆,昌吉
346 | 新疆,哈密
347 | 新疆,和田
348 | 新疆,喀什
349 | 新疆,克拉玛依
350 | 新疆,克孜勒苏柯尔克孜
351 | 新疆,塔城
352 | 新疆,吐鲁番
353 | 新疆,乌鲁木齐
354 | 新疆,伊犁哈萨克
355 | 新疆,直辖县级行政区划
356 | 


--------------------------------------------------------------------------------
/reports/geo_data/provincial_capital_locations.csv:
--------------------------------------------------------------------------------
 1 | province,capital,longitude,latitude
 2 | 辽宁,沈阳市,123.429092,41.796768
 3 | 吉林,长春市,125.324501,43.886841
 4 | 黑龙江,哈尔滨市,126.642464,45.756966
 5 | 北京,北京市,116.405289,39.904987
 6 | 天津,天津市,117.190186,39.125595
 7 | 内蒙古,呼和浩特市,111.75199,40.84149
 8 | 宁夏,银川市,106.23248,38.48644
 9 | 山西,太原市,112.549248,37.857014
10 | 河北,石家庄市,114.502464,38.045475
11 | 山东,济南市,117.000923,36.675808
12 | 河南,郑州市,113.665413,34.757977
13 | 陕西,西安市,108.948021,34.263161
14 | 湖北,武汉市,114.298569,30.584354
15 | 江苏,南京市,118.76741,32.041546
16 | 安徽,合肥市,117.283043,31.861191
17 | 上海,上海市,121.472641,31.231707
18 | 湖南,长沙市,112.982277,28.19409
19 | 江西,南昌市,115.892151,28.676493
20 | 浙江,杭州市,120.15358,30.287458
21 | 福建,福州市,119.306236,26.075302
22 | 广东,广州市,113.28064,23.125177
23 | 台湾,台北市,121.520076,25.030724
24 | 海南,海口市,110.19989,20.04422
25 | 广西,南宁市,108.320007,22.82402
26 | 重庆,重庆市,106.504959,29.533155
27 | 云南,昆明市,102.71225,25.040609
28 | 贵州,贵阳市,106.713478,26.578342
29 | 四川,成都市,104.065735,30.659462
30 | 甘肃,兰州市,103.83417,36.06138
31 | 青海,西宁市,101.77782,36.61729
32 | 西藏,拉萨市,91.1145,29.64415
33 | 新疆,乌鲁木齐市,87.61688,43.82663
34 | 香港,香港,114.16546,22.27534
35 | 澳门,澳门,113.54913,22.19875
36 | 


--------------------------------------------------------------------------------
/sql/create_city_stats.sql:
--------------------------------------------------------------------------------
 1 | USE [it_jobs]
 2 | GO
 3 | 
 4 | /****** Object:  Table [dbo].[city_stats]    Script Date: 8/2/2020 5:13:39 PM ******/
 5 | SET ANSI_NULLS ON
 6 | GO
 7 | 
 8 | SET QUOTED_IDENTIFIER ON
 9 | GO
10 | 
11 | CREATE TABLE [dbo].[city_stats](
12 | 	[yearmonth] [int] NOT NULL,
13 | 	[city] [nvarchar](50) NOT NULL,
14 | 	[salary] [int] NOT NULL,
15 |  CONSTRAINT [PK_city_stats] PRIMARY KEY CLUSTERED 
16 | (
17 | 	[yearmonth] ASC,
18 | 	[city] ASC
19 | )WITH (PAD_INDEX = OFF, STATISTICS_NORECOMPUTE = OFF, IGNORE_DUP_KEY = OFF, ALLOW_ROW_LOCKS = ON, ALLOW_PAGE_LOCKS = ON, OPTIMIZE_FOR_SEQUENTIAL_KEY = OFF) ON [PRIMARY]
20 | ) ON [PRIMARY]
21 | GO
22 | 
23 | 
24 | 


--------------------------------------------------------------------------------
/sql/create_company.sql:
--------------------------------------------------------------------------------
 1 | USE [it_jobs]
 2 | GO
 3 | 
 4 | /****** Object:  Table [dbo].[companies]    Script Date: 6/28/2020 9:24:57 PM ******/
 5 | SET ANSI_NULLS ON
 6 | GO
 7 | 
 8 | SET QUOTED_IDENTIFIER ON
 9 | GO
10 | 
11 | CREATE TABLE [dbo].[companies](
12 | 	[company_id] [nvarchar](100) NOT NULL,
13 | 	[company_size] [nvarchar](100) NOT NULL,
14 | 	[company_name] [nvarchar](100) NOT NULL,
15 | 	[company_type] [nvarchar](100) NOT NULL,
16 | 	[company_description] [varchar](max) NOT NULL,
17 | 	[company_industry] [nvarchar](100) NOT NULL,
18 |  CONSTRAINT [PK_companies] PRIMARY KEY CLUSTERED 
19 | (
20 | 	[company_id] ASC
21 | )WITH (PAD_INDEX = OFF, STATISTICS_NORECOMPUTE = OFF, IGNORE_DUP_KEY = OFF, ALLOW_ROW_LOCKS = ON, ALLOW_PAGE_LOCKS = ON, OPTIMIZE_FOR_SEQUENTIAL_KEY = OFF) ON [PRIMARY]
22 | ) ON [PRIMARY] TEXTIMAGE_ON [PRIMARY]
23 | GO
24 | 
25 | 
26 | 


--------------------------------------------------------------------------------
/sql/create_general_stats.sql:
--------------------------------------------------------------------------------
 1 | USE [it_jobs]
 2 | GO
 3 | 
 4 | /****** Object:  Table [dbo].[general_stats]    Script Date: 8/2/2020 5:14:16 PM ******/
 5 | SET ANSI_NULLS ON
 6 | GO
 7 | 
 8 | SET QUOTED_IDENTIFIER ON
 9 | GO
10 | 
11 | CREATE TABLE [dbo].[general_stats](
12 | 	[yearmonth] [int] NOT NULL,
13 | 	[salary_mean] [int] NOT NULL,
14 | 	[salary_median] [int] NOT NULL,
15 | 	[jd_count] [int] NOT NULL,
16 | 	[head_count] [int] NOT NULL,
17 |  CONSTRAINT [PK_general_stats] PRIMARY KEY CLUSTERED 
18 | (
19 | 	[yearmonth] ASC
20 | )WITH (PAD_INDEX = OFF, STATISTICS_NORECOMPUTE = OFF, IGNORE_DUP_KEY = OFF, ALLOW_ROW_LOCKS = ON, ALLOW_PAGE_LOCKS = ON, OPTIMIZE_FOR_SEQUENTIAL_KEY = OFF) ON [PRIMARY]
21 | ) ON [PRIMARY]
22 | GO
23 | 
24 | 
25 | 


--------------------------------------------------------------------------------
/sql/create_table.sql:
--------------------------------------------------------------------------------
USE [jobs]
GO

/****** Object:  Table [dbo].[_51jobs]    Script Date: 4/5/2019 2:13:00 PM ******/
SET ANSI_NULLS ON
GO

SET QUOTED_IDENTIFIER ON
GO

-- Wide table of job postings: a handful of text/value columns (company_title,
-- company_description, job_description, job_id, monthly_salary, publish_date,
-- title) plus many NOT NULL bit columns grouped by prefix (career_, city_,
-- company_size_, company_type_, db_, edu_, experience_, industry_, phone_,
-- pl_, tag_). The bit columns appear to be one-flag-per-category indicators.
-- The table has no primary key or index.
--
-- NOTE(review): several column names look misspelled. They are left as-is
-- because loaders and queries elsewhere may depend on the exact names:
--   city_kuming      (presumably Kunming)
--   company_tpye_jv  ("tpye", presumably "type")
--   phone_iso        (presumably iOS)
--   pl_scrala        (presumably Scala)
-- NOTE(review): the free-text columns are varchar(max), not nvarchar(max), so
-- storing non-ASCII text depends on the database collation's code page.
CREATE TABLE [dbo].[_51jobs](
	[ageism] [bit] NOT NULL,
	[career_algorithm] [bit] NOT NULL,
	[career_architect] [bit] NOT NULL,
	[career_software_engineer] [bit] NOT NULL,
	[city_beijing] [bit] NOT NULL,
	[city_changchun] [bit] NOT NULL,
	[city_changsha] [bit] NOT NULL,
	[city_chengdu] [bit] NOT NULL,
	[city_chongqing] [bit] NOT NULL,
	[city_dalian] [bit] NOT NULL,
	[city_dongguan] [bit] NOT NULL,
	[city_fuzhou] [bit] NOT NULL,
	[city_guangzhou] [bit] NOT NULL,
	[city_hangzhou] [bit] NOT NULL,
	[city_harbin] [bit] NOT NULL,
	[city_hefei] [bit] NOT NULL,
	[city_jinan] [bit] NOT NULL,
	[city_kuming] [bit] NOT NULL,
	[city_nanjing] [bit] NOT NULL,
	[city_ningbo] [bit] NOT NULL,
	[city_qingdao] [bit] NOT NULL,
	[city_shanghai] [bit] NOT NULL,
	[city_shenyang] [bit] NOT NULL,
	[city_shenzhen] [bit] NOT NULL,
	[city_tianjin] [bit] NOT NULL,
	[city_wuhan] [bit] NOT NULL,
	[city_xian] [bit] NOT NULL,
	[city_zhengzhou] [bit] NOT NULL,
	[company_description] [varchar](max) NOT NULL,
	[company_size_10000] [bit] NOT NULL,
	[company_size_1000_5000] [bit] NOT NULL,
	[company_size_150_500] [bit] NOT NULL,
	[company_size_50] [bit] NOT NULL,
	[company_size_5000_10000] [bit] NOT NULL,
	[company_size_500_1000] [bit] NOT NULL,
	[company_size_50_150] [bit] NOT NULL,
	[company_title] [varchar](max) NOT NULL,
	[company_tpye_jv] [bit] NOT NULL,
	[company_type_foreign] [bit] NOT NULL,
	[company_type_foreign_gov] [bit] NOT NULL,
	[company_type_foreign_rep] [bit] NOT NULL,
	[company_type_listed] [bit] NOT NULL,
	[company_type_non_profit] [bit] NOT NULL,
	[company_type_private] [bit] NOT NULL,
	[company_type_public_institution] [bit] NOT NULL,
	[company_type_startup] [bit] NOT NULL,
	[company_type_state] [bit] NOT NULL,
	[company_type_us_eu] [bit] NOT NULL,
	[db_Apache_Hive] [bit] NOT NULL,
	[db_CouchBase] [bit] NOT NULL,
	[db_CouchDB] [bit] NOT NULL,
	[db_DB2] [bit] NOT NULL,
	[db_DynamoDB] [bit] NOT NULL,
	[db_Elasticsearch] [bit] NOT NULL,
	[db_FileMaker] [bit] NOT NULL,
	[db_Firebase] [bit] NOT NULL,
	[db_Firebird] [bit] NOT NULL,
	[db_Hbase] [bit] NOT NULL,
	[db_Informix] [bit] NOT NULL,
	[db_Ingres] [bit] NOT NULL,
	[db_MariaDB] [bit] NOT NULL,
	[db_Memcached] [bit] NOT NULL,
	[db_MongoDB] [bit] NOT NULL,
	[db_MySQL] [bit] NOT NULL,
	[db_Neo4j] [bit] NOT NULL,
	[db_Netezza] [bit] NOT NULL,
	[db_Oracle] [bit] NOT NULL,
	[db_PostgreSQL] [bit] NOT NULL,
	[db_Redis] [bit] NOT NULL,
	[db_Riak] [bit] NOT NULL,
	[db_SAP_HANA] [bit] NOT NULL,
	[db_SQL_Server] [bit] NOT NULL,
	[db_SQLite] [bit] NOT NULL,
	[db_Solr] [bit] NOT NULL,
	[db_Splunk] [bit] NOT NULL,
	[db_Sybase] [bit] NOT NULL,
	[db_Teradata] [bit] NOT NULL,
	[db_dBase] [bit] NOT NULL,
	[edu_associate] [bit] NOT NULL,
	[edu_bachelor] [bit] NOT NULL,
	[edu_high_school] [bit] NOT NULL,
	[edu_master] [bit] NOT NULL,
	[edu_middle_school] [bit] NOT NULL,
	[edu_phd] [bit] NOT NULL,
	[english] [bit] NOT NULL,
	[experience_10] [bit] NOT NULL,
	[experience_1_3] [bit] NOT NULL,
	[experience_3_5] [bit] NOT NULL,
	[experience_5_10] [bit] NOT NULL,
	[experience_no] [bit] NOT NULL,
	[icu_996] [bit] NOT NULL,
	[industry_ads] [bit] NOT NULL,
	[industry_computer] [bit] NOT NULL,
	[industry_edu] [bit] NOT NULL,
	[industry_energy] [bit] NOT NULL,
	[industry_finance] [bit] NOT NULL,
	[industry_gov] [bit] NOT NULL,
	[industry_logistic] [bit] NOT NULL,
	[industry_medical] [bit] NOT NULL,
	[industry_realestate] [bit] NOT NULL,
	[industry_service] [bit] NOT NULL,
	[industry_trade] [bit] NOT NULL,
	[japanese] [bit] NOT NULL,
	[job_description] [varchar](max) NOT NULL,
	[job_id] [varchar](max) NOT NULL,
	[monthly_salary] [float] NOT NULL,
	[non_996] [bit] NOT NULL,
	[phone_android] [bit] NOT NULL,
	[phone_app] [bit] NOT NULL,
	[phone_iso] [bit] NOT NULL,
	[pl_c_sharp] [bit] NOT NULL,
	[pl_cpp] [bit] NOT NULL,
	[pl_delphi] [bit] NOT NULL,
	[pl_go] [bit] NOT NULL,
	[pl_haskell] [bit] NOT NULL,
	[pl_java] [bit] NOT NULL,
	[pl_javascript] [bit] NOT NULL,
	[pl_julia] [bit] NOT NULL,
	[pl_kotlin] [bit] NOT NULL,
	[pl_lua] [bit] NOT NULL,
	[pl_matlab] [bit] NOT NULL,
	[pl_objective_c] [bit] NOT NULL,
	[pl_perl] [bit] NOT NULL,
	[pl_php] [bit] NOT NULL,
	[pl_python] [bit] NOT NULL,
	[pl_ruby] [bit] NOT NULL,
	[pl_rust] [bit] NOT NULL,
	[pl_scrala] [bit] NOT NULL,
	[pl_swift] [bit] NOT NULL,
	[pl_typescript] [bit] NOT NULL,
	[pl_vba] [bit] NOT NULL,
	[pl_visual_basic] [bit] NOT NULL,
	[publish_date] [datetime] NOT NULL,
	[published_on_weekend] [bit] NOT NULL,
	[tag_baby_care] [bit] NOT NULL,
	[tag_five_insurance] [bit] NOT NULL,
	[tag_flexible] [bit] NOT NULL,
	[tag_no_overtime] [bit] NOT NULL,
	[tag_rest_one_day] [bit] NOT NULL,
	[tag_rest_two_days] [bit] NOT NULL,
	[tag_stock] [bit] NOT NULL,
	[title] [varchar](max) NOT NULL
) ON [PRIMARY] TEXTIMAGE_ON [PRIMARY]
GO
156 | 
157 | 
158 | 


--------------------------------------------------------------------------------
/sql/create_table_v2.sql:
--------------------------------------------------------------------------------
  1 | USE [jobs]
  2 | GO
  3 | -- Creates the monthly job-postings table [dbo].[_201904], with a clustered primary key on [job_id].
  4 | /****** Object:  Table [dbo].[_201904]    Script Date: 4/28/2019 10:26:07 PM ******/
  5 | SET ANSI_NULLS ON
  6 | GO
  7 | 
  8 | SET QUOTED_IDENTIFIER ON
  9 | GO
 10 | -- [title] is nvarchar like the other free-text columns, so its contents do not depend on the database code page.
 11 | CREATE TABLE [dbo].[_201904](
 12 | 	[job_id] [varchar](20) NOT NULL,
 13 | 	[monthly_salary] [float] NOT NULL,
 14 | 	[headcount] [bigint] NOT NULL,
 15 | 	[title] [nvarchar](max) NOT NULL,
 16 | 	[career] [nvarchar](100) NOT NULL,
 17 | 	[city] [nvarchar](20) NOT NULL,
 18 | 	[province] [nvarchar](20) NOT NULL,
 19 | 	[company_description] [nvarchar](max) NOT NULL,
 20 | 	[company_size] [nvarchar](100) NOT NULL,
 21 | 	[company_title] [nvarchar](100) NOT NULL,
 22 | 	[company_type] [nvarchar](100) NOT NULL,
 23 | 	[ageism] [bit] NOT NULL,
 24 | 	[db_Apache_Hive] [bit] NOT NULL,
 25 | 	[db_CouchBase] [bit] NOT NULL,
 26 | 	[db_CouchDB] [bit] NOT NULL,
 27 | 	[db_DB2] [bit] NOT NULL,
 28 | 	[db_DynamoDB] [bit] NOT NULL,
 29 | 	[db_Elasticsearch] [bit] NOT NULL,
 30 | 	[db_FileMaker] [bit] NOT NULL,
 31 | 	[db_Firebase] [bit] NOT NULL,
 32 | 	[db_Firebird] [bit] NOT NULL,
 33 | 	[db_Hbase] [bit] NOT NULL,
 34 | 	[db_Informix] [bit] NOT NULL,
 35 | 	[db_Ingres] [bit] NOT NULL,
 36 | 	[db_MariaDB] [bit] NOT NULL,
 37 | 	[db_Memcached] [bit] NOT NULL,
 38 | 	[db_MongoDB] [bit] NOT NULL,
 39 | 	[db_MySQL] [bit] NOT NULL,
 40 | 	[db_Neo4j] [bit] NOT NULL,
 41 | 	[db_Netezza] [bit] NOT NULL,
 42 | 	[db_Oracle] [bit] NOT NULL,
 43 | 	[db_PostgreSQL] [bit] NOT NULL,
 44 | 	[db_Redis] [bit] NOT NULL,
 45 | 	[db_Riak] [bit] NOT NULL,
 46 | 	[db_SAP_HANA] [bit] NOT NULL,
 47 | 	[db_SQL_Server] [bit] NOT NULL,
 48 | 	[db_SQLite] [bit] NOT NULL,
 49 | 	[db_Solr] [bit] NOT NULL,
 50 | 	[db_Splunk] [bit] NOT NULL,
 51 | 	[db_Sybase] [bit] NOT NULL,
 52 | 	[db_Teradata] [bit] NOT NULL,
 53 | 	[db_dBase] [bit] NOT NULL,
 54 | 	[edu] [nvarchar](100) NOT NULL,
 55 | 	[english] [bit] NOT NULL,
 56 | 	[experience] [nvarchar](100) NOT NULL,
 57 | 	[expert_adas] [bit] NOT NULL,
 58 | 	[expert_blockchain] [bit] NOT NULL,
 59 | 	[expert_embed] [bit] NOT NULL,
 60 | 	[expert_expert] [bit] NOT NULL,
 61 | 	[expert_gis] [bit] NOT NULL,
 62 | 	[_996_yes] [bit] NOT NULL,
 63 | 	[_996_no] [bit] NOT NULL,
 64 | 	[industry] [nvarchar](100) NOT NULL,
 65 | 	[japanese] [bit] NOT NULL,
 66 | 	[job_description] [nvarchar](max) NOT NULL,
 67 | 	[job_summary] [nvarchar](100) NOT NULL,
 68 | 	[job_tags] [nvarchar](100) NOT NULL,
 69 | 	[phone_android] [bit] NOT NULL,
 70 | 	[phone_app] [bit] NOT NULL,
 71 | 	[phone_iso] [bit] NOT NULL,
 72 | 	[pl_c_sharp] [bit] NOT NULL,
 73 | 	[pl_cpp] [bit] NOT NULL,
 74 | 	[pl_delphi] [bit] NOT NULL,
 75 | 	[pl_go] [bit] NOT NULL,
 76 | 	[pl_haskell] [bit] NOT NULL,
 77 | 	[pl_java] [bit] NOT NULL,
 78 | 	[pl_javascript] [bit] NOT NULL,
 79 | 	[pl_julia] [bit] NOT NULL,
 80 | 	[pl_kotlin] [bit] NOT NULL,
 81 | 	[pl_lua] [bit] NOT NULL,
 82 | 	[pl_matlab] [bit] NOT NULL,
 83 | 	[pl_objective_c] [bit] NOT NULL,
 84 | 	[pl_perl] [bit] NOT NULL,
 85 | 	[pl_php] [bit] NOT NULL,
 86 | 	[pl_python] [bit] NOT NULL,
 87 | 	[pl_ruby] [bit] NOT NULL,
 88 | 	[pl_rust] [bit] NOT NULL,
 89 | 	[pl_scrala] [bit] NOT NULL,
 90 | 	[pl_swift] [bit] NOT NULL,
 91 | 	[pl_typescript] [bit] NOT NULL,
 92 | 	[pl_vba] [bit] NOT NULL,
 93 | 	[pl_visual_basic] [bit] NOT NULL,
 94 | 	[publish_date] [datetime] NOT NULL,
 95 | 	[published_on_weekend] [bit] NOT NULL,
 96 | 	[tag_baby_care] [bit] NOT NULL,
 97 | 	[tag_five_insurance] [bit] NOT NULL,
 98 | 	[tag_flexible] [bit] NOT NULL,
 99 | 	[tag_no_overtime] [bit] NOT NULL,
100 | 	[tag_rest_one_day] [bit] NOT NULL,
101 | 	[tag_rest_two_days] [bit] NOT NULL,
102 | 	[tag_stock] [bit] NOT NULL,
103 | 	[ml_tensorflow] [bit] NOT NULL,
104 | 	[ml_caffe] [bit] NOT NULL,
105 | 	[ml_cntk] [bit] NOT NULL,
106 | 	[ml_chainer] [bit] NOT NULL,
107 | 	[ml_mxnet] [bit] NOT NULL,
108 | 	[ml_keras] [bit] NOT NULL,
109 | 	[ml_deeplearning4j] [bit] NOT NULL,
110 | 	[ml_theano] [bit] NOT NULL,
111 | 	[ml_sklearn] [bit] NOT NULL,
112 | 	[ml_mahout] [bit] NOT NULL,
113 | 	[ml_paddlepaddle] [bit] NOT NULL,
114 | 	-- Table-level constraint: at most one row per job_id.
115 |  CONSTRAINT [PK__201904v22] PRIMARY KEY CLUSTERED 
116 | (
117 | 	[job_id] ASC
118 | )WITH (PAD_INDEX = OFF, STATISTICS_NORECOMPUTE = OFF, IGNORE_DUP_KEY = OFF, ALLOW_ROW_LOCKS = ON, ALLOW_PAGE_LOCKS = ON) ON [PRIMARY]
119 | ) ON [PRIMARY] TEXTIMAGE_ON [PRIMARY]
120 | GO
121 | 
122 | 
123 | 


--------------------------------------------------------------------------------
/sql/create_table_v3.sql:
--------------------------------------------------------------------------------
  1 | USE it_jobs
  2 | GO
  3 | -- Defines dbo.jobs: one row per (job_id, yearmonth), holding text attributes and bit feature flags.
  4 | /****** Object:  Table [dbo].[jobs]    Script Date: 6/28/2020 9:23:04 PM ******/
  5 | SET ANSI_NULLS ON
  6 | GO
  7 | 
  8 | SET QUOTED_IDENTIFIER ON
  9 | GO
 10 | -- Every column is NOT NULL. The filegroup name stays bracketed because PRIMARY is a reserved word.
 11 | CREATE TABLE dbo.jobs (
 12 |     job_id varchar(20) NOT NULL,
 13 |     yearmonth int NOT NULL,
 14 |     monthly_salary int NOT NULL,
 15 |     headcount bigint NOT NULL,
 16 |     title nvarchar(max) NOT NULL,
 17 |     page_title nvarchar(max) NOT NULL,
 18 |     zhinengleibie nvarchar(100) NOT NULL,
 19 |     career nvarchar(100) NOT NULL,
 20 |     city nvarchar(20) NOT NULL,
 21 |     province nvarchar(20) NOT NULL,
 22 |     company_id nvarchar(max) NOT NULL,
 23 |     ageism bit NOT NULL,
 24 |     db_Apache_Hive bit NOT NULL,
 25 |     db_CouchBase bit NOT NULL,
 26 |     db_CouchDB bit NOT NULL,
 27 |     db_DB2 bit NOT NULL,
 28 |     db_DynamoDB bit NOT NULL,
 29 |     db_Elasticsearch bit NOT NULL,
 30 |     db_FileMaker bit NOT NULL,
 31 |     db_Firebase bit NOT NULL,
 32 |     db_Firebird bit NOT NULL,
 33 |     db_Hbase bit NOT NULL,
 34 |     db_Informix bit NOT NULL,
 35 |     db_Ingres bit NOT NULL,
 36 |     db_MariaDB bit NOT NULL,
 37 |     db_Memcached bit NOT NULL,
 38 |     db_MongoDB bit NOT NULL,
 39 |     db_MySQL bit NOT NULL,
 40 |     db_Neo4j bit NOT NULL,
 41 |     db_Netezza bit NOT NULL,
 42 |     db_Oracle bit NOT NULL,
 43 |     db_PostgreSQL bit NOT NULL,
 44 |     db_Redis bit NOT NULL,
 45 |     db_Riak bit NOT NULL,
 46 |     db_SAP_HANA bit NOT NULL,
 47 |     db_SQL_Server bit NOT NULL,
 48 |     db_SQLite bit NOT NULL,
 49 |     db_Solr bit NOT NULL,
 50 |     db_Splunk bit NOT NULL,
 51 |     db_Sybase bit NOT NULL,
 52 |     db_Teradata bit NOT NULL,
 53 |     db_dBase bit NOT NULL,
 54 |     edu nvarchar(100) NOT NULL,
 55 |     experience nvarchar(100) NOT NULL,
 56 |     expert_adas bit NOT NULL,
 57 |     expert_blockchain bit NOT NULL,
 58 |     expert_embed bit NOT NULL,
 59 |     expert_expert bit NOT NULL,
 60 |     expert_gis bit NOT NULL,
 61 |     _996_yes bit NOT NULL,
 62 |     _996_no bit NOT NULL,
 63 |     lang_english bit NOT NULL,
 64 |     lang_japanese bit NOT NULL,
 65 |     job_description nvarchar(max) NOT NULL,
 66 |     job_summary nvarchar(100) NOT NULL,
 67 |     job_tags nvarchar(100) NOT NULL,
 68 |     phone_android bit NOT NULL,
 69 |     phone_app bit NOT NULL,
 70 |     phone_iso bit NOT NULL,
 71 |     pl_c_sharp bit NOT NULL,
 72 |     pl_cpp bit NOT NULL,
 73 |     pl_delphi bit NOT NULL,
 74 |     pl_go bit NOT NULL,
 75 |     pl_haskell bit NOT NULL,
 76 |     pl_java bit NOT NULL,
 77 |     pl_javascript bit NOT NULL,
 78 |     pl_julia bit NOT NULL,
 79 |     pl_kotlin bit NOT NULL,
 80 |     pl_lua bit NOT NULL,
 81 |     pl_matlab bit NOT NULL,
 82 |     pl_objective_c bit NOT NULL,
 83 |     pl_perl bit NOT NULL,
 84 |     pl_php bit NOT NULL,
 85 |     pl_python bit NOT NULL,
 86 |     pl_ruby bit NOT NULL,
 87 |     pl_rust bit NOT NULL,
 88 |     pl_swift bit NOT NULL,
 89 |     pl_typescript bit NOT NULL,
 90 |     pl_vba bit NOT NULL,
 91 |     pl_visual_basic bit NOT NULL,
 92 |     pl_r bit NOT NULL,
 93 |     pl_scala bit NOT NULL,
 94 |     publish_date datetime NOT NULL,
 95 |     published_on_weekend bit NOT NULL,
 96 |     tag_baby_care bit NOT NULL,
 97 |     tag_five_insurance bit NOT NULL,
 98 |     tag_flexible bit NOT NULL,
 99 |     tag_no_overtime bit NOT NULL,
100 |     tag_rest_one_day bit NOT NULL,
101 |     tag_rest_two_days bit NOT NULL,
102 |     tag_stock bit NOT NULL,
103 |     ml_tensorflow bit NOT NULL,
104 |     ml_caffe bit NOT NULL,
105 |     ml_cntk bit NOT NULL,
106 |     ml_chainer bit NOT NULL,
107 |     ml_mxnet bit NOT NULL,
108 |     ml_keras bit NOT NULL,
109 |     ml_deeplearning4j bit NOT NULL,
110 |     ml_theano bit NOT NULL,
111 |     ml_sklearn bit NOT NULL,
112 |     ml_mahout bit NOT NULL,
113 |     ml_paddlepaddle bit NOT NULL,
114 |     bd_hadoop bit NOT NULL,
115 |     bd_spark bit NOT NULL,
116 |     bd_hive bit NOT NULL,
117 |     bd_mapReduce bit NOT NULL,
118 |     bd_kafka bit NOT NULL,
119 |     bd_hbase bit NOT NULL,
120 |     bd_storm bit NOT NULL,
121 |     bd_pig bit NOT NULL,
122 |     bd_mahout bit NOT NULL,
123 |     bd_impala bit NOT NULL,
124 |     bd_yarn bit NOT NULL,
125 |     bd_alluxio bit NOT NULL,
126 |     bd_flink bit NOT NULL,
127 |     bd_presto bit NOT NULL,
128 |     bd_heron bit NOT NULL,
129 |     CONSTRAINT PK_jobs PRIMARY KEY CLUSTERED
130 |     (
131 |         job_id ASC,
132 |         yearmonth ASC
133 |     ) WITH (PAD_INDEX = OFF, STATISTICS_NORECOMPUTE = OFF, IGNORE_DUP_KEY = OFF, ALLOW_ROW_LOCKS = ON, ALLOW_PAGE_LOCKS = ON, OPTIMIZE_FOR_SEQUENTIAL_KEY = OFF) ON [PRIMARY]
134 | ) ON [PRIMARY] TEXTIMAGE_ON [PRIMARY]
135 | GO
136 | -- Default constraints: each listed bit flag on dbo.jobs defaults to 0. One statement per batch.
137 | ALTER TABLE dbo.jobs ADD CONSTRAINT DF_jobs_expert_adas DEFAULT (0) FOR expert_adas
138 | GO
139 | 
140 | ALTER TABLE dbo.jobs ADD CONSTRAINT DF_jobs_expert_blockchain DEFAULT (0) FOR expert_blockchain
141 | GO
142 | 
143 | ALTER TABLE dbo.jobs ADD CONSTRAINT DF_jobs_expert_embed DEFAULT (0) FOR expert_embed
144 | GO
145 | 
146 | ALTER TABLE dbo.jobs ADD CONSTRAINT DF_jobs_expert_expert DEFAULT (0) FOR expert_expert
147 | GO
148 | 
149 | ALTER TABLE dbo.jobs ADD CONSTRAINT DF_jobs_expert_gis DEFAULT (0) FOR expert_gis
150 | GO
151 | 
152 | ALTER TABLE dbo.jobs ADD CONSTRAINT DF_jobs_pl_r DEFAULT (0) FOR pl_r
153 | GO
154 | 
155 | ALTER TABLE dbo.jobs ADD CONSTRAINT DF_jobs_pl_scala DEFAULT (0) FOR pl_scala
156 | GO
157 | 
158 | ALTER TABLE dbo.jobs ADD CONSTRAINT DF_jobs_ml_tensorflow DEFAULT (0) FOR ml_tensorflow
159 | GO
160 | 
161 | ALTER TABLE dbo.jobs ADD CONSTRAINT DF_jobs_ml_caffe DEFAULT (0) FOR ml_caffe
162 | GO
163 | 
164 | ALTER TABLE dbo.jobs ADD CONSTRAINT DF_jobs_ml_cntk DEFAULT (0) FOR ml_cntk
165 | GO
166 | 
167 | ALTER TABLE dbo.jobs ADD CONSTRAINT DF_jobs_ml_chainer DEFAULT (0) FOR ml_chainer
168 | GO
169 | 
170 | ALTER TABLE dbo.jobs ADD CONSTRAINT DF_jobs_ml_mxnet DEFAULT (0) FOR ml_mxnet
171 | GO
172 | 
173 | ALTER TABLE dbo.jobs ADD CONSTRAINT DF_jobs_ml_keras DEFAULT (0) FOR ml_keras
174 | GO
175 | 
176 | ALTER TABLE dbo.jobs ADD CONSTRAINT DF_jobs_ml_deeplearning4j DEFAULT (0) FOR ml_deeplearning4j
177 | GO
178 | 
179 | ALTER TABLE dbo.jobs ADD CONSTRAINT DF_jobs_ml_theano DEFAULT (0) FOR ml_theano
180 | GO
181 | 
182 | ALTER TABLE dbo.jobs ADD CONSTRAINT DF_jobs_ml_sklearn DEFAULT (0) FOR ml_sklearn
183 | GO
184 | 
185 | ALTER TABLE dbo.jobs ADD CONSTRAINT DF_jobs_ml_mahout DEFAULT (0) FOR ml_mahout
186 | GO
187 | 
188 | ALTER TABLE dbo.jobs ADD CONSTRAINT DF_jobs_ml_paddlepaddle DEFAULT (0) FOR ml_paddlepaddle
189 | GO
190 | 
191 | ALTER TABLE dbo.jobs ADD CONSTRAINT DF_jobs_bd_hadoop DEFAULT (0) FOR bd_hadoop
192 | GO
193 | 
194 | ALTER TABLE dbo.jobs ADD CONSTRAINT DF_jobs_bd_spark DEFAULT (0) FOR bd_spark
195 | GO
196 | 
197 | ALTER TABLE dbo.jobs ADD CONSTRAINT DF_jobs_bd_hive DEFAULT (0) FOR bd_hive
198 | GO
199 | 
200 | ALTER TABLE dbo.jobs ADD CONSTRAINT DF_jobs_bd_mapReduce DEFAULT (0) FOR bd_mapReduce
201 | GO
202 | 
203 | ALTER TABLE dbo.jobs ADD CONSTRAINT DF_jobs_bd_kafka DEFAULT (0) FOR bd_kafka
204 | GO
205 | 
206 | ALTER TABLE dbo.jobs ADD CONSTRAINT DF_jobs_bd_hbase DEFAULT (0) FOR bd_hbase
207 | GO
208 | 
209 | ALTER TABLE dbo.jobs ADD CONSTRAINT DF_jobs_bd_storm DEFAULT (0) FOR bd_storm
210 | GO
211 | 
212 | ALTER TABLE dbo.jobs ADD CONSTRAINT DF_jobs_bd_pig DEFAULT (0) FOR bd_pig
213 | GO
214 | 
215 | ALTER TABLE dbo.jobs ADD CONSTRAINT DF_jobs_bd_mahout DEFAULT (0) FOR bd_mahout
216 | GO
217 | 
218 | ALTER TABLE dbo.jobs ADD CONSTRAINT DF_jobs_bd_impala DEFAULT (0) FOR bd_impala
219 | GO
220 | 
221 | ALTER TABLE dbo.jobs ADD CONSTRAINT DF_jobs_bd_yarn DEFAULT (0) FOR bd_yarn
222 | GO
223 | 
224 | ALTER TABLE dbo.jobs ADD CONSTRAINT DF_jobs_bd_alluxio DEFAULT (0) FOR bd_alluxio
225 | GO
226 | 
227 | ALTER TABLE dbo.jobs ADD CONSTRAINT DF_jobs_bd_flink DEFAULT (0) FOR bd_flink
228 | GO
229 | 
230 | ALTER TABLE dbo.jobs ADD CONSTRAINT DF_jobs_bd_presto DEFAULT (0) FOR bd_presto
231 | GO
232 | 
233 | ALTER TABLE dbo.jobs ADD CONSTRAINT DF_jobs_bd_heron DEFAULT (0) FOR bd_heron
234 | GO
235 | 
236 | 
237 | 


--------------------------------------------------------------------------------
/sql/feature_engineering.sql:
--------------------------------------------------------------------------------
 1 | -- career: first delete 202101 rows matching the title/province/summary filters below
 2 | --delete from jobs where year_month=202101 and monthly_salary>0 and monthly_salary<3000
 3 | DELETE FROM jobs WHERE year_month = 202101 AND title LIKE '%赴日%' AND title NOT LIKE '%机会%'
 4 | --delete from jobs where year_month=202101 and monthly_salary>0 and monthly_salary<1000
 5 | 
 6 | DELETE FROM jobs WHERE year_month = 202101 AND title LIKE '%技工%'
 7 | DELETE FROM jobs WHERE year_month = 202101 AND title LIKE '%技术员%'
 8 | DELETE FROM jobs WHERE year_month = 202101 AND city = '杭州' AND title LIKE '00%(职位编号：%)'
 9 | DELETE FROM jobs WHERE year_month = 202101 AND province = '异地招聘'
10 | DELETE FROM jobs WHERE year_month = 202101 AND job_summary LIKE '%应届生%'
11 | DELETE FROM jobs WHERE year_month = 202101 AND title LIKE '%应届%'
12 | DELETE FROM jobs WHERE year_month = 202101 AND title LIKE '%校招%'
13 | DELETE FROM jobs WHERE year_month = 202101 AND title LIKE '%校园招聘%'
14 | -- Assign career labels for 202101; statements run in order, so a later match overwrites an earlier one.
15 | update jobs set career='软件工程师' where year_month=202101 and zhinengleibie in ('软件工程师', '高级软件工程师', 'PHP开发工程师', 'Java开发工程师', 'C开发工程师', 'Python开发工程师', '.NET开发工程师', '脚本开发工程师', 'Ruby开发工程师', 'Go开发工程师')
16 | update jobs set career='软件工程师' where year_month=202101 and career='一般程序员'
17 | update jobs set career='Android开发工程师' where year_month=202101 and (title like '%Android%' or title like '%安卓%')
18 | -- OR alternatives are parenthesised so the year_month filter applies to every alternative (AND binds tighter than OR).
19 | update jobs set career='信号处理' where year_month=202101 and title like '%信号处理%'
20 | update jobs set career='爬虫开发工程师' where year_month=202101 and title like '%爬虫%'
21 | update jobs set career='ADAS' where year_month=202101 and title like '%adas%'
22 | update jobs set career='机器人' where year_month=202101 and (title like '%机器人%' or title like '%ROS%')
23 | update jobs set career='GIS' where year_month=202101 and title like '%GIS%'
24 | update jobs set career='CAE' where year_month=202101 and title like '%CAE%'
25 | update jobs set career='光学算法' where year_month=202101 and title like '%光学算法工程师%'
26 | update jobs set career='ETL' where year_month=202101 and title like '%ETL%'
27 | update jobs set career='Unity3D' where year_month=202101 and title like '%Unity3D%'
28 | update jobs set career='遥感' where year_month=202101 and title like '%遥感%'
29 | update jobs set career='规划算法' where year_month=202101 and title like '%规划算法工程师%'
30 | update jobs set career='视觉软件工程师' where year_month=202101 and title like '%三维重建%'
31 | update jobs set career='视觉软件工程师' where year_month=202101 and title like '%视觉软件工程师%'
32 | -- NOTE(review): create_table_v3.sql declares this column as [yearmonth]; confirm the live table uses year_month.
33 | 
34 | update jobs set career='大数据' where year_month=202101 and title like '%大数据%'
35 | update jobs set career='CT重建' where year_month=202101 and title like '%CT重建%'
36 | update jobs set career='SLAM' where year_month=202101 and title like '%SLAM%'
37 | update jobs set career='DSP' where year_month=202101 and title like '%DSP%'
38 | update jobs set career='生物信息' where year_month=202101 and title like '%生物信息%'
39 | update jobs set career='编译器开发工程师' where year_month=202101 and title like '%编译器%'
40 | update jobs set career='算法工程师' where year_month=202101 and (title like '%算法%' or zhinengleibie='算法工程师')
41 | update jobs set career='自然语言处理（NLP）' where year_month=202101 and (title like '%自然语言处理%' or title like '%NLP%')
42 | 
43 | delete from jobs where year_month=202101 and zhinengleibie='推荐算法工程师' and not title like '%推荐%'
44 | update jobs set career='推荐算法工程师' where year_month=202101 and title like '%推荐算法%'
45 | 
46 | delete from jobs where year_month=202101 and zhinengleibie='搜索算法工程师' and not title like '%搜索%'
47 | update jobs set career='搜索算法工程师' where year_month=202101 and (title like '%搜索算法%' or title like '%Search Algorithm%')
48 | update jobs set career='反作弊算法工程师' where year_month=202101 and title like '%反作弊%'
49 | 
50 | update jobs set career='图像处理工程师' where year_month=202101 and title like '%图像处理%'
51 | update jobs set career='图像算法工程师' where year_month=202101 and (title like '%图像算法%' or zhinengleibie='图像算法工程师')
52 | update jobs set career='人工智能' where year_month=202101 and (title like '%AI%' or title like '%人工智能%' or title like '%神经网络%')
53 | update jobs set career='区块链开发' where year_month=202101 and (title like '%区块链%' or zhinengleibie='区块链开发')
54 | update jobs set career='CTO' where year_month=202101 and (title like '%CTO%' or title like '%首席技术官%' or title like '%智慧研究院院长%')
55 | update jobs set career='芯片' where year_month=202101 and (title like '%芯片%' or title like '%SOC设计%')
56 | update jobs set career='驱动工程师' where year_month=202101 and (title like '%driver%' or title like '%驱动%')
57 | update jobs set career='机器学习' where year_month=202101 and (title like '%机器学习%' or zhinengleibie='机器学习工程师')
58 | update jobs set career='深度学习工程师' where year_month=202101 and title like '%深度学习%'
59 | update jobs set career='数据科学家' where year_month=202101 and (title like '%Data Scientist%' or title like '%数据科学家%')
60 | 
61 | 
62 | update jobs set career='架构师' where year_month=202101 and (title like '%系统架构师%' or title like '%架构师%' or title like '%架构专家%' or title like '%architect%' or title like '%架构研发%')
63 | update jobs set career='技术主管' where year_month=202101 and (title like '%主管%' or title like '%leader%')
64 | 
65 | update jobs set career='分布式' where year_month=202101 and career='软件工程师' and title like '%分布式%'
66 | 
67 | update jobs set career='敏捷教练' where year_month=202101 and (title like '%敏捷教练%' or title like '%agile coach%' or title like '%Scrum Master%')
68 | 
69 | update jobs set career='Cocos2d-x开发工程师' where year_month=202101 and career='软件工程师' and title like '%Cocos2d-x%'
70 | 
71 | update jobs set career='MES' where year_month=202101 and career='软件工程师' and title like '%MES%'
72 | 
73 | update jobs set career='Hadoop工程师' where year_month=202101 and title like '%Hadoop%'
74 | 
75 | update jobs set career='嵌入式软件开发' where year_month=202101 and (title like '%嵌入式%' or title like '%FPGA%')
76 | -- Drops rows still labelled 人工智能 whose title lacks that phrase (e.g. rows matched above only via '%AI%' or '%神经网络%').
77 | delete from jobs where year_month=202101 and career='人工智能' and not title like '%人工智能%'
78 | 
79 | -- Set bit feature flags for 202101 rows from job_description keyword matches.
80 | update jobs set ageism=1 where year_month=202101 and job_description like '%岁%'
81 | -- ML framework flags; the OR group is parenthesised so every alternative stays limited to 202101.
82 | update jobs set ml_paddlepaddle=1 where year_month=202101 and job_description like '%paddlepaddle%'
83 | update jobs set ml_mahout=1 where year_month=202101 and job_description like '%mahout%'
84 | update jobs set ml_sklearn=1 where year_month=202101 and (job_description like '%scikit-learn%' or job_description like '%scikitlearn%' or job_description like '%sklearn%')
85 | update jobs set ml_theano=1 where year_month=202101 and job_description like '%theano%'
86 | update jobs set ml_keras=1 where year_month=202101 and job_description like '%keras%'
87 | update jobs set ml_mxnet=1 where year_month=202101 and job_description like '%mxnet%'
88 | update jobs set ml_cntk=1 where year_month=202101 and job_description like '%cntk%'
89 | update jobs set ml_caffe=1 where year_month=202101 and job_description like '%caffe%'
90 | update jobs set ml_tensorflow=1 where year_month=202101 and job_description like '%tensorflow%'
91 | update jobs set ml_pytorch=1 where year_month=202101 and job_description like '%pytorch%'
92 | -- NOTE(review): ml_pytorch is not declared in create_table_v3.sql; confirm the column exists before running.
93 | 
94 | 
95 | 


--------------------------------------------------------------------------------
/sql/update.sql:
--------------------------------------------------------------------------------
 1 | --update _51jobs set career_software_engineer=0 where career_algorithm=1 or career_architect=1
 2 | -- Switch monthly_salary to units of 10,000 yuan
 3 | --update _51jobs set monthly_salary=monthly_salary/10000
 4 | 
 5 | 
 6 | -- R language detection; keywords: R语言, "R Studio", R编程, '%，R，%', '%,R,%'
 7 | --update _51jobs set pl_r=1 where job_description like '%、R、%' 
 8 | --or job_description like '%，R，%' 
 9 | --or job_description like '%,R,%'
10 | --or  job_description like '%R语言%' 
11 | --or  job_description like '%R Studio%' 
12 | --or  job_description like '%R编程%' 
13 | --or  job_description like '%R语言%' 
14 | --vb.net
15 | --update _51jobs set pl_visual_basic_net=1  where job_description like '%vb.net%' 
16 | --or job_description like '%visual basic.net%' 
17 | --select COUNT(1) from _51jobs where  job_description like '%vb.net%'
18 | --select COUNT(1) from _51jobs where  job_description like '%Vb.net%'
19 | --Groovy
20 | 
21 | --update _51jobs set pl_groovy=1  where job_description like '%groovy%'
22 | --87
23 | --
24 | --update _51jobs set pl_scala=1  where job_description like '%scala%'
25 | --(1639 rows affected)
26 | --Assembly language 汇编
27 | --update _51jobs set pl_assembly=1  where job_description like '%Assembly language%' or  job_description like '%汇编%' 
28 | --(1147 rows affected)
29 | --Linux Linux CentOS Ubuntu  redhat
30 | 
31 | --select * from _201904 where title like '%爬虫%'
32 | --ALTER TABLE _201904 ADD career_spider bit DEFAULT 0 NOT NULL;
33 | --update _201904 set career_spider=1  where title like '%爬虫%'
34 | --update _201904 set career_software_engineer=0  where career_spider=1
35 | 
36 | 
37 | --update _201904 set city='zhengzhou' where city_zhengzhou=1
38 | 
39 | --update _201904 set career='algorithm' where career_algorithm=1
40 | --update _201904 set career='architect' where career_architect=1
41 | --update _201904 set career='software' where career_software_engineer=1
42 | --update _201904 set career='spider' where career_spider=1
43 | 
44 | 
45 | 
46 | 


--------------------------------------------------------------------------------
/sql/update_ml.sql:
--------------------------------------------------------------------------------
 1 | 
 2 | --ALTER TABLE _201904 ADD ml_tensorflow bit DEFAULT 0 NOT NULL;
 3 | --
 4 | 
 5 | --ALTER TABLE _201904 ADD ml_caffe bit DEFAULT 0 NOT NULL;
 6 | --
 7 | 
 8 | --ALTER TABLE _201904 ADD ml_cntk bit DEFAULT 0 NOT NULL;
 9 | --
10 | 
11 | --ALTER TABLE _201904 ADD ml_chainer bit DEFAULT 0 NOT NULL;
12 | --
13 | 
14 | --ALTER TABLE _201904 ADD ml_mxnet bit DEFAULT 0 NOT NULL;
15 | --
16 | 
17 | --ALTER TABLE _201904 ADD ml_keras bit DEFAULT 0 NOT NULL;
18 | --
19 | 
20 | --ALTER TABLE _201904 ADD ml_deeplearning4j bit DEFAULT 0 NOT NULL;
21 | --
22 | 
23 | --ALTER TABLE _201904 ADD ml_theano bit DEFAULT 0 NOT NULL;
24 | --
25 | 
26 | --ALTER TABLE _201904 ADD ml_sklearn bit DEFAULT 0 NOT NULL;
27 | --
28 | 
29 | --ALTER TABLE _201904 ADD ml_mahout bit DEFAULT 0 NOT NULL;
30 | --
31 | 
32 | --ALTER TABLE _201904 ADD ml_paddlepaddle bit DEFAULT 0 NOT NULL;
33 | --
34 | 
35 | 
36 | 


--------------------------------------------------------------------------------
/sql/update_v2.sql:
--------------------------------------------------------------------------------
 1 | --update _201904v2 set career = '系统架构师' where career='架构设计师'
 2 | -- Ad-hoc fixes: list crawler careers, then relabel careers and set the blockchain flag by title keyword.
 3 | SELECT * FROM _201904v2 WHERE career LIKE '%爬虫%';
 4 | 
 5 | 
 6 | UPDATE _201905 SET career = '爬虫工程师' WHERE title LIKE '%爬虫%';
 7 | UPDATE _201905 SET career = '生物信息工程师' WHERE title LIKE '%生物信息%';
 8 | 
 9 | 
10 | UPDATE _201903 SET expert_blockchain = 1 WHERE title LIKE '%blockchain%';
11 | UPDATE _201903 SET expert_blockchain = 1 WHERE title LIKE '%区块链%';
12 | 


--------------------------------------------------------------------------------