├── .gitignore ├── LICENSE.txt ├── Plagiarism.md ├── README.md ├── Untitled.ipynb ├── images ├── 201904_salary_by_cities.png ├── 2019_04_pl_pie.png └── 2019_04_pl_word_cloud.png ├── py ├── 2db.py ├── black_list ├── clean.py ├── common.py ├── company.py ├── config.py ├── db.py ├── download.py ├── feature_engineering.py ├── multiprocess.py ├── old.download_lagou.py ├── stats.py └── weighted.py ├── reports ├── 201904 │ ├── 996_Survey.ipynb │ ├── Beijing.ipynb │ ├── Chengdu.ipynb │ ├── First_Tier.ipynb │ ├── Nanjing.ipynb │ ├── Qingdao.ipynb │ ├── Survey.ipynb │ ├── china_v2.ipynb │ ├── cities_basemap.ipynb │ ├── cities_pyecharts.ipynb │ ├── first_tier_v2.ipynb │ └── programming_language.ipynb ├── 201905 │ ├── China.ipynb │ ├── cities_basemap.ipynb │ ├── first_tier_v2.ipynb │ ├── programming_language.ipynb │ └── provinces_basemap.ipynb ├── 201906 │ ├── cities_basemap.ipynb │ ├── first_tier.ipynb │ ├── programming_language.ipynb │ └── provinces_basemap.ipynb ├── 201907 │ ├── cities_basemap.ipynb │ ├── first_tier_v2.ipynb │ ├── programming_language.ipynb │ └── provinces_basemap.ipynb ├── 201908 │ ├── cities_basemap.ipynb │ ├── first_tier_v2.ipynb │ ├── programming_language.ipynb │ └── provinces_basemap.ipynb ├── 201909 │ ├── cities_basemap.ipynb │ ├── first_tier_v2.ipynb │ ├── programming_language.ipynb │ └── provinces_basemap.ipynb ├── 201910 │ ├── General_Stats.ipynb │ ├── cities_basemap.ipynb │ ├── first_tier_v2.ipynb │ ├── programming_language.ipynb │ └── provinces_basemap.ipynb ├── 201911 │ ├── General_Stats.ipynb │ ├── cities_basemap.ipynb │ ├── first_tier_v2.ipynb │ ├── programming_language.ipynb │ └── provinces_basemap.ipynb ├── 201912 │ ├── General_Stats.ipynb │ ├── cities_basemap.ipynb │ ├── config.py │ ├── first_tier_v2.ipynb │ ├── map_wrapper.py │ ├── programming_language.ipynb │ └── provinces_basemap.ipynb ├── 202001 │ ├── General_Stats.ipynb │ ├── cities_basemap.ipynb │ ├── config.py │ ├── first_tier.ipynb │ ├── map_wrapper.py │ ├── 
programming_language.ipynb │ └── provinces_basemap.ipynb ├── 202002 │ ├── General_Stats.ipynb │ ├── cities_basemap.ipynb │ ├── config.py │ ├── first_tier.ipynb │ ├── map_wrapper.py │ ├── programming_language.ipynb │ └── provinces_basemap.ipynb ├── 202003 │ ├── General_Stats.ipynb │ ├── cities_basemap.ipynb │ ├── config.py │ ├── first_tier.ipynb │ ├── machine_learning.ipynb │ ├── map_wrapper.py │ ├── programming_language.ipynb │ └── provinces_basemap.ipynb ├── 202004 │ ├── General_Stats.ipynb │ ├── cities_basemap.ipynb │ ├── config.py │ ├── first_tier.ipynb │ ├── machine_learning.ipynb │ ├── map_wrapper.py │ ├── programming_language.ipynb │ └── provinces_basemap.ipynb ├── 202005 │ ├── General_Stats.ipynb │ ├── cities_basemap.ipynb │ ├── config.py │ ├── first_tier.ipynb │ ├── machine_learning.ipynb │ ├── map_wrapper.py │ ├── programming_language.ipynb │ └── provinces_basemap.ipynb ├── 202006 │ ├── General_Stats.ipynb │ ├── cities_basemap.ipynb │ ├── config.py │ ├── first_tier.ipynb │ ├── machine_learning.ipynb │ ├── map_wrapper.py │ ├── programming_language.ipynb │ └── provinces_basemap.ipynb ├── 202007 │ ├── General_Stats.ipynb │ ├── Untitled.ipynb │ ├── cities_basemap.ipynb │ ├── config.py │ ├── first_tier.ipynb │ ├── machine_learning.ipynb │ ├── map_wrapper.py │ ├── programming_language.ipynb │ └── provinces_basemap.ipynb ├── 202008 │ ├── General_Stats.ipynb │ ├── cities_basemap.ipynb │ ├── config.py │ ├── first_tier.ipynb │ ├── machine_learning.ipynb │ ├── map_wrapper.py │ ├── programming_language.ipynb │ ├── provinces_basemap.ipynb │ └── trend.ipynb ├── 202009 │ ├── General_Stats.ipynb │ ├── cities_basemap.ipynb │ ├── config.py │ ├── first_tier.ipynb │ ├── machine_learning.ipynb │ ├── map_wrapper.py │ ├── programming_language.ipynb │ ├── provinces_basemap.ipynb │ └── trend.ipynb ├── 202010 │ ├── General_Stats.ipynb │ ├── cities_basemap.ipynb │ ├── config.py │ ├── first_tier.ipynb │ ├── machine_learning.ipynb │ ├── map_wrapper.py │ ├── programming_language.ipynb 
│ ├── provinces_basemap.ipynb │ └── trend.ipynb ├── 202011 │ ├── General_Stats.ipynb │ ├── anomaly_explore.ipynb │ ├── cities_basemap.ipynb │ ├── config.py │ ├── first_tier.ipynb │ ├── machine_learning.ipynb │ ├── map_wrapper.py │ ├── programming_language.ipynb │ ├── provinces_basemap.ipynb │ └── trend.ipynb ├── 202012 │ ├── General_Stats.ipynb │ ├── anomaly_explore.ipynb │ ├── cities_basemap.ipynb │ ├── config.py │ ├── first_tier.ipynb │ ├── machine_learning.ipynb │ ├── map_wrapper.py │ ├── programming_language.ipynb │ ├── provinces_basemap.ipynb │ └── trend.ipynb ├── 202101 │ ├── General_Stats.ipynb │ ├── anomaly_explore.ipynb │ ├── cities_basemap.ipynb │ ├── config.py │ ├── first_tier.ipynb │ ├── machine_learning.ipynb │ ├── map_wrapper.py │ ├── programming_language.ipynb │ ├── provinces_basemap.ipynb │ └── trend.ipynb ├── city_locations.csv └── geo_data │ ├── province_city.csv │ └── provincial_capital_locations.csv ├── spyder.ipynb ├── sql ├── create_city_stats.sql ├── create_company.sql ├── create_general_stats.sql ├── create_table.sql ├── create_table_v2.sql ├── create_table_v3.sql ├── feature_engineering.sql ├── update.sql ├── update_ml.sql └── update_v2.sql └── whitelist.txt /.gitignore: -------------------------------------------------------------------------------- 1 | .vs/ 2 | .vscode/ 3 | .spyproject/ 4 | .ipynb_checkpoints/ 5 | __pycache__ 6 | debug.log 7 | geckodriver.log -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | Copyright (c) 2 | 3 | Anti 996 License Version 1.0 (Draft) 4 | 5 | Permission is hereby granted to any individual or legal entity 6 | obtaining a copy of this licensed work (including the source code, 7 | documentation and/or related items, hereinafter collectively referred 8 | to as the "licensed work"), free of charge, to deal with the licensed 9 | work for any purpose, including without limitation, the 
rights to use, 10 | reproduce, modify, prepare derivative works of, distribute, publish 11 | and sublicense the licensed work, subject to the following conditions: 12 | 13 | 1. The individual or the legal entity must conspicuously display, 14 | without modification, this License and the notice on each redistributed 15 | or derivative copy of the Licensed Work. 16 | 17 | 2. The individual or the legal entity must strictly comply with all 18 | applicable laws, regulations, rules and standards of the jurisdiction 19 | relating to labor and employment where the individual is physically 20 | located or where the individual was born or naturalized; or where the 21 | legal entity is registered or is operating (whichever is stricter). In 22 | case that the jurisdiction has no such laws, regulations, rules and 23 | standards or its laws, regulations, rules and standards are 24 | unenforceable, the individual or the legal entity are required to 25 | comply with Core International Labor Standards. 26 | 27 | 3. The individual or the legal entity shall not induce or force its 28 | employee(s), whether full-time or part-time, or its independent 29 | contractor(s), in any methods, to agree in oral or written form, to 30 | directly or indirectly restrict, weaken or relinquish his or her 31 | rights or remedies under such laws, regulations, rules and standards 32 | relating to labor and employment as mentioned above, no matter whether 33 | such written or oral agreement are enforceable under the laws of the 34 | said jurisdiction, nor shall such individual or the legal entity 35 | limit, in any methods, the rights of its employee(s) or independent 36 | contractor(s) from reporting or complaining to the copyright holder or 37 | relevant authorities monitoring the compliance of the license about 38 | its violation(s) of the said license. 
39 | 40 | THE LICENSED WORK IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 41 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 42 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 43 | IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY CLAIM, 44 | DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR 45 | OTHERWISE, ARISING FROM, OUT OF OR IN ANY WAY CONNECTION WITH THE 46 | LICENSED WORK OR THE USE OR OTHER DEALINGS IN THE LICENSED WORK. 47 | -------------------------------------------------------------------------------- /Plagiarism.md: -------------------------------------------------------------------------------- 1 | # Blacklist of Plagiarism 剽窃黑名单 2 | 3 | 以下作者剽窃了该文: 4 | 5 | [2017年一线城市程序员工资大调查](https://blog.csdn.net/juwikuang/article/details/72888792) 6 | 7 | 8 | |网站|名字|证据|Archive|注解| 9 | |--|--|--|--|--| 10 | |CSDN|代码技巧|[链接](https://blog.csdn.net/tTU1EvLDeLFq5btqiK/article/details/78929087)||未注明出处| 11 | |公众号|代码技巧|[链接](https://mp.weixin.qq.com/s/8kxgoaAEn-Qoz6M0-MhJNw)|[Archive](https://web.archive.org/web/20190420151822/https://mp.weixin.qq.com/s/8kxgoaAEn-Qoz6M0-MhJNw)|未注明出处,和上面是一家| 12 | |CSDN|黄小斜|[链接](https://blog.csdn.net/a724888/article/details/85841595)||未注明出处| 13 | 
|公众号|程序员江湖|[链接](https://mp.weixin.qq.com/s?__biz=MzUyOTk5NDQwOA==&mid=2247484646&idx=1&sn=9e71c0b3a411f19596e76a719663c003&chksm=fa59c321cd2e4a37602247cf906de6fe842dd20bf9f6940540dfd7d95aaeff49d947a7e4c309&mpshare=1&scene=1&srcid=0420GOJD9QoZi5FgxftRZ9zx&key=cb077656fa10b4eb27f368d30cef703ffcc82770b492b9a167537ddc4000a70a4468f004bea96a7914dad864c99824061f96cc11ffd063e41a281895d0f8677bb0eb2e8d42e2cd6e8b80844fa9914ac4&ascene=1&uin=MjU2NTc5MTk0Mg%3D%3D&devicetype=Windows+10&version=62060739&lang=en&pass_ticket=uf9nNPrm%2FwExDIh6qF0atahVZa%2BFwjMGxjOE8z0Uy5i0YwiioUkwCxNWMl7tVZmp)|[Archive](https://web.archive.org/web/20190420151741/https://mp.weixin.qq.com/s?__biz=MzUyOTk5NDQwOA==&mid=2247484646&idx=1&sn=9e71c0b3a411f19596e76a719663c003&chksm=fa59c321cd2e4a37602247cf906de6fe842dd20bf9f6940540dfd7d95aaeff49d947a7e4c309&mpshare=1&scene=1&srcid=0420GOJD9QoZi5FgxftRZ9zx&key=cb077656fa10b4eb27f368d30cef703ffcc82770b492b9a167537ddc4000a70a4468f004bea96a7914dad864c99824061f96cc11ffd063e41a281895d0f8677bb0eb2e8d42e2cd6e8b80844fa9914ac4&ascene=1&uin=MjU2NTc5MTk0Mg%3D%3D&devicetype=Windows+10&version=62060739&lang=en&pass_ticket=uf9nNPrm%2FwExDIh6qF0atahVZa%2BFwjMGxjOE8z0Uy5i0YwiioUkwCxNWMl7tVZmp)|出处写为【码农有道】,和上面是一家| 14 | |公众号|码农有道|[链接](https://mp.weixin.qq.com/s?__biz=MzIwNTc4NTEwOQ==&mid=2247486120&idx=1&sn=9c4d677ff9823254c7cf3f86b79dd8dd&scene=21#wechat_redirect)|[Archive](https://web.archive.org/web/20190420151309/https://mp.weixin.qq.com/s?__biz=MzIwNTc4NTEwOQ==&mid=2247486120&idx=1&sn=9c4d677ff9823254c7cf3f86b79dd8dd&scene=21%23wechat_redirect)|冒充原创| 15 | |CSDN|Java成长记_Camel|[链接](https://blog.csdn.net/qq_30225725/article/details/86729448)||冒充原创,篡改(2017改成2019)| 16 | |CSDN| 运维派V|[链接](https://blog.csdn.net/ki8qzvka6gz4n450m/article/details/79548177)||冒充原创,误导(2017误导成2019)| 17 | |CSDN| weixin_34248118|[链接](https://blog.csdn.net/weixin_34248118/article/details/87058659)||冒充原创| 18 | |CSDN| 
程序员之家v|[链接](https://blog.csdn.net/EGEFCXzo3Ha1x4/article/details/79454135)||未注明出处,百度关键字【一线城市 程序员 工资】头条| 19 | |公众号|养码场|[链接](https://mp.weixin.qq.com/s/EOi3wY0d6K2z7cTZghscvA)|[Archive](https://web.archive.org/web/20190420151218/https://mp.weixin.qq.com/s/EOi3wY0d6K2z7cTZghscvA)|未注明出处,和上面是一家,误导(2017误导为2018)| 20 | |公众号|程序员之家|[链接](https://mp.weixin.qq.com/s/Q4rZdblmjPVJKuoFOMKRjA)|[Archive](https://web.archive.org/web/20190420151129/https://mp.weixin.qq.com/s/Q4rZdblmjPVJKuoFOMKRjA)|未注明出处,误导(2017误导为2018)| 21 | |公众号|千锋教育|[链接](https://mp.weixin.qq.com/s/Xro1BhiYsdwb5IOBPQtZbQ)|[Archive](https://web.archive.org/web/20190420151028/https://mp.weixin.qq.com/s/Xro1BhiYsdwb5IOBPQtZbQ)|未注明出处,误导(2017误导为2018)| 22 | |搜狐号|华俊竹传媒|[链接](http://m.sohu.com/a/277091383_120001579)||未注明出处,误导(2017误导为2018)| 23 | |企鹅号|码农有道|[链接](https://new.qq.com/omn/20181117/20181117B00KBW.html)||未注明出处,误导(2017误导为2018)| 24 | |CSDN| Exceed Oneself|[链接](https://blog.csdn.net/ll666634/article/details/79156271)||未注明出处,误导(2017误导为2018)| 25 | |公众号|Python人工智能|[链接](https://mp.weixin.qq.com/s/Umlu3HI8A-XnJcAK5U9g1Q)|[Archive](https://web.archive.org/web/20190420150859/https://mp.weixin.qq.com/s/Umlu3HI8A-XnJcAK5U9g1Q)|未注明出处,误导(2017误导为2018)| 26 | |公众号|资料在线|[链接](https://mp.weixin.qq.com/s/RiRSGVKzj0gkZoBnVYlX_Q)|[Archive](https://web.archive.org/web/20190420150643/https://mp.weixin.qq.com/s/RiRSGVKzj0gkZoBnVYlX_Q)|出处错误,误导(2017误导为2018)| 27 | |公众号|Java之猿程之家|[链接](https://mp.weixin.qq.com/s/H1QQ3cc64jZF_fHeP2gkQA)|[Archive](https://web.archive.org/web/20190420150511/https://mp.weixin.qq.com/s/H1QQ3cc64jZF_fHeP2gkQA)|未注明出处,误导(2017误导为2019)| 28 | |简书|Grady_Camel|[链接](https://www.jianshu.com/p/a33f7281568a)|[Archive](https://web.archive.org/web/20190420140556/https://www.jianshu.com/p/a33f7281568a)|篡改数据!| 29 | 30 | 以上公众号,博主,抄袭别人的文章,还篡改,造成误导。请大家取消关注,谢谢。 31 | 32 | 抄袭文章删除的,联系本人,从黑名单里~~删除~~。 33 | -------------------------------------------------------------------------------- /README.md: 
-------------------------------------------------------------------------------- 1 | # Stats of Chinese Developers 2 | # 统计中国程序员的就业情况 3 | 4 | This repo looks into Chinese job websites and compiles statistics. 5 | 根据招聘网站,统计程序员就业信息。 6 | 7 | ## Salary 程序员工资 8 | 9 | ![Salary Distribution](https://github.com/juwikuang/china_job_survey/blob/master/images/201904_salary_by_cities.png?raw=true) 10 | 11 | ## Programming Languages 编程语言 12 | 13 | | - | Language | Percentage | 14 | |---|------------|--------| 15 | | 1 | java | 29.28% | 16 | | 2 | cpp | 16.08% | 17 | | 3 | javascript | 15.09% | 18 | | 4 | c_sharp | 10.95% | 19 | | 5 | python | 8.21% | 20 | 21 | ![](https://github.com/juwikuang/job_survey/blob/master/images/2019_04_pl_word_cloud.png?raw=true) 22 | 23 | For users from China, please use this link to render the reports: 24 | 25 | [https://nbviewer.jupyter.org/](https://nbviewer.jupyter.org/) 26 | 27 | -------------------------------------------------------------------------------- /Untitled.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "provinces = {}\n", 10 | "provinces['北京'] = '010000'\n", 11 | "provinces['上海'] = '020000'\n", 12 | "provinces['广东'] = '030000'\n", 13 | "provinces['深圳'] = '040000'\n", 14 | "provinces['天津'] = '050000'\n", 15 | "provinces['重庆'] = '060000'\n", 16 | "provinces['江苏'] = '070000'\n", 17 | "provinces['浙江'] = '080000'\n", 18 | "provinces['四川'] = '090000'" 19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": 2, 24 | "metadata": {}, 25 | "outputs": [ 26 | { 27 | "data": { 28 | "text/plain": [ 29 | "{'北京': '010000',\n", 30 | " '上海': '020000',\n", 31 | " '广东': '030000',\n", 32 | " '深圳': '040000',\n", 33 | " '天津': '050000',\n", 34 | " '重庆': '060000',\n", 35 | " '江苏': '070000',\n", 36 | " '浙江': '080000',\n", 37 | " '四川': '090000'}" 38 | ] 39 | }, 40 | 
"execution_count": 2, 41 | "metadata": {}, 42 | "output_type": "execute_result" 43 | } 44 | ], 45 | "source": [ 46 | "provinces" 47 | ] 48 | }, 49 | { 50 | "cell_type": "code", 51 | "execution_count": 3, 52 | "metadata": {}, 53 | "outputs": [ 54 | { 55 | "data": { 56 | "text/plain": [ 57 | "dict_items([('北京', '010000'), ('上海', '020000'), ('广东', '030000'), ('深圳', '040000'), ('天津', '050000'), ('重庆', '060000'), ('江苏', '070000'), ('浙江', '080000'), ('四川', '090000')])" 58 | ] 59 | }, 60 | "execution_count": 3, 61 | "metadata": {}, 62 | "output_type": "execute_result" 63 | } 64 | ], 65 | "source": [ 66 | "provinces.items()" 67 | ] 68 | }, 69 | { 70 | "cell_type": "code", 71 | "execution_count": 7, 72 | "metadata": {}, 73 | "outputs": [ 74 | { 75 | "data": { 76 | "text/plain": [ 77 | "[('四川', '090000'),\n", 78 | " ('浙江', '080000'),\n", 79 | " ('江苏', '070000'),\n", 80 | " ('重庆', '060000'),\n", 81 | " ('天津', '050000'),\n", 82 | " ('深圳', '040000'),\n", 83 | " ('广东', '030000'),\n", 84 | " ('上海', '020000'),\n", 85 | " ('北京', '010000')]" 86 | ] 87 | }, 88 | "execution_count": 7, 89 | "metadata": {}, 90 | "output_type": "execute_result" 91 | } 92 | ], 93 | "source": [ 94 | "list(provinces.items())[::-1]" 95 | ] 96 | }, 97 | { 98 | "cell_type": "code", 99 | "execution_count": null, 100 | "metadata": {}, 101 | "outputs": [], 102 | "source": [] 103 | } 104 | ], 105 | "metadata": { 106 | "kernelspec": { 107 | "display_name": "Python 3", 108 | "language": "python", 109 | "name": "python3" 110 | }, 111 | "language_info": { 112 | "codemirror_mode": { 113 | "name": "ipython", 114 | "version": 3 115 | }, 116 | "file_extension": ".py", 117 | "mimetype": "text/x-python", 118 | "name": "python", 119 | "nbconvert_exporter": "python", 120 | "pygments_lexer": "ipython3", 121 | "version": "3.7.6" 122 | } 123 | }, 124 | "nbformat": 4, 125 | "nbformat_minor": 4 126 | } 127 | -------------------------------------------------------------------------------- /images/201904_salary_by_cities.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/EricWebsmith/china_job_survey/87c8db28d3ba28729b984b38fd990f385c8e45fb/images/201904_salary_by_cities.png -------------------------------------------------------------------------------- /images/2019_04_pl_pie.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EricWebsmith/china_job_survey/87c8db28d3ba28729b984b38fd990f385c8e45fb/images/2019_04_pl_pie.png -------------------------------------------------------------------------------- /images/2019_04_pl_word_cloud.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EricWebsmith/china_job_survey/87c8db28d3ba28729b984b38fd990f385c8e45fb/images/2019_04_pl_word_cloud.png -------------------------------------------------------------------------------- /py/black_list: -------------------------------------------------------------------------------- 1 | 四川虹美智能科技有限公司 2 | 软件与服务中心 3 | 4 | 5 | update _202003 set career='机器学习' where title like '%机器学习%' or title like '%深度学习%' or title like '%推荐系统%' or title like '%推荐算法%' or title like '%图像识别%' 6 | or title like '%人工智能%' or title like '%nlp%' or title like '%自然语言%' or title like '%aml%' or title like '%AI%' or title like '%数据科学家%' 7 | or title like '%data scientist%' or title like '%知识图谱%' 8 | or zhinengleibie in ('机器学习工程师','深度学习工程师') -------------------------------------------------------------------------------- /py/clean.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Wed Apr 1 23:52:57 2020 4 | 5 | @author: eric 6 | """ 7 | 8 | import config 9 | 10 | for table in config.table_list: 11 | company_titles="('"+"','".join(config.company_blacklist)+"')" 12 | sql=f'delete from {table} where company_title in {company_titles}' 13 | print(sql) 14 | 15 
# -*- coding: utf-8 -*-
"""
Reflection and text helpers (py/common.py).

Created on Wed Apr 3 11:44:37 2019

@author: eric
"""
import inspect

# Member-name prefixes that the reflection helpers treat as "not a feature":
# dunder attributes plus the get_* / check_* method naming convention.
_NON_FEATURE_PREFIXES = ('__', 'get_', 'check_')


def _is_feature_name(name):
    """Return True when *name* does not start with a non-feature prefix."""
    return not name.startswith(_NON_FEATURE_PREFIXES)


def is_letter_english(letter):
    """Return True when the code point of *letter* is at most 126."""
    return ord(letter) <= 126


def is_article_english(article):
    """Return True when more than 80% of the characters pass is_letter_english.

    An empty *article* has no characters to classify and yields False
    (the previous implementation raised ZeroDivisionError here).
    """
    if not article:
        return False
    english_letters = sum(map(is_letter_english, article))
    percentage = 100 * english_letters / len(article)
    return percentage > 80


def get_featurenames(o):
    """Return the feature member names of *o*, in inspect.getmembers order.

    Members whose name starts with ``__``, ``get_`` or ``check_`` are skipped.
    """
    return [name for name, _ in inspect.getmembers(o) if _is_feature_name(name)]


def object2list(job):
    """Return the feature member values of *job*.

    Uses the same filter and the same order as get_featurenames, so the two
    results line up element by element.
    """
    return [value for name, value in inspect.getmembers(job)
            if _is_feature_name(name)]


def object2dict(o):
    """Return ``{name: value}`` for every member of *o* that is not a dunder.

    Unlike get_featurenames/object2list, get_* and check_* members are kept.
    """
    return {name: value for name, value in inspect.getmembers(o)
            if not name.startswith('__')}
# Lookup tables live at module level rather than on the class: presumably
# Company objects are fed to reflection helpers that treat every class
# attribute not named __*/get_*/check_* as a data field - TODO confirm.

# Ownership types accepted by Company.get_company_type.
_COMPANY_TYPES = (
    '外资(欧美)', '外资(非欧美)', '合资', '国企', '民营公司', '外企代表处',
    '政府机关', '事业单位', '非营利组织', '上市公司', '创业公司',
)

# Site head-count tag -> normalised size bucket.
_COMPANY_SIZE_BY_TAG = {
    '少于50人': '50-',
    '50-150人': '50-150',
    '150-500人': '150-500',
    '500-1000人': '500-1000',
    '1000-5000人': '1000-5000',
    '5000-10000人': '5000-10000',
    '10000人以上': '10000+',
}

# Industry group -> site industry tags that belong to it.
_INDUSTRY_GROUPS = (
    # computer / internet / telecom / electronics
    ('computer', ['计算机软件', '计算机硬件', '计算机服务(系统、数据服务、维修)', '通信/电信/网络设备', '通信/电信运营、增值服务', '互联网/电子商务', '网络游戏', '电子技术/半导体/集成电路', '仪器仪表/工业自动化']),
    # accounting / finance / banking / insurance
    ('finance', ['会计/审计', '金融/投资/证券', '银行', '保险', '信托/担保/拍卖/典当']),
    # trade / consumer goods / manufacturing / operations
    ('trade', ['贸易/进出口', '批发/零售', '快速消费品(食品、饮料、化妆品)', '服装/纺织/皮革', '家具/家电/玩具/礼品', '奢侈品/收藏品/工艺品/珠宝', '办公用品及设备', '机械/设备/重工', '汽车及零配件']),
    # pharmaceutical / medical
    ('medical', ['制药/生物工程', '医疗/护理/卫生', '医疗设备/器械']),
    # advertising / media
    ('ads', ['广告', '公关/市场推广/会展', '影视/媒体/艺术/文化传播', '文字媒体/出版', '印刷/包装/造纸']),
    # real estate / construction
    ('realestate', ['房地产', '建筑/建材/工程', '家居/室内设计/装潢', '物业管理/商业中心']),
    # professional services / education / training
    ('edu', ['中介服务', '专业服务(咨询、人力资源、财会)', '外包服务', '检测,认证', '法律', '教育/培训/院校', '学术/科研', '租赁服务']),
    # services
    ('service', ['餐饮业', '酒店/旅游', '娱乐/休闲/体育', '美容/保健', '生活服务']),
    # logistics / transport
    ('logistic', ['交通/运输/物流', '航天/航空']),
    # energy / raw materials
    ('energy', ['石油/化工/矿产/地质', '采掘业/冶炼', '电气/电力/水利', '新能源', '原材料和加工']),
    # government / non-profit / other
    ('gov', ['政府/公共事业', '非营利组织', '环保', '农/林/牧/渔', '多元化业务集团公司']),
)

# Flattened tag -> group lookup; a tag listed in two groups would resolve to
# the later group, which is what the original chain of `if` statements did.
_INDUSTRY_BY_TAG = {
    tag: group for group, tags in _INDUSTRY_GROUPS for tag in tags
}


class Company():
    """Company attributes parsed from job-site tags.

    The ``get_*`` setters return ``self`` so calls can be chained with the
    matching ``check_*`` predicate, e.g.
    ``company.get_company_type(tag).check_company_type()``.
    """

    company_title = ""

    company_description = ""

    # Ownership type, stored as the raw site tag. Accepted values:
    # foreign-owned (Europe/US), foreign-owned (non-Europe/US), joint venture,
    # state-owned enterprise, private company, foreign representative office,
    # government agency, public institution, non-profit organisation,
    # listed company, startup.
    company_type = ''

    # Head-count bucket: '50-', '50-150', '150-500', '500-1000',
    # '1000-5000', '5000-10000' or '10000+'.
    company_size = ''

    # Industry group: 'computer', 'finance', 'trade', 'medical', 'ads',
    # 'realestate', 'edu', 'service', 'logistic', 'energy' or 'gov'.
    industry = ''

    @staticmethod
    def get_company_tags(company_link):
        """Download *company_link* and return the '|'-separated ``.ltype`` tags.

        The response is decoded as GBK. Returns an empty list when the page
        has no element matching the ``.ltype`` selector.

        Declared a staticmethod because the function never took ``self``;
        ``Company.get_company_tags(link)`` keeps working and the call is now
        also valid on an instance.
        """
        # Imported lazily so the pure tag-mapping methods can be used without
        # the scraping dependencies installed.
        from requests import get
        from bs4 import BeautifulSoup

        response = get(company_link)
        response.encoding = 'gbk'
        soup = BeautifulSoup(response.text, 'html.parser')
        ltype_tag = soup.select_one('.ltype')
        if not ltype_tag:
            return []
        return [info.strip() for info in ltype_tag.text.split('|')]

    def check_company_size(self):
        """Return True once a company size has been recognised."""
        return self.company_size != ''

    def get_company_size(self, tag):
        """Set ``company_size`` from a head-count *tag*; ignore unknown tags.

        Returns ``self``.
        """
        size = _COMPANY_SIZE_BY_TAG.get(tag)
        if size is not None:
            self.company_size = size
        return self

    def get_company_type(self, tag):
        """Set ``company_type`` to *tag* when it is a known ownership type.

        Returns ``self``. The original list was missing a comma between
        '上市公司' and '创业公司', which fused them into one string so neither
        value was ever accepted.
        """
        if tag in _COMPANY_TYPES:
            self.company_type = tag
        return self

    def check_company_type(self):
        """Return True once a company type has been recognised."""
        return not self.company_type==''

        # NOTE(review): everything below is unreachable - it follows the
        # return above. It also reads `soup`, `job` and `company_title_tag`,
        # none of which are defined in this scope, and calls get_company_tags
        # as a bare name. It looks like scraper code pasted in from another
        # function; kept verbatim until someone confirms where it belongs.

        # company info
        company_info_tag=soup.find('span',text='公司信息')
        if company_info_tag:
            job.company_description=company_info_tag.parent.find_next('div').text.replace('\xa0',' ').strip()

        # e.g. ['民营公司', '150-500人', '服装/纺织/皮革']
        company_tags=[p.text.strip() for p in soup.select('.com_tag .at')]

        if len(company_tags)>0:
            job.get_company_type(company_tags[0])
        if job.company_type=='':
            company_link=company_title_tag.attrs['href']
            company_tags=get_company_tags(company_link)
            for tag in company_tags:
                if job.get_company_type(tag).check_company_type():
                    break

        if job.company_type=='':
            return None

        job.get_company_size(company_tags[1])
        if not job.check_company_size():
            company_link=company_title_tag.attrs['href']
            company_tags=get_company_tags(company_link)
            for tag in company_tags:
                if job.get_company_size(tag).check_company_size():
                    break

        # computer / internet / telecom / electronics
        industry_tags=[p.text.strip() for p in soup.select('.com_tag .at a') if not p.text=='']

        if len(industry_tags)==0:
            company_link=soup.select_one('.com_name').attrs['href']
            industry_tags=get_company_tags(company_link)

        for industry_tag in industry_tags:
            job.get_industry(industry_tag)

    def check_industry(self):
        """Return True once an industry group has been recognised."""
        return self.industry != ''

    def get_industry(self, industry_tag):
        """Set ``industry`` to the group of *industry_tag*; ignore unknown tags.

        Returns ``self``.
        """
        group = _INDUSTRY_BY_TAG.get(industry_tag)
        if group is not None:
            self.industry = group
        return self
zhinengleibies['0124']='Python开发工程师' 26 | zhinengleibies['0126']='.NET开发工程师' 27 | 28 | zhinengleibies['0128']='区块链开发' 29 | zhinengleibies['0129']='Hadoop工程师' 30 | zhinengleibies['0130']='大数据开发工程师' 31 | zhinengleibies['0131']='爬虫开发工程师' 32 | zhinengleibies['0132']='脚本开发工程师' 33 | 34 | zhinengleibies['0143']='系统架构设计师' 35 | zhinengleibies['0151']='Ruby开发工程师' 36 | zhinengleibies['0152']='Go开发工程师' 37 | #前端 38 | zhinengleibies['7201']='Web前端开发' 39 | zhinengleibies['7202']='HTML5开发工程师' 40 | zhinengleibies['7203']='前端开发' 41 | #人工智能 42 | zhinengleibies['7301']='机器学习工程师' 43 | zhinengleibies['7302']='深度学习工程师' 44 | zhinengleibies['7303']='图像算法工程师' 45 | zhinengleibies['7304']='图像处理工程师' 46 | zhinengleibies['7305']='图像识别工程师' 47 | zhinengleibies['7306']='语音识别工程师' 48 | zhinengleibies['7307']='机器视觉工程师' 49 | zhinengleibies['7308']='自然语言处理(NLP)' 50 | zhinengleibies['7309']='算法工程师' 51 | zhinengleibies['7310']='推荐算法工程师' 52 | zhinengleibies['7311']='搜索算法工程师' 53 | zhinengleibies['7312']='人工智能' 54 | 55 | #设计 56 | zhinengleibies['7405']='网站架构设计师' 57 | #数据 58 | zhinengleibies['7501']='数据分析师' 59 | zhinengleibies['7502']='数据分析经理主管' 60 | zhinengleibies['7503']='ETL开发工程师' 61 | zhinengleibies['7504']='BI工程师' 62 | zhinengleibies['7505']='数据仓库工程师' 63 | zhinengleibies['7506']='数据采集工程师' 64 | zhinengleibies['7507']='数据建模工程师' 65 | zhinengleibies['7508']='数据治理工程师' 66 | zhinengleibies['7509']='数据' 67 | #移动开发 68 | zhinengleibies['7701']='Android开发工程师' 69 | zhinengleibies['7702']='iOS开发工程师' 70 | zhinengleibies['7703']='移动开发工程师' 71 | zhinengleibies['7704']='移动开发工程师' 72 | zhinengleibies['7705']='小程序开发工程师' 73 | #游戏 74 | zhinengleibies['7809']='游戏开发工程师' 75 | zhinengleibies['7810']='Cocos2d-x开发工程师' 76 | zhinengleibies['7811']='Unity3d开发工程师' 77 | zhinengleibies['7812']='游戏客户端开发工程师' 78 | zhinengleibies['7813']='游戏服务端开发工程师' 79 | #嵌入式 80 | zhinengleibies['2910']='嵌入式软件开发' 81 | 82 | 83 | 84 | provinces = {} 85 | provinces['北京'] = '010000' 86 | provinces['上海'] = '020000' 87 | provinces['广东'] = '030000' 88 | 
# -*- coding: utf-8 -*-
"""
Database access helpers (py/db.py).

Created on Tue Apr 2 22:46:54 2019

@author: eric
"""

import functools
from urllib.parse import quote_plus

import pandas as pd


@functools.lru_cache(maxsize=None)
def _get_engine():
    """Build the SQLAlchemy engine for the local ``it_jobs`` database once.

    Cached so repeated get_conn() calls share one engine instead of creating
    a new one per call.
    """
    # Imported lazily so get_data can be used with any DB-API connection even
    # when SQLAlchemy is not installed.
    from sqlalchemy import create_engine

    params = quote_plus("DRIVER={SQL Server Native Client 11.0};"
                        "SERVER=localhost;"
                        "DATABASE=it_jobs;"
                        "Trusted_Connection=yes;")
    return create_engine("mssql+pyodbc:///?odbc_connect={}".format(params))


def get_conn():
    """Return a new connection to the local ``it_jobs`` SQL Server database.

    The caller is responsible for closing the returned connection.
    """
    return _get_engine().connect()


def get_data(sqlOrTableName, connection, params=None):
    """Run a query, or read a whole table, and return the rows as a DataFrame.

    Args:
        sqlOrTableName: either a SQL statement or a bare table name. A value
            that is a single whitespace-free token (after trimming) is treated
            as a table name and expanded to ``select * from <name>``.
        connection: connection object handed to ``pandas.read_sql``.
        params: optional query parameters forwarded to ``pandas.read_sql``.

    Returns:
        The ``pandas.DataFrame`` produced by ``pandas.read_sql``.
    """
    # split() with no argument splits on any whitespace run, so padded table
    # names and SQL that uses newlines or tabs are classified correctly
    # (split(' ') got both of those wrong).
    tokens = sqlOrTableName.split()
    if len(tokens) == 1:
        # NOTE: the table name is interpolated, not bound - only pass trusted
        # table names here.
        sql = "select * from {}".format(tokens[0])
    else:
        sql = sqlOrTableName
    return pd.read_sql(sql, con=connection, params=params)
"""Count downloaded pages per job category and deal the categories into groups.

For every job category in ``config.zhinengleibies`` the script counts the files
found under ``<data_folder>/<category>/<province>/`` and then distributes the
categories, in ascending order of file count, round-robin over ``n_splitters``
lists (``zhineng_splitters``).
"""
from multiprocessing import Process

import numpy as np
import os

import glob

from config import year, month, company_blacklist, title_key_blacklist, zhinengleibies

year_month = f'{year}{month:02}'

provinces = ['北京', '上海', '广东', '深圳', '天津', '重庆', '江苏', '浙江', '四川', '海南', '福建', '山东', '江西', '广西', '安徽', '河北', '河南', '湖北', '湖南', '陕西', '山西', '黑龙江', '辽宁', '吉林', '云南', '贵州', '甘肃', '内蒙古', '宁夏', '西藏', '新疆', '青海']
data_folder = '../../data/51jobs_{}/'.format(year_month)
back_folder = '../../data/51jobs_{}_b/'.format(year_month)

n_splitters = 4

# Defined before first use: the original assigned znlbs only after the loop
# that reads it, which raised NameError.
znlbs = list(zhinengleibies.values())

# The counting loop walks the categories in reverse. `counts` (and therefore
# the argsort indices below) are positions in THIS list, so it must also be
# the list that is indexed when building the groups.
categories = znlbs[::-1]

# category name -> number of downloaded files across all provinces
d = {}

counts = []
for zhinengleibie in categories:
    count = 0
    for province in provinces:
        files = glob.glob(f'{data_folder}{zhinengleibie}/{province}/*.*')
        count += len(files)
    d[zhinengleibie] = count
    counts.append(count)

# Indices into `categories`, smallest file count first.
orders = np.argsort(counts)

# Deal the sorted categories round-robin so each group gets a mix of small
# and large categories.
zhineng_splitters = [[] for _ in range(n_splitters)]
for i, category_index in enumerate(orders):
    zhineng_splitters[i % n_splitters].append(categories[category_index])
def main():
    """Download 51job job-detail pages for every category/province pair.

    For each job category and each province a folder
    ``<data_folder>/<category name>/<province name>`` is created, the search
    result list is walked page by page, and every job-detail link found on a
    page is fetched into that folder on its own thread. Files that already
    exist are not downloaded again.
    """
    # Local import: the module header only imports ``mkdir``, which cannot
    # create the missing parent folders of a fresh data directory.
    from os import makedirs

    # Province (or municipality) name -> 51job area code.
    provinces = {
        '北京': '010000', '上海': '020000', '广东': '030000', '深圳': '040000',
        '天津': '050000', '重庆': '060000', '江苏': '070000', '浙江': '080000',
        '四川': '090000', '海南': '100000', '福建': '110000', '山东': '120000',
        '江西': '130000', '广西': '140000', '安徽': '150000', '河北': '160000',
        '河南': '170000', '湖北': '180000', '湖南': '190000', '陕西': '200000',
        '山西': '210000', '黑龙江': '220000', '辽宁': '230000', '吉林': '240000',
        '云南': '250000', '贵州': '260000', '甘肃': '270000', '内蒙古': '280000',
        '宁夏': '290000', '西藏': '300000', '新疆': '310000', '青海': '320000',
    }

    # 51job function code -> category name (01xx is software, 25xx is internet).
    categories = {
        '0106': '高级软件工程师',
        '0107': '软件工程师',
        '0109': '机器学习工程师',
        '0110': '深度学习工程师',
        '0111': '图像算法工程师',
        '0112': '图像处理工程师',
        '0113': '语音识别工程师',
        '0114': '图像识别工程师',
        '0115': '机器视觉工程师',
        '0116': '自然语言处理(NLP)',
        '0148': '算法工程师',
        '0143': '系统架构设计师',
        '2501': '互联网软件开发工程师',
        '2537': '手机应用开发工程师',
        '2512': '网站架构设计师',
    }

    # "&degreefrom" had been corrupted into "°reefrom" in both query strings.
    first_page_template = 'https://search.51job.com/list/{0},000000,{1},00,9,99,%2520,2,1.html?lang=c&stype=&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&providesalary=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=&dibiaoid=0&address=&line=&specialarea=00&from=&welfare='
    list_page_template = 'https://search.51job.com/list/{0},000000,{1},00,9,99,%2B,2,{2}.html?lang=c&stype=1&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=&dibiaoid=0&address=&line=&specialarea=00&from=&welfare='

    def fetch_soup(url):
        """Fetch *url* and parse it as a gb2312-encoded HTML page."""
        page = get(url)
        page.encoding = 'gb2312'
        return BeautifulSoup(page.text, "html.parser")

    def download_pages(links, folder):
        """Start one download thread per not-yet-saved job-detail link."""
        for link in links:
            try:
                if not link.startswith("https://jobs.51job.com/"):
                    continue
                filename = path.split(link)[-1].split('?')[0]
                destination_file = path.join(folder, filename)
                if not path.isfile(destination_file):
                    print(link)
                    t = threading.Thread(target=urlretrieve, args=(link, destination_file))
                    t.start()
            except Exception as e:
                # Best effort: one bad link must not stop the whole crawl.
                print(str(e))

    for category_key, category_name in categories.items():
        job_category_folder = path.join(data_folder, category_name)
        for province_name, province_code in provinces.items():
            province_folder = path.join(job_category_folder, province_name)
            makedirs(province_folder, exist_ok=True)

            # The first result page carries the pager text with the page count.
            soup = fetch_soup(first_page_template.format(province_code, category_key))
            pager = soup.select_one(".p_in .td")
            found = re.search(r'共(\d+)页', pager.text) if pager is not None else None
            if found is None:
                print("{0}: page count not found, skipped".format(province_name))
                continue
            total_page = int(found.group(1))
            print("{0} has {1} pages".format(province_name, total_page))

            # Pages are numbered 1..total_page inclusive; the previous
            # range(1, total_page) never fetched the last page.
            for page_index in range(1, total_page + 1):
                soup = fetch_soup(list_page_template.format(province_code, category_key, page_index))
                links = [tag.attrs['href'] for tag in soup.select(".t1 a")]
                download_pages(links, province_folder)


if __name__ == '__main__':
    main()
#https://search.51job.com/list/040000,000000,0100,00,9,99,%2520,2,1.html?lang=c&stype=&postchannel=0000&workyear=99&cotype=99°reefrom=99&jobterm=99&companysize=99&providesalary=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=&dibiaoid=0&address=&line=&specialarea=00&from=&welfare= 150 | #hangzhou page 1 151 | #https://search.51job.com/list/080200,000000,0100,00,9,99,%2520,2,1.html?lang=c&stype=&postchannel=0000&workyear=99&cotype=99°reefrom=99&jobterm=99&companysize=99&providesalary=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=&dibiaoid=0&address=&line=&specialarea=00&from=&welfare= 152 | 153 | #beijing page 2 %2B=+ 154 | #https://search.51job.com/list/010000,000000,0100,00,9,99,%2B,2,2.html?lang=c&stype=1&postchannel=0000&workyear=99&cotype=99°reefrom=99&jobterm=99&companysize=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=&dibiaoid=0&address=&line=&specialarea=00&from=&welfare= 155 | #https://search.51job.com/list/010000,000000,0100,00,9,99,%2B,2,3.html?lang=c&stype=1&postchannel=0000&workyear=99&cotype=99°reefrom=99&jobterm=99&companysize=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=&dibiaoid=0&address=&line=&specialarea=00&from=&welfare= 156 | 157 | #beijiang page 1 158 | #https://search.51job.com/list/010000,000000,0100,00,9,99,%2B,2,1.html?lang=c&stype=1&postchannel=0000&workyear=99&cotype=99°reefrom=99&jobterm=99&companysize=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=&dibiaoid=0&address=&line=&specialarea=00&from=&welfare= 159 | 160 | 161 | -------------------------------------------------------------------------------- /py/stats.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Sat Oct 5 23:49:42 2019 4 | 5 | @author: eric 6 | """ 7 | import numpy as np 8 | import pandas as pd 9 | import weighted 10 | import db 11 | from config import year, month 12 | 13 | 14 | conn=db.get_conn() 15 | 16 | 
#month=1

# ---- City stats: headcount-weighted mean salary for each listed city ----
cities = """
('北京','上海','深圳','杭州','广州','南京','苏州','成都','东莞','西安','武汉','天津','长沙',
'宁波','福州','大连','重庆','青岛','济南','合肥','长春','昆明','郑州','沈阳','哈尔滨','厦门')
"""

sql = f"""select SUM(monthly_salary * headcount)/SUM(headcount) as salary, MAX(city) as city
from jobs where year_month={year}{month:02} and monthly_salary>0 and monthly_salary<80000 and city in {cities}
group by city
"""

result = conn.execute(sql).fetchall()

# Replace whatever was stored for this month, so a re-run does not duplicate rows.
conn.execute(f"delete from City_Stats where year_month='{year}{month:02}'")

# NOTE(review): values are interpolated into the SQL text; they come from the
# jobs table itself, but bound parameters would be safer.
insert_statements = [
    "insert into City_Stats(year_month, City, Salary) "
    f" values('{year}{month:02}', '{city}', {salary});\n"
    for salary, city in result
]
# Nothing to insert when the month has no rows: do not send an empty batch.
if insert_statements:
    conn.execute("".join(insert_statements))


# ---- Monthly stats over all jobs ----
def get_summary(data, career):
    """Print and return the headcount-weighted salary summary of *data*.

    Args:
        data: DataFrame with ``monthly_salary`` and ``headcount`` columns.
        career: Job title used in the printed sentence.

    Returns:
        Tuple ``(head_count, salary_average, median)``: the total headcount,
        the weighted mean salary truncated to an int, and the weighted
        median (the 0.5 weighted quantile).
    """
    salaries = data.monthly_salary.values
    headcounts = data.headcount.values
    head_count = np.sum(headcounts)
    salary_average = int(np.average(salaries, weights=headcounts))
    # 2.5%, 50% and 97.5% weighted quantiles: median plus the central 95% band.
    q = weighted.weighted_quantile(salaries, [0.025, 0.5, 0.975], headcounts)
    print(f"{year}年{month}月全国招收{career}{head_count}人。{year}年{month}月全国{career}平均工资{salary_average:.0f}元,工资中位数{q[1]:.0f}元,其中95%的人的工资介于{q[0]:.0f}元到{q[2]:.0f}元。\r\n")
    return head_count, salary_average, q[1]


data = pd.read_sql(sql=f"select * from jobs where year_month= {year}{month:02} and monthly_salary>0 and monthly_salary<80000", con=conn)
headcount, mean, median = get_summary(data, '程序员')
conn.execute(f"delete from general_Stats where year_month='{year}{month:02}'")
sql = "insert into general_Stats(year_month, Salary_Mean, Salary_Median, JD_Count, Head_Count)"
sql = sql + f" values('{year}{month:02}',{mean},{median},{data.shape[0]},{headcount})"
conn.execute(sql)

conn.close()
import numpy as np


def weighted_mean(values, weights):
    """Return the weighted arithmetic mean of *values*.

    Both arguments may be any array-like (lists included) of equal length.
    """
    values = np.asarray(values)
    weights = np.asarray(weights)
    return np.sum(values * weights) / weights.sum()


def weighted_median(values, weights):
    """Return the weighted median of *values*.

    The result is the smallest value whose cumulative weight, taken in
    ascending value order, reaches half of the total weight.

    Raises:
        ValueError: if *values* is empty.
    """
    values = np.asarray(values)
    weights = np.asarray(weights)
    if values.size == 0:
        raise ValueError("weighted_median() needs at least one value")
    sorter = np.argsort(values)
    values = values[sorter]
    cumulative = np.cumsum(weights[sorter])
    # argmax on a boolean array gives the first True; it gives 0 when no
    # entry qualifies, which matches the previous loop's default index.
    index = int(np.argmax(cumulative >= np.sum(weights) / 2))
    return values[index]


#https://stackoverflow.com/questions/21844024/weighted-percentile-using-numpy

def weighted_quantile(values, quantiles, sample_weight=None,
                      values_sorted=False, old_style=False):
    """ Very close to numpy.percentile, but supports weights.
    NOTE: quantiles should be in [0, 1]!
    :param values: array-like with data
    :param quantiles: array-like with many quantiles needed
    :param sample_weight: array-like of the same length as `values`;
        every sample weighs 1 when omitted
    :param values_sorted: bool, if True, then will avoid sorting of
        initial array
    :param old_style: if True, will correct output to be consistent
        with numpy.percentile.
    :return: numpy.array with computed quantiles.
    """
    values = np.array(values)
    quantiles = np.array(quantiles)
    if sample_weight is None:
        sample_weight = np.ones(len(values))
    sample_weight = np.array(sample_weight)
    assert np.all(quantiles >= 0) and np.all(quantiles <= 1), \
        'quantiles should be in [0, 1]'

    if not values_sorted:
        sorter = np.argsort(values)
        values = values[sorter]
        sample_weight = sample_weight[sorter]

    # Each sample sits at the midpoint of its own weight interval.
    weighted_quantiles = np.cumsum(sample_weight) - 0.5 * sample_weight
    if old_style:
        # To be convenient with numpy.percentile
        weighted_quantiles -= weighted_quantiles[0]
        weighted_quantiles /= weighted_quantiles[-1]
    else:
        weighted_quantiles /= np.sum(sample_weight)
    return np.interp(quantiles, weighted_quantiles, values)
_WATERMARK = "https://github.com/juwikuang/china_job_survey"
_LEGEND = "(城市大小代表招聘数量,颜色代表工资,红色最高,黄色次之,蓝最少)"

# Label nudges, in map projection units, that keep neighbouring labels apart.
_CITY_LABEL_OFFSETS = {
    '杭州': (-400000, 10000),
    '广州': (-400000, 10000),
    '合肥': (-300000, 10000),
    '深圳': (0, -100000),
    '南京': (-100000, 0),
    '天津': (0, -50000),
    '上海': (50000, 0),
    '武汉': (0, -50000),
    '苏州': (0, -100000),
    '宁波': (0, -100000),
}
_PROVINCE_LABEL_OFFSETS = {
    '浙江': (0, -100000),
    '安徽': (-300000, 10000),
    '江苏': (-150000, 0),
    '天津': (0, -50000),
    '上海': (50000, 0),
    '湖北': (0, -50000),
}


def _salary_color(salary, salary_min, salary_max):
    """Map a salary onto the blue -> yellow -> red scale as '#rrggbb'."""
    half_range = (salary_max - salary_min) / 2
    if not half_range > 0:
        # Single row or all salaries equal: the scale is degenerate (this
        # used to divide by zero); use the mid-scale yellow.
        return '#ffff00'
    salary_middle = (salary_min + salary_max) / 2
    if salary > salary_middle:
        # Upper half: yellow fades to red as the salary approaches the max.
        red = 255
        green = int((salary_max - salary) / half_range * 255)
        blue = 0
    else:
        # Lower half: blue fades to yellow as the salary approaches the middle.
        blue = int((salary_middle - salary) / half_range * 255)
        red = green = int((salary - salary_min) / half_range * 255)
    channels = [max(0, min(255, c)) for c in (red, green, blue)]
    return '#{:02x}{:02x}{:02x}'.format(*channels)


def _draw_salary_map(data, headcount_scale, title, data_location, salaries,
                     label_offsets, watermark_x):
    """Draw one marker and one label per row of *data* on a map of China."""
    plt.rcParams['figure.figsize'] = [13, 13]
    fig, ax = plt.subplots()
    fig.set_dpi(80)
    # 'lcc' selects the Lambert conformal conic projection.
    cn_map = Basemap(llcrnrlon=77, llcrnrlat=14, urcrnrlon=140, urcrnrlat=51,
                     projection='lcc', lat_1=33, lat_2=45, lon_0=100)

    # Province outlines of mainland China and Taiwan.
    cn_map.readshapefile(r'D:/data/basemap/gadm36_CHN_shp/gadm36_CHN_1', name='china', drawbounds=True, color='gray')
    cn_map.readshapefile(r'D:/data/basemap/gadm36_TWN_shp/gadm36_TWN_1', name='taiwan', drawbounds=True, color='gray')

    salary_min = salaries.min()
    salary_max = salaries.max()

    last_label, last_salary_k = '', 0
    for _, row in data.iterrows():
        # Columns are read by position: 0 = name, 1 = mean salary,
        # 5 = headcount. .iloc is used because plain row[0] on a
        # label-indexed Series is deprecated positional access.
        label = row.iloc[0]
        salary = row.iloc[1]
        headcount = row.iloc[5]
        longitude = data_location.loc[label, 'longitude']
        latitude = data_location.loc[label, 'latitude']
        color = _salary_color(salary, salary_min, salary_max)
        size = int(math.sqrt(headcount / headcount_scale))

        x, y = cn_map(longitude, latitude)
        cn_map.plot(x, y, marker='o', color=color, markersize=size + 8)

        # The marker stays on the real location; only the label is nudged.
        dx, dy = label_offsets.get(label, (0, 0))
        last_label, last_salary_k = label, np.round(salary / 1000)
        ax.text(x + dx, y + dy, "{}{:.0f}k".format(label, last_salary_k), fontweight='bold', fontsize=size + 13, bbox={'facecolor': color, 'alpha': 0.3, 'pad': 0})

    for watermark_xy in ((watermark_x, 1077845), (205805, 107845)):
        ax.text(watermark_xy[0], watermark_xy[1], _WATERMARK, fontweight='bold', color='#999999', fontsize=20, bbox={'facecolor': '#eeeeee', 'alpha': 0.4, 'pad': 0})
    # Kept for backward compatibility: a title containing "{}" placeholders
    # is filled with the last row's name and salary (in thousands).
    ax.text(805805, 4007845, title.format(last_label, last_salary_k), fontweight='bold', color='#111111', fontsize=25)
    ax.text(805805, 3807845, _LEGEND, fontweight='bold', color='#111111', fontsize=13)
    plt.show()


def draw_city_map(data_city, headcount_scale, title):
    """Plot per-city salary and hiring volume on a map of China.

    Args:
        data_city: DataFrame with a '平均工资' column whose positional
            columns 0, 1 and 5 hold city name, mean salary and headcount.
        headcount_scale: Divisor applied to the headcount before the square
            root that sizes markers and labels.
        title: Text drawn at the top of the map.
    """
    data_location = pd.read_csv('../city_locations.csv').set_index('city')
    _draw_salary_map(data_city, headcount_scale, title, data_location,
                     data_city['平均工资'], _CITY_LABEL_OFFSETS, 1100000)


def draw_province_map(data_city, headcount_scale, title):
    """Plot per-province salary and hiring volume on a map of China.

    Args:
        data_city: DataFrame with a ``salary_mean`` column whose positional
            columns 0, 1 and 5 hold province name, mean salary and headcount.
        headcount_scale: Divisor applied to the headcount before the square
            root that sizes markers and labels.
        title: Text drawn at the top of the map.
    """
    data_location = pd.read_csv('../geo_data/provincial_capital_locations.csv', encoding='utf-8').set_index('province')
    _draw_salary_map(data_city, headcount_scale, title, data_location,
                     data_city.salary_mean, _PROVINCE_LABEL_OFFSETS, 2053805)


def describe(data_city, career):
    """Print one summary sentence per row of *data_city*.

    Positional columns: 0 name, 1 mean salary, 2 lower bound, 3 median,
    4 upper bound, 5 headcount. ``year`` and ``month`` come from config.
    """
    for _, row in data_city.iterrows():
        place = row.iloc[0]
        # The second sentence used to hard-code "2019" instead of the report year.
        print(f"{year}年{month}月{place}招收{career}{row.iloc[5]}人。{year}年{month}月{place}{career}平均工资{row.iloc[1]:.0f}元,工资中位数{row.iloc[3]:.0f}元,其中95%的人的工资介于{row.iloc[2]:.0f}元到{row.iloc[4]:.0f}元。\r\n")
_WATERMARK = "https://github.com/juwikuang/china_job_survey"
_LEGEND = "(城市大小代表招聘数量,颜色代表工资,红色最高,黄色次之,蓝最少)"

# Label nudges, in map projection units, that keep neighbouring labels apart.
_CITY_LABEL_OFFSETS = {
    '杭州': (-400000, 10000),
    '广州': (-400000, 10000),
    '合肥': (-300000, 10000),
    '深圳': (0, -100000),
    '南京': (-100000, 0),
    '天津': (0, -50000),
    '上海': (50000, 0),
    '武汉': (0, -50000),
    '苏州': (0, -100000),
    '宁波': (0, -100000),
}
_PROVINCE_LABEL_OFFSETS = {
    '浙江': (0, -100000),
    '安徽': (-300000, 10000),
    '江苏': (-150000, 0),
    '天津': (0, -50000),
    '上海': (50000, 0),
    '湖北': (0, -50000),
}


def _salary_color(salary, salary_min, salary_max):
    """Map a salary onto the blue -> yellow -> red scale as '#rrggbb'."""
    half_range = (salary_max - salary_min) / 2
    if not half_range > 0:
        # Single row or all salaries equal: the scale is degenerate (this
        # used to divide by zero); use the mid-scale yellow.
        return '#ffff00'
    salary_middle = (salary_min + salary_max) / 2
    if salary > salary_middle:
        # Upper half: yellow fades to red as the salary approaches the max.
        red = 255
        green = int((salary_max - salary) / half_range * 255)
        blue = 0
    else:
        # Lower half: blue fades to yellow as the salary approaches the middle.
        blue = int((salary_middle - salary) / half_range * 255)
        red = green = int((salary - salary_min) / half_range * 255)
    channels = [max(0, min(255, c)) for c in (red, green, blue)]
    return '#{:02x}{:02x}{:02x}'.format(*channels)


def _draw_salary_map(data, headcount_scale, title, data_location, salaries,
                     label_offsets, watermark_x):
    """Draw one marker and one label per row of *data* on a map of China."""
    plt.rcParams['figure.figsize'] = [13, 13]
    fig, ax = plt.subplots()
    fig.set_dpi(80)
    # 'lcc' selects the Lambert conformal conic projection.
    cn_map = Basemap(llcrnrlon=77, llcrnrlat=14, urcrnrlon=140, urcrnrlat=51,
                     projection='lcc', lat_1=33, lat_2=45, lon_0=100)

    # Province outlines of mainland China and Taiwan.
    cn_map.readshapefile(r'D:/data/basemap/gadm36_CHN_shp/gadm36_CHN_1', name='china', drawbounds=True, color='gray')
    cn_map.readshapefile(r'D:/data/basemap/gadm36_TWN_shp/gadm36_TWN_1', name='taiwan', drawbounds=True, color='gray')

    salary_min = salaries.min()
    salary_max = salaries.max()

    last_label, last_salary_k = '', 0
    for _, row in data.iterrows():
        # Columns are read by position: 0 = name, 1 = mean salary,
        # 5 = headcount. .iloc is used because plain row[0] on a
        # label-indexed Series is deprecated positional access.
        label = row.iloc[0]
        salary = row.iloc[1]
        headcount = row.iloc[5]
        longitude = data_location.loc[label, 'longitude']
        latitude = data_location.loc[label, 'latitude']
        color = _salary_color(salary, salary_min, salary_max)
        size = int(math.sqrt(headcount / headcount_scale))

        x, y = cn_map(longitude, latitude)
        cn_map.plot(x, y, marker='o', color=color, markersize=size + 8)

        # The marker stays on the real location; only the label is nudged.
        dx, dy = label_offsets.get(label, (0, 0))
        last_label, last_salary_k = label, np.round(salary / 1000)
        ax.text(x + dx, y + dy, "{}{:.0f}k".format(label, last_salary_k), fontweight='bold', fontsize=size + 13, bbox={'facecolor': color, 'alpha': 0.3, 'pad': 0})

    for watermark_xy in ((watermark_x, 1077845), (205805, 107845)):
        ax.text(watermark_xy[0], watermark_xy[1], _WATERMARK, fontweight='bold', color='#999999', fontsize=20, bbox={'facecolor': '#eeeeee', 'alpha': 0.4, 'pad': 0})
    # Kept for backward compatibility: a title containing "{}" placeholders
    # is filled with the last row's name and salary (in thousands).
    ax.text(805805, 4007845, title.format(last_label, last_salary_k), fontweight='bold', color='#111111', fontsize=25)
    ax.text(805805, 3807845, _LEGEND, fontweight='bold', color='#111111', fontsize=13)
    plt.show()


def draw_city_map(data_city, headcount_scale, title):
    """Plot per-city salary and hiring volume on a map of China.

    Args:
        data_city: DataFrame with a '平均工资' column whose positional
            columns 0, 1 and 5 hold city name, mean salary and headcount.
        headcount_scale: Divisor applied to the headcount before the square
            root that sizes markers and labels.
        title: Text drawn at the top of the map.
    """
    data_location = pd.read_csv('../city_locations.csv').set_index('city')
    _draw_salary_map(data_city, headcount_scale, title, data_location,
                     data_city['平均工资'], _CITY_LABEL_OFFSETS, 1100000)


def draw_province_map(data_city, headcount_scale, title):
    """Plot per-province salary and hiring volume on a map of China.

    Args:
        data_city: DataFrame with a ``salary_mean`` column whose positional
            columns 0, 1 and 5 hold province name, mean salary and headcount.
        headcount_scale: Divisor applied to the headcount before the square
            root that sizes markers and labels.
        title: Text drawn at the top of the map.
    """
    data_location = pd.read_csv('../geo_data/provincial_capital_locations.csv', encoding='utf-8').set_index('province')
    _draw_salary_map(data_city, headcount_scale, title, data_location,
                     data_city.salary_mean, _PROVINCE_LABEL_OFFSETS, 2053805)


def describe(data_city, career):
    """Print one summary sentence per row of *data_city*.

    Positional columns: 0 name, 1 mean salary, 2 lower bound, 3 median,
    4 upper bound, 5 headcount. ``year`` and ``month`` come from config.
    """
    for _, row in data_city.iterrows():
        place = row.iloc[0]
        # The second sentence used to hard-code "2019", which is wrong for
        # every report generated after 2019.
        print(f"{year}年{month}月{place}招收{career}{row.iloc[5]}人。{year}年{month}月{place}{career}平均工资{row.iloc[1]:.0f}元,工资中位数{row.iloc[3]:.0f}元,其中95%的人的工资介于{row.iloc[2]:.0f}元到{row.iloc[4]:.0f}元。\r\n")
# Label nudges, in map projection units, that keep neighbouring labels apart.
_CITY_LABEL_OFFSETS = {
    '杭州': (-400000, 10000),
    '广州': (-400000, 10000),
    '合肥': (-300000, 10000),
    '深圳': (0, -100000),
    '南京': (-100000, 0),
    '天津': (0, -50000),
    '上海': (50000, 0),
    '武汉': (0, -50000),
    '苏州': (0, -100000),
    '宁波': (0, -100000),
}


def _city_salary_color(salary, salary_min, salary_max):
    """Map a salary onto the blue -> yellow -> red scale as '#rrggbb'."""
    half_range = (salary_max - salary_min) / 2
    if not half_range > 0:
        # Single row or all salaries equal: the scale is degenerate (this
        # used to divide by zero); use the mid-scale yellow.
        return '#ffff00'
    salary_middle = (salary_min + salary_max) / 2
    if salary > salary_middle:
        # Upper half: yellow fades to red as the salary approaches the max.
        red = 255
        green = int((salary_max - salary) / half_range * 255)
        blue = 0
    else:
        # Lower half: blue fades to yellow as the salary approaches the middle.
        blue = int((salary_middle - salary) / half_range * 255)
        red = green = int((salary - salary_min) / half_range * 255)
    channels = [max(0, min(255, c)) for c in (red, green, blue)]
    return '#{:02x}{:02x}{:02x}'.format(*channels)


def draw_city_map(data_city, headcount_scale, title):
    """Plot per-city salary and hiring volume on a map of China.

    Args:
        data_city: DataFrame with a '平均工资' column whose positional
            columns 0, 1 and 5 hold city name, mean salary and headcount.
        headcount_scale: Divisor applied to the headcount before the square
            root that sizes markers and labels.
        title: Text drawn at the top of the map.
    """
    data_location = pd.read_csv('../city_locations.csv').set_index('city')

    plt.rcParams['figure.figsize'] = [13, 13]
    fig, ax = plt.subplots()
    fig.set_dpi(80)
    # 'lcc' selects the Lambert conformal conic projection.
    cn_map = Basemap(llcrnrlon=77, llcrnrlat=14, urcrnrlon=140, urcrnrlat=51,
                     projection='lcc', lat_1=33, lat_2=45, lon_0=100)

    # Province outlines of mainland China and Taiwan.
    cn_map.readshapefile(r'D:/data/basemap/gadm36_CHN_shp/gadm36_CHN_1', name='china', drawbounds=True, color='gray')
    cn_map.readshapefile(r'D:/data/basemap/gadm36_TWN_shp/gadm36_TWN_1', name='taiwan', drawbounds=True, color='gray')

    salary_min = data_city['平均工资'].min()
    salary_max = data_city['平均工资'].max()

    last_city, last_salary_k = '', 0
    for _, row in data_city.iterrows():
        # .iloc is used because plain row[0] on a label-indexed Series is
        # deprecated positional access.
        city = row.iloc[0]
        salary = row.iloc[1]
        headcount = row.iloc[5]
        longitude = data_location.loc[city, 'longitude']
        latitude = data_location.loc[city, 'latitude']
        color = _city_salary_color(salary, salary_min, salary_max)
        size = int(math.sqrt(headcount / headcount_scale))

        x, y = cn_map(longitude, latitude)
        cn_map.plot(x, y, marker='o', color=color, markersize=size + 8)

        # The marker stays on the real location; only the label is nudged.
        dx, dy = _CITY_LABEL_OFFSETS.get(city, (0, 0))
        last_city, last_salary_k = city, np.round(salary / 1000)
        ax.text(x + dx, y + dy, "{}{:.0f}k".format(city, last_salary_k), fontweight='bold', fontsize=size + 13, bbox={'facecolor': color, 'alpha': 0.3, 'pad': 0})

    ax.text(1100000, 1077845, "https://github.com/juwikuang/china_job_survey", fontweight='bold', color='#999999', fontsize=20, bbox={'facecolor': '#eeeeee', 'alpha': 0.4, 'pad': 0})
    ax.text(205805, 107845, "https://github.com/juwikuang/china_job_survey", fontweight='bold', color='#999999', fontsize=20, bbox={'facecolor': '#eeeeee', 'alpha': 0.4, 'pad': 0})
    # Kept for backward compatibility: a title containing "{}" placeholders
    # is filled with the last row's name and salary (in thousands).
    ax.text(805805, 4007845, title.format(last_city, last_salary_k), fontweight='bold', color='#111111', fontsize=25)
    ax.text(805805, 3807845, "(城市大小代表招聘数量,颜色代表工资,红色最高,黄色次之,蓝最少)", fontweight='bold', color='#111111', fontsize=13)
    plt.show()
data_location.loc[province,'latitude'] 144 | salary=row[1] 145 | headcount=row[5] 146 | #color 147 | color_red=0 148 | color_green=0 149 | color_blue=0 150 | if salary>salary_middle: 151 | color_red = 255 #int((salary - salary_middle) / (salary_scale/2)*255) 152 | color_green = int((salary_max - salary) / (salary_scale/2)*255) 153 | else: 154 | color_blue = int((salary_middle - salary) / (salary_scale/2)*255) 155 | color_green = int((salary - salary_min) / (salary_scale/2)*255) 156 | color_red = int((salary - salary_min) / (salary_scale/2)*255) 157 | 158 | color = '#{:02x}{:02x}{:02x}'.format(color_red,color_green,color_blue) 159 | 160 | 161 | x, y = cn_map(longitude,latitude) 162 | cn_map.plot(x,y,marker='o',color=color,markersize=int(math.sqrt(headcount/headcount_scale))+8) 163 | #ax.annotate(city, (x,y), xytext=(5, 5), textcoords='offset points', fontsize=15) 164 | #"{}{:.0f}".format(city_cn, salary) 165 | #ax.text(x+5, y+5,city , fontweight='bold', fontsize=int(headcount/2500+12)) 166 | fontsize=int(math.sqrt(headcount/headcount_scale))+13 167 | if province == '浙江': 168 | #x=x-400000 169 | y=y-100000 170 | 171 | elif province=='安徽': 172 | x=x-300000 173 | y=y+10000 174 | elif province=='江苏': 175 | x=x-150000 176 | elif province=='天津': 177 | y=y-50000 178 | elif province=='上海': 179 | x=x+50000 180 | elif province=='湖北': 181 | y=y-50000 182 | 183 | ax.text(x, y, "{}{:.0f}k".format(province, np.round(salary/1000)), fontweight='bold', fontsize=fontsize, bbox={'facecolor':color, 'alpha':0.3, 'pad':0}) 184 | ax.text(2053805, 1077845, "https://github.com/juwikuang/china_job_survey".format(province, np.round(salary/1000)), fontweight='bold',color='#999999', fontsize=20, bbox={'facecolor':'#eeeeee', 'alpha':0.4, 'pad':0}) 185 | ax.text(205805, 107845, "https://github.com/juwikuang/china_job_survey".format(province, np.round(salary/1000)), fontweight='bold',color='#999999', fontsize=20, bbox={'facecolor':'#eeeeee', 'alpha':0.4, 'pad':0}) 186 | ax.text(805805, 4007845, 
title.format(province, np.round(salary/1000)), fontweight='bold',color='#111111', fontsize=25) 187 | ax.text(805805, 3807845, "(城市大小代表招聘数量,颜色代表工资,红色最高,黄色次之,蓝最少)", fontweight='bold',color='#111111', fontsize=13) 188 | #cn_map.drawcoastlines() #绘制海岸线 189 | #cn_map.drawcountries(linewidth=1.5) #绘制国家边界线 190 | plt.show() 191 | 192 | 193 | def describe(data_city, career): 194 | 195 | for index, row in data_city.iterrows(): 196 | print(f"{year}年{month}月{row[0]}招收{career}{row[5]}人。2019年{month}月{row[0]}{career}平均工资{row[1]:.0f}元,工资中位数{row[3]:.0f}元,其中95%的人的工资介于{row[2]:.0f}元到{row[4]:.0f}元。\r\n") 197 | -------------------------------------------------------------------------------- /reports/202003/config.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | pwd=os.getcwd() 4 | year_month=pwd.split('\\')[-1] 5 | 6 | year=int(year_month[:4]) 7 | month=int(year_month[4:]) 8 | -------------------------------------------------------------------------------- /reports/202003/map_wrapper.py: -------------------------------------------------------------------------------- 1 | import math 2 | import numpy as np 3 | import pandas as pd 4 | from config import * 5 | import matplotlib.pyplot as plt 6 | from mpl_toolkits.basemap import Basemap 7 | 8 | def draw_city_map(data_city,headcount_scale, title): 9 | 10 | 11 | 12 | data_location = pd.read_csv('../city_locations.csv') 13 | data_location=data_location.set_index('city') 14 | 15 | #cities = [] 16 | scale = 5 17 | 18 | locations = [(116.407526, 39.90403),(120, 30)] 19 | #fig=plt.figure(figsize=(18, 16), dpi= 80, facecolor='w', edgecolor='k') 20 | plt.rcParams['figure.figsize'] = [13, 13] 21 | #plt.figure(figsize = (10,5)) 22 | fig, ax = plt.subplots() 23 | fig.title=title 24 | fig.figsize=(10,5) 25 | fig.dpi=80 26 | cn_map= Basemap(llcrnrlon=77, llcrnrlat=14, urcrnrlon=140, urcrnrlat=51, \ 27 | projection='lcc', lat_1=33, lat_2=45, lon_0=100) # ‘lcc'将投影方式设置为兰伯特投影 28 | 29 | # load the 
shapefile, use the name 'states' 30 | cn_map.readshapefile(r'D:/data/basemap/gadm36_CHN_shp/gadm36_CHN_1', name='china', drawbounds=True, color='gray') 31 | cn_map.readshapefile(r'D:/data/basemap/gadm36_TWN_shp/gadm36_TWN_1', name='taiwan', drawbounds=True, color='gray') 32 | #geolocator = Nominatim(user_agent="my-application") 33 | 34 | 35 | 36 | salary_min=data_city['平均工资'].min() 37 | salary_max=data_city['平均工资'].max() 38 | salary_middle = (salary_min+salary_max)/2 39 | salary_scale=salary_max-salary_min 40 | 41 | for index, row in data_city.iterrows(): 42 | city=row[0] 43 | 44 | longitude = data_location.loc[city,'longitude'] 45 | latitude = data_location.loc[city,'latitude'] 46 | salary=row[1] 47 | headcount=row[5] 48 | #color 49 | color_red=0 50 | color_green=0 51 | color_blue=0 52 | if salary>salary_middle: 53 | color_red = 255 #int((salary - salary_middle) / (salary_scale/2)*255) 54 | color_green = int((salary_max - salary) / (salary_scale/2)*255) 55 | else: 56 | color_blue = int((salary_middle - salary) / (salary_scale/2)*255) 57 | color_green = int((salary - salary_min) / (salary_scale/2)*255) 58 | color_red = int((salary - salary_min) / (salary_scale/2)*255) 59 | 60 | color = '#{:02x}{:02x}{:02x}'.format(color_red,color_green,color_blue) 61 | 62 | 63 | x, y = cn_map(longitude,latitude) 64 | cn_map.plot(x,y,marker='o',color=color,markersize=int(math.sqrt(headcount/headcount_scale))+8) 65 | #ax.annotate(city, (x,y), xytext=(5, 5), textcoords='offset points', fontsize=15) 66 | #"{}{:.0f}".format(city_cn, salary) 67 | #ax.text(x+5, y+5,city , fontweight='bold', fontsize=int(headcount/2500+12)) 68 | fontsize=int(math.sqrt(headcount/headcount_scale))+13 69 | if city == '杭州': 70 | x=x-400000 71 | y=y+10000 72 | elif city=='广州': 73 | x=x-400000 74 | y=y+10000 75 | elif city=='合肥': 76 | x=x-300000 77 | y=y+10000 78 | elif city=='深圳': 79 | y=y-100000 80 | elif city=='南京': 81 | x=x-100000 82 | elif city=='天津': 83 | y=y-50000 84 | elif city=='上海': 85 | x=x+50000 86 | 
elif city=='武汉': 87 | y=y-50000 88 | elif city=='厦门': 89 | pass 90 | elif city=='福州': 91 | pass 92 | elif city=='苏州': 93 | y=y-100000 94 | pass 95 | elif city=='宁波': 96 | y=y-100000 97 | pass 98 | 99 | ax.text(x, y, "{}{:.0f}k".format(city, np.round(salary/1000)), fontweight='bold', fontsize=fontsize, bbox={'facecolor':color, 'alpha':0.3, 'pad':0}) 100 | ax.text(1100000, 1077845, "https://github.com/juwikuang/china_job_survey".format(city, np.round(salary/1000)), fontweight='bold',color='#999999', fontsize=20, bbox={'facecolor':'#eeeeee', 'alpha':0.4, 'pad':0}) 101 | ax.text(205805, 107845, "https://github.com/juwikuang/china_job_survey".format(city, np.round(salary/1000)), fontweight='bold',color='#999999', fontsize=20, bbox={'facecolor':'#eeeeee', 'alpha':0.4, 'pad':0}) 102 | ax.text(805805, 4007845, title.format(city, np.round(salary/1000)), fontweight='bold',color='#111111', fontsize=25) 103 | ax.text(805805, 3807845, "(城市大小代表招聘数量,颜色代表工资,红色最高,黄色次之,蓝最少)", fontweight='bold',color='#111111', fontsize=13) 104 | #cn_map.drawcoastlines() #绘制海岸线 105 | #cn_map.drawcountries(linewidth=1.5) #绘制国家边界线 106 | plt.show() 107 | 108 | def draw_province_map(data_city,headcount_scale, title): 109 | 110 | data_location = pd.read_csv('../geo_data/provincial_capital_locations.csv', encoding='utf-8') 111 | data_location=data_location.set_index('province') 112 | 113 | #cities = [] 114 | scale = 5 115 | 116 | locations = [(116.407526, 39.90403),(120, 30)] 117 | #fig=plt.figure(figsize=(18, 16), dpi= 80, facecolor='w', edgecolor='k') 118 | plt.rcParams['figure.figsize'] = [13, 13] 119 | #plt.figure(figsize = (10,5)) 120 | fig, ax = plt.subplots() 121 | fig.title=title 122 | fig.figsize=(10,5) 123 | fig.dpi=80 124 | cn_map= Basemap(llcrnrlon=77, llcrnrlat=14, urcrnrlon=140, urcrnrlat=51, \ 125 | projection='lcc', lat_1=33, lat_2=45, lon_0=100) # ‘lcc'将投影方式设置为兰伯特投影 126 | 127 | # load the shapefile, use the name 'states' 128 | 
cn_map.readshapefile(r'D:/data/basemap/gadm36_CHN_shp/gadm36_CHN_1', name='china', drawbounds=True, color='gray') 129 | cn_map.readshapefile(r'D:/data/basemap/gadm36_TWN_shp/gadm36_TWN_1', name='taiwan', drawbounds=True, color='gray') 130 | #geolocator = Nominatim(user_agent="my-application") 131 | 132 | 133 | 134 | salary_min=data_city.salary_mean.min() 135 | salary_max=data_city.salary_mean.max() 136 | salary_middle = (salary_min+salary_max)/2 137 | salary_scale=salary_max-salary_min 138 | 139 | for index, row in data_city.iterrows(): 140 | province=row[0] 141 | 142 | longitude = data_location.loc[province,'longitude'] 143 | latitude = data_location.loc[province,'latitude'] 144 | salary=row[1] 145 | headcount=row[5] 146 | #color 147 | color_red=0 148 | color_green=0 149 | color_blue=0 150 | if salary>salary_middle: 151 | color_red = 255 #int((salary - salary_middle) / (salary_scale/2)*255) 152 | color_green = int((salary_max - salary) / (salary_scale/2)*255) 153 | else: 154 | color_blue = int((salary_middle - salary) / (salary_scale/2)*255) 155 | color_green = int((salary - salary_min) / (salary_scale/2)*255) 156 | color_red = int((salary - salary_min) / (salary_scale/2)*255) 157 | 158 | color = '#{:02x}{:02x}{:02x}'.format(color_red,color_green,color_blue) 159 | 160 | 161 | x, y = cn_map(longitude,latitude) 162 | cn_map.plot(x,y,marker='o',color=color,markersize=int(math.sqrt(headcount/headcount_scale))+8) 163 | #ax.annotate(city, (x,y), xytext=(5, 5), textcoords='offset points', fontsize=15) 164 | #"{}{:.0f}".format(city_cn, salary) 165 | #ax.text(x+5, y+5,city , fontweight='bold', fontsize=int(headcount/2500+12)) 166 | fontsize=int(math.sqrt(headcount/headcount_scale))+13 167 | if province == '浙江': 168 | #x=x-400000 169 | y=y-100000 170 | 171 | elif province=='安徽': 172 | x=x-300000 173 | y=y+10000 174 | elif province=='江苏': 175 | x=x-150000 176 | elif province=='天津': 177 | y=y-50000 178 | elif province=='上海': 179 | x=x+50000 180 | elif province=='湖北': 181 | 
y=y-50000 182 | 183 | ax.text(x, y, "{}{:.0f}k".format(province, np.round(salary/1000)), fontweight='bold', fontsize=fontsize, bbox={'facecolor':color, 'alpha':0.3, 'pad':0}) 184 | ax.text(2053805, 1077845, "https://github.com/juwikuang/china_job_survey".format(province, np.round(salary/1000)), fontweight='bold',color='#999999', fontsize=20, bbox={'facecolor':'#eeeeee', 'alpha':0.4, 'pad':0}) 185 | ax.text(205805, 107845, "https://github.com/juwikuang/china_job_survey".format(province, np.round(salary/1000)), fontweight='bold',color='#999999', fontsize=20, bbox={'facecolor':'#eeeeee', 'alpha':0.4, 'pad':0}) 186 | ax.text(805805, 4007845, title.format(province, np.round(salary/1000)), fontweight='bold',color='#111111', fontsize=25) 187 | ax.text(805805, 3807845, "(城市大小代表招聘数量,颜色代表工资,红色最高,黄色次之,蓝最少)", fontweight='bold',color='#111111', fontsize=13) 188 | #cn_map.drawcoastlines() #绘制海岸线 189 | #cn_map.drawcountries(linewidth=1.5) #绘制国家边界线 190 | plt.show() 191 | 192 | 193 | def describe(data_city, career): 194 | 195 | for index, row in data_city.iterrows(): 196 | print(f"{year}年{month}月{row[0]}招收{career}{row[5]}人。2019年{month}月{row[0]}{career}平均工资{row[1]:.0f}元,工资中位数{row[3]:.0f}元,其中95%的人的工资介于{row[2]:.0f}元到{row[4]:.0f}元。\r\n") 197 | -------------------------------------------------------------------------------- /reports/202004/config.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | pwd=os.getcwd() 4 | year_month=pwd.split('\\')[-1] 5 | 6 | year=int(year_month[:4]) 7 | month=int(year_month[4:]) 8 | -------------------------------------------------------------------------------- /reports/202004/map_wrapper.py: -------------------------------------------------------------------------------- 1 | import math 2 | import numpy as np 3 | import pandas as pd 4 | from config import * 5 | import matplotlib.pyplot as plt 6 | from mpl_toolkits.basemap import Basemap 7 | 8 | def draw_city_map(data_city,headcount_scale, title): 9 | 
10 | 11 | 12 | data_location = pd.read_csv('../city_locations.csv') 13 | data_location=data_location.set_index('city') 14 | 15 | #cities = [] 16 | scale = 5 17 | 18 | locations = [(116.407526, 39.90403),(120, 30)] 19 | #fig=plt.figure(figsize=(18, 16), dpi= 80, facecolor='w', edgecolor='k') 20 | plt.rcParams['figure.figsize'] = [13, 13] 21 | #plt.figure(figsize = (10,5)) 22 | fig, ax = plt.subplots() 23 | fig.title=title 24 | fig.figsize=(10,5) 25 | fig.dpi=80 26 | cn_map= Basemap(llcrnrlon=77, llcrnrlat=14, urcrnrlon=140, urcrnrlat=51, \ 27 | projection='lcc', lat_1=33, lat_2=45, lon_0=100) # ‘lcc'将投影方式设置为兰伯特投影 28 | 29 | # load the shapefile, use the name 'states' 30 | cn_map.readshapefile(r'D:/data/basemap/gadm36_CHN_shp/gadm36_CHN_1', name='china', drawbounds=True, color='gray') 31 | cn_map.readshapefile(r'D:/data/basemap/gadm36_TWN_shp/gadm36_TWN_1', name='taiwan', drawbounds=True, color='gray') 32 | #geolocator = Nominatim(user_agent="my-application") 33 | 34 | 35 | 36 | salary_min=data_city['平均工资'].min() 37 | salary_max=data_city['平均工资'].max() 38 | salary_middle = (salary_min+salary_max)/2 39 | salary_scale=salary_max-salary_min 40 | 41 | for index, row in data_city.iterrows(): 42 | city=row[0] 43 | 44 | longitude = data_location.loc[city,'longitude'] 45 | latitude = data_location.loc[city,'latitude'] 46 | salary=row[1] 47 | headcount=row[5] 48 | #color 49 | color_red=0 50 | color_green=0 51 | color_blue=0 52 | if salary>salary_middle: 53 | color_red = 255 #int((salary - salary_middle) / (salary_scale/2)*255) 54 | color_green = int((salary_max - salary) / (salary_scale/2)*255) 55 | else: 56 | color_blue = int((salary_middle - salary) / (salary_scale/2)*255) 57 | color_green = int((salary - salary_min) / (salary_scale/2)*255) 58 | color_red = int((salary - salary_min) / (salary_scale/2)*255) 59 | 60 | color = '#{:02x}{:02x}{:02x}'.format(color_red,color_green,color_blue) 61 | 62 | 63 | x, y = cn_map(longitude,latitude) 64 | 
cn_map.plot(x,y,marker='o',color=color,markersize=int(math.sqrt(headcount/headcount_scale))+8) 65 | #ax.annotate(city, (x,y), xytext=(5, 5), textcoords='offset points', fontsize=15) 66 | #"{}{:.0f}".format(city_cn, salary) 67 | #ax.text(x+5, y+5,city , fontweight='bold', fontsize=int(headcount/2500+12)) 68 | fontsize=int(math.sqrt(headcount/headcount_scale))+13 69 | if city == '杭州': 70 | x=x-400000 71 | y=y+10000 72 | elif city=='广州': 73 | x=x-400000 74 | y=y+10000 75 | elif city=='合肥': 76 | x=x-300000 77 | y=y+10000 78 | elif city=='深圳': 79 | y=y-100000 80 | elif city=='南京': 81 | x=x-100000 82 | elif city=='天津': 83 | y=y-50000 84 | elif city=='上海': 85 | x=x+50000 86 | elif city=='武汉': 87 | y=y-50000 88 | elif city=='厦门': 89 | pass 90 | elif city=='福州': 91 | pass 92 | elif city=='苏州': 93 | y=y-100000 94 | pass 95 | elif city=='宁波': 96 | y=y-100000 97 | pass 98 | 99 | ax.text(x, y, "{}{:.0f}k".format(city, np.round(salary/1000)), fontweight='bold', fontsize=fontsize, bbox={'facecolor':color, 'alpha':0.3, 'pad':0}) 100 | ax.text(1100000, 1077845, "https://github.com/juwikuang/china_job_survey".format(city, np.round(salary/1000)), fontweight='bold',color='#999999', fontsize=20, bbox={'facecolor':'#eeeeee', 'alpha':0.4, 'pad':0}) 101 | ax.text(205805, 107845, "https://github.com/juwikuang/china_job_survey".format(city, np.round(salary/1000)), fontweight='bold',color='#999999', fontsize=20, bbox={'facecolor':'#eeeeee', 'alpha':0.4, 'pad':0}) 102 | ax.text(805805, 4007845, title.format(city, np.round(salary/1000)), fontweight='bold',color='#111111', fontsize=25) 103 | ax.text(805805, 3807845, "(城市大小代表招聘数量,颜色代表工资,红色最高,黄色次之,蓝最少)", fontweight='bold',color='#111111', fontsize=13) 104 | #cn_map.drawcoastlines() #绘制海岸线 105 | #cn_map.drawcountries(linewidth=1.5) #绘制国家边界线 106 | plt.show() 107 | 108 | def draw_province_map(data_city,headcount_scale, title): 109 | 110 | data_location = pd.read_csv('../geo_data/provincial_capital_locations.csv', encoding='utf-8') 111 | 
data_location=data_location.set_index('province') 112 | 113 | #cities = [] 114 | scale = 5 115 | 116 | locations = [(116.407526, 39.90403),(120, 30)] 117 | #fig=plt.figure(figsize=(18, 16), dpi= 80, facecolor='w', edgecolor='k') 118 | plt.rcParams['figure.figsize'] = [13, 13] 119 | #plt.figure(figsize = (10,5)) 120 | fig, ax = plt.subplots() 121 | fig.title=title 122 | fig.figsize=(10,5) 123 | fig.dpi=80 124 | cn_map= Basemap(llcrnrlon=77, llcrnrlat=14, urcrnrlon=140, urcrnrlat=51, \ 125 | projection='lcc', lat_1=33, lat_2=45, lon_0=100) # ‘lcc'将投影方式设置为兰伯特投影 126 | 127 | # load the shapefile, use the name 'states' 128 | cn_map.readshapefile(r'D:/data/basemap/gadm36_CHN_shp/gadm36_CHN_1', name='china', drawbounds=True, color='gray') 129 | cn_map.readshapefile(r'D:/data/basemap/gadm36_TWN_shp/gadm36_TWN_1', name='taiwan', drawbounds=True, color='gray') 130 | #geolocator = Nominatim(user_agent="my-application") 131 | 132 | 133 | 134 | salary_min=data_city.salary_mean.min() 135 | salary_max=data_city.salary_mean.max() 136 | salary_middle = (salary_min+salary_max)/2 137 | salary_scale=salary_max-salary_min 138 | 139 | for index, row in data_city.iterrows(): 140 | province=row[0] 141 | 142 | longitude = data_location.loc[province,'longitude'] 143 | latitude = data_location.loc[province,'latitude'] 144 | salary=row[1] 145 | headcount=row[5] 146 | #color 147 | color_red=0 148 | color_green=0 149 | color_blue=0 150 | if salary>salary_middle: 151 | color_red = 255 #int((salary - salary_middle) / (salary_scale/2)*255) 152 | color_green = int((salary_max - salary) / (salary_scale/2)*255) 153 | else: 154 | color_blue = int((salary_middle - salary) / (salary_scale/2)*255) 155 | color_green = int((salary - salary_min) / (salary_scale/2)*255) 156 | color_red = int((salary - salary_min) / (salary_scale/2)*255) 157 | 158 | color = '#{:02x}{:02x}{:02x}'.format(color_red,color_green,color_blue) 159 | 160 | 161 | x, y = cn_map(longitude,latitude) 162 | 
cn_map.plot(x,y,marker='o',color=color,markersize=int(math.sqrt(headcount/headcount_scale))+8) 163 | #ax.annotate(city, (x,y), xytext=(5, 5), textcoords='offset points', fontsize=15) 164 | #"{}{:.0f}".format(city_cn, salary) 165 | #ax.text(x+5, y+5,city , fontweight='bold', fontsize=int(headcount/2500+12)) 166 | fontsize=int(math.sqrt(headcount/headcount_scale))+13 167 | if province == '浙江': 168 | #x=x-400000 169 | y=y-100000 170 | 171 | elif province=='安徽': 172 | x=x-300000 173 | y=y+10000 174 | elif province=='江苏': 175 | x=x-150000 176 | elif province=='天津': 177 | y=y-50000 178 | elif province=='上海': 179 | x=x+50000 180 | elif province=='湖北': 181 | y=y-50000 182 | 183 | ax.text(x, y, "{}{:.0f}k".format(province, np.round(salary/1000)), fontweight='bold', fontsize=fontsize, bbox={'facecolor':color, 'alpha':0.3, 'pad':0}) 184 | ax.text(2053805, 1077845, "https://github.com/juwikuang/china_job_survey".format(province, np.round(salary/1000)), fontweight='bold',color='#999999', fontsize=20, bbox={'facecolor':'#eeeeee', 'alpha':0.4, 'pad':0}) 185 | ax.text(205805, 107845, "https://github.com/juwikuang/china_job_survey".format(province, np.round(salary/1000)), fontweight='bold',color='#999999', fontsize=20, bbox={'facecolor':'#eeeeee', 'alpha':0.4, 'pad':0}) 186 | ax.text(805805, 4007845, title.format(province, np.round(salary/1000)), fontweight='bold',color='#111111', fontsize=25) 187 | ax.text(805805, 3807845, "(城市大小代表招聘数量,颜色代表工资,红色最高,黄色次之,蓝最少)", fontweight='bold',color='#111111', fontsize=13) 188 | #cn_map.drawcoastlines() #绘制海岸线 189 | #cn_map.drawcountries(linewidth=1.5) #绘制国家边界线 190 | plt.show() 191 | 192 | 193 | def describe(data_city, career): 194 | 195 | for index, row in data_city.iterrows(): 196 | print(f"{year}年{month}月{row[0]}招收{career}{row[5]}人。2019年{month}月{row[0]}{career}平均工资{row[1]:.0f}元,工资中位数{row[3]:.0f}元,其中95%的人的工资介于{row[2]:.0f}元到{row[4]:.0f}元。\r\n") 197 | -------------------------------------------------------------------------------- 
/reports/202005/config.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | pwd=os.getcwd() 4 | year_month=pwd.split('\\')[-1] 5 | 6 | year=int(year_month[:4]) 7 | month=int(year_month[4:]) 8 | -------------------------------------------------------------------------------- /reports/202005/map_wrapper.py: -------------------------------------------------------------------------------- 1 | import math 2 | import numpy as np 3 | import pandas as pd 4 | from config import * 5 | import matplotlib.pyplot as plt 6 | from mpl_toolkits.basemap import Basemap 7 | 8 | def draw_city_map(data_city,headcount_scale, title): 9 | 10 | 11 | 12 | data_location = pd.read_csv('../city_locations.csv') 13 | data_location=data_location.set_index('city') 14 | 15 | #cities = [] 16 | scale = 5 17 | 18 | locations = [(116.407526, 39.90403),(120, 30)] 19 | #fig=plt.figure(figsize=(18, 16), dpi= 80, facecolor='w', edgecolor='k') 20 | plt.rcParams['figure.figsize'] = [13, 13] 21 | #plt.figure(figsize = (10,5)) 22 | fig, ax = plt.subplots() 23 | fig.title=title 24 | fig.figsize=(10,5) 25 | fig.dpi=80 26 | cn_map= Basemap(llcrnrlon=77, llcrnrlat=14, urcrnrlon=140, urcrnrlat=51, \ 27 | projection='lcc', lat_1=33, lat_2=45, lon_0=100) # ‘lcc'将投影方式设置为兰伯特投影 28 | 29 | # load the shapefile, use the name 'states' 30 | cn_map.readshapefile(r'D:/data/basemap/gadm36_CHN_shp/gadm36_CHN_1', name='china', drawbounds=True, color='gray') 31 | cn_map.readshapefile(r'D:/data/basemap/gadm36_TWN_shp/gadm36_TWN_1', name='taiwan', drawbounds=True, color='gray') 32 | #geolocator = Nominatim(user_agent="my-application") 33 | 34 | 35 | 36 | salary_min=data_city['平均工资'].min() 37 | salary_max=data_city['平均工资'].max() 38 | salary_middle = (salary_min+salary_max)/2 39 | salary_scale=salary_max-salary_min 40 | 41 | for index, row in data_city.iterrows(): 42 | city=row['city'] 43 | 44 | longitude = data_location.loc[city,'longitude'] 45 | latitude = 
data_location.loc[city,'latitude'] 46 | salary=row['平均工资'] 47 | headcount=row['招聘人数'] 48 | #color 49 | color_red=0 50 | color_green=0 51 | color_blue=0 52 | if salary>salary_middle: 53 | color_red = 255 #int((salary - salary_middle) / (salary_scale/2)*255) 54 | color_green = int((salary_max - salary) / (salary_scale/2)*255) 55 | else: 56 | color_blue = int((salary_middle - salary) / (salary_scale/2)*255) 57 | color_green = int((salary - salary_min) / (salary_scale/2)*255) 58 | color_red = int((salary - salary_min) / (salary_scale/2)*255) 59 | 60 | color = '#{:02x}{:02x}{:02x}'.format(color_red,color_green,color_blue) 61 | 62 | 63 | x, y = cn_map(longitude,latitude) 64 | cn_map.plot(x,y,marker='o',color=color,markersize=int(math.sqrt(headcount/headcount_scale))+8) 65 | #ax.annotate(city, (x,y), xytext=(5, 5), textcoords='offset points', fontsize=15) 66 | #"{}{:.0f}".format(city_cn, salary) 67 | #ax.text(x+5, y+5,city , fontweight='bold', fontsize=int(headcount/2500+12)) 68 | fontsize=int(math.sqrt(headcount/headcount_scale))+13 69 | if city == '杭州': 70 | x=x-400000 71 | y=y+10000 72 | elif city=='广州': 73 | x=x-400000 74 | y=y+10000 75 | elif city=='合肥': 76 | x=x-300000 77 | y=y+10000 78 | elif city=='深圳': 79 | y=y-100000 80 | elif city=='南京': 81 | x=x-100000 82 | elif city=='天津': 83 | y=y-50000 84 | elif city=='上海': 85 | x=x+50000 86 | elif city=='武汉': 87 | y=y-50000 88 | elif city=='厦门': 89 | pass 90 | elif city=='福州': 91 | pass 92 | elif city=='苏州': 93 | y=y-100000 94 | pass 95 | elif city=='宁波': 96 | y=y-100000 97 | pass 98 | 99 | ax.text(x, y, "{}{:.0f}k".format(city, np.round(salary/1000)), fontweight='bold', fontsize=fontsize, bbox={'facecolor':color, 'alpha':0.3, 'pad':0}) 100 | ax.text(1100000, 1077845, "https://github.com/juwikuang/china_job_survey".format(city, np.round(salary/1000)), fontweight='bold',color='#999999', fontsize=20, bbox={'facecolor':'#eeeeee', 'alpha':0.4, 'pad':0}) 101 | ax.text(205805, 107845, 
"https://github.com/juwikuang/china_job_survey".format(city, np.round(salary/1000)), fontweight='bold',color='#999999', fontsize=20, bbox={'facecolor':'#eeeeee', 'alpha':0.4, 'pad':0}) 102 | ax.text(805805, 4007845, title.format(city, np.round(salary/1000)), fontweight='bold',color='#111111', fontsize=25) 103 | ax.text(805805, 3807845, "(城市大小代表招聘数量,颜色代表工资,红色最高,黄色次之,蓝最少)", fontweight='bold',color='#111111', fontsize=13) 104 | #cn_map.drawcoastlines() #绘制海岸线 105 | #cn_map.drawcountries(linewidth=1.5) #绘制国家边界线 106 | plt.show() 107 | 108 | def draw_province_map(data_city,headcount_scale, title): 109 | 110 | data_location = pd.read_csv('../geo_data/provincial_capital_locations.csv', encoding='utf-8') 111 | data_location=data_location.set_index('province') 112 | 113 | #cities = [] 114 | scale = 5 115 | 116 | locations = [(116.407526, 39.90403),(120, 30)] 117 | #fig=plt.figure(figsize=(18, 16), dpi= 80, facecolor='w', edgecolor='k') 118 | plt.rcParams['figure.figsize'] = [13, 13] 119 | #plt.figure(figsize = (10,5)) 120 | fig, ax = plt.subplots() 121 | fig.title=title 122 | fig.figsize=(10,5) 123 | fig.dpi=80 124 | cn_map= Basemap(llcrnrlon=77, llcrnrlat=14, urcrnrlon=140, urcrnrlat=51, \ 125 | projection='lcc', lat_1=33, lat_2=45, lon_0=100) # ‘lcc'将投影方式设置为兰伯特投影 126 | 127 | # load the shapefile, use the name 'states' 128 | cn_map.readshapefile(r'D:/data/basemap/gadm36_CHN_shp/gadm36_CHN_1', name='china', drawbounds=True, color='gray') 129 | cn_map.readshapefile(r'D:/data/basemap/gadm36_TWN_shp/gadm36_TWN_1', name='taiwan', drawbounds=True, color='gray') 130 | #geolocator = Nominatim(user_agent="my-application") 131 | 132 | 133 | 134 | salary_min=data_city.salary_mean.min() 135 | salary_max=data_city.salary_mean.max() 136 | salary_middle = (salary_min+salary_max)/2 137 | salary_scale=salary_max-salary_min 138 | 139 | for index, row in data_city.iterrows(): 140 | province=row['province'] 141 | 142 | longitude = data_location.loc[province,'longitude'] 143 | latitude = 
data_location.loc[province,'latitude'] 144 | salary=row[1] 145 | headcount=row[5] 146 | #color 147 | color_red=0 148 | color_green=0 149 | color_blue=0 150 | if salary>salary_middle: 151 | color_red = 255 #int((salary - salary_middle) / (salary_scale/2)*255) 152 | color_green = int((salary_max - salary) / (salary_scale/2)*255) 153 | else: 154 | color_blue = int((salary_middle - salary) / (salary_scale/2)*255) 155 | color_green = int((salary - salary_min) / (salary_scale/2)*255) 156 | color_red = int((salary - salary_min) / (salary_scale/2)*255) 157 | 158 | color = '#{:02x}{:02x}{:02x}'.format(color_red,color_green,color_blue) 159 | 160 | 161 | x, y = cn_map(longitude,latitude) 162 | cn_map.plot(x,y,marker='o',color=color,markersize=int(math.sqrt(headcount/headcount_scale))+8) 163 | #ax.annotate(city, (x,y), xytext=(5, 5), textcoords='offset points', fontsize=15) 164 | #"{}{:.0f}".format(city_cn, salary) 165 | #ax.text(x+5, y+5,city , fontweight='bold', fontsize=int(headcount/2500+12)) 166 | fontsize=int(math.sqrt(headcount/headcount_scale))+13 167 | if province == '浙江': 168 | #x=x-400000 169 | y=y-100000 170 | 171 | elif province=='安徽': 172 | x=x-300000 173 | y=y+10000 174 | elif province=='江苏': 175 | x=x-150000 176 | elif province=='天津': 177 | y=y-50000 178 | elif province=='上海': 179 | x=x+50000 180 | elif province=='湖北': 181 | y=y-50000 182 | 183 | ax.text(x, y, "{}{:.0f}k".format(province, np.round(salary/1000)), fontweight='bold', fontsize=fontsize, bbox={'facecolor':color, 'alpha':0.3, 'pad':0}) 184 | ax.text(2053805, 1077845, "https://github.com/juwikuang/china_job_survey".format(province, np.round(salary/1000)), fontweight='bold',color='#999999', fontsize=20, bbox={'facecolor':'#eeeeee', 'alpha':0.4, 'pad':0}) 185 | ax.text(205805, 107845, "https://github.com/juwikuang/china_job_survey".format(province, np.round(salary/1000)), fontweight='bold',color='#999999', fontsize=20, bbox={'facecolor':'#eeeeee', 'alpha':0.4, 'pad':0}) 186 | ax.text(805805, 4007845, 
title.format(province, np.round(salary/1000)), fontweight='bold',color='#111111', fontsize=25) 187 | ax.text(805805, 3807845, "(城市大小代表招聘数量,颜色代表工资,红色最高,黄色次之,蓝最少)", fontweight='bold',color='#111111', fontsize=13) 188 | #cn_map.drawcoastlines() #绘制海岸线 189 | #cn_map.drawcountries(linewidth=1.5) #绘制国家边界线 190 | plt.show() 191 | 192 | 193 | def describe(data_city, career): 194 | 195 | for index, row in data_city.iterrows(): 196 | print(f"{year}年{month}月{row[0]}招收{career}{row[5]}人。2019年{month}月{row[0]}{career}平均工资{row[1]:.0f}元,工资中位数{row[3]:.0f}元,其中95%的人的工资介于{row[2]:.0f}元到{row[4]:.0f}元。\r\n") 197 | -------------------------------------------------------------------------------- /reports/202006/config.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | pwd=os.getcwd() 4 | year_month=pwd.split('\\')[-1] 5 | 6 | year=int(year_month[:4]) 7 | month=int(year_month[4:]) 8 | -------------------------------------------------------------------------------- /reports/202006/map_wrapper.py: -------------------------------------------------------------------------------- 1 | import math 2 | import numpy as np 3 | import pandas as pd 4 | from config import * 5 | import matplotlib.pyplot as plt 6 | from mpl_toolkits.basemap import Basemap 7 | 8 | def draw_city_map(data_city,headcount_scale, title): 9 | 10 | 11 | 12 | data_location = pd.read_csv('../city_locations.csv') 13 | data_location=data_location.set_index('city') 14 | 15 | #cities = [] 16 | scale = 5 17 | 18 | locations = [(116.407526, 39.90403),(120, 30)] 19 | #fig=plt.figure(figsize=(18, 16), dpi= 80, facecolor='w', edgecolor='k') 20 | plt.rcParams['figure.figsize'] = [13, 13] 21 | #plt.figure(figsize = (10,5)) 22 | fig, ax = plt.subplots() 23 | fig.title=title 24 | fig.figsize=(10,5) 25 | fig.dpi=80 26 | cn_map= Basemap(llcrnrlon=77, llcrnrlat=14, urcrnrlon=140, urcrnrlat=51, \ 27 | projection='lcc', lat_1=33, lat_2=45, lon_0=100) # ‘lcc'将投影方式设置为兰伯特投影 28 | 29 | # load the 
shapefile, use the name 'states' 30 | cn_map.readshapefile(r'D:/data/basemap/gadm36_CHN_shp/gadm36_CHN_1', name='china', drawbounds=True, color='gray') 31 | cn_map.readshapefile(r'D:/data/basemap/gadm36_TWN_shp/gadm36_TWN_1', name='taiwan', drawbounds=True, color='gray') 32 | #geolocator = Nominatim(user_agent="my-application") 33 | 34 | 35 | 36 | salary_min=data_city['平均工资'].min() 37 | salary_max=data_city['平均工资'].max() 38 | salary_middle = (salary_min+salary_max)/2 39 | salary_scale=salary_max-salary_min 40 | 41 | for index, row in data_city.iterrows(): 42 | city=row['city'] 43 | 44 | longitude = data_location.loc[city,'longitude'] 45 | latitude = data_location.loc[city,'latitude'] 46 | salary=row['平均工资'] 47 | headcount=row['招聘人数'] 48 | #color 49 | color_red=0 50 | color_green=0 51 | color_blue=0 52 | if salary>salary_middle: 53 | color_red = 255 #int((salary - salary_middle) / (salary_scale/2)*255) 54 | color_green = int((salary_max - salary) / (salary_scale/2)*255) 55 | else: 56 | color_blue = int((salary_middle - salary) / (salary_scale/2)*255) 57 | color_green = int((salary - salary_min) / (salary_scale/2)*255) 58 | color_red = int((salary - salary_min) / (salary_scale/2)*255) 59 | 60 | color = '#{:02x}{:02x}{:02x}'.format(color_red,color_green,color_blue) 61 | 62 | 63 | x, y = cn_map(longitude,latitude) 64 | cn_map.plot(x,y,marker='o',color=color,markersize=int(math.sqrt(headcount/headcount_scale))+8) 65 | #ax.annotate(city, (x,y), xytext=(5, 5), textcoords='offset points', fontsize=15) 66 | #"{}{:.0f}".format(city_cn, salary) 67 | #ax.text(x+5, y+5,city , fontweight='bold', fontsize=int(headcount/2500+12)) 68 | fontsize=int(math.sqrt(headcount/headcount_scale))+13 69 | if city == '杭州': 70 | x=x-400000 71 | y=y+10000 72 | elif city=='广州': 73 | x=x-400000 74 | y=y+10000 75 | elif city=='合肥': 76 | x=x-300000 77 | y=y+10000 78 | elif city=='深圳': 79 | y=y-100000 80 | elif city=='南京': 81 | x=x-100000 82 | elif city=='天津': 83 | y=y-50000 84 | elif city=='上海': 85 | 
x=x+50000 86 | elif city=='武汉': 87 | y=y-50000 88 | elif city=='厦门': 89 | pass 90 | elif city=='福州': 91 | pass 92 | elif city=='苏州': 93 | y=y-100000 94 | pass 95 | elif city=='宁波': 96 | y=y-100000 97 | pass 98 | 99 | ax.text(x, y, "{}{:.0f}k".format(city, np.round(salary/1000)), fontweight='bold', fontsize=fontsize, bbox={'facecolor':color, 'alpha':0.3, 'pad':0}) 100 | ax.text(1100000, 1077845, "https://github.com/juwikuang/china_job_survey".format(city, np.round(salary/1000)), fontweight='bold',color='#999999', fontsize=20, bbox={'facecolor':'#eeeeee', 'alpha':0.4, 'pad':0}) 101 | ax.text(205805, 107845, "https://github.com/juwikuang/china_job_survey".format(city, np.round(salary/1000)), fontweight='bold',color='#999999', fontsize=20, bbox={'facecolor':'#eeeeee', 'alpha':0.4, 'pad':0}) 102 | ax.text(805805, 4007845, title.format(city, np.round(salary/1000)), fontweight='bold',color='#111111', fontsize=25) 103 | ax.text(805805, 3807845, "(城市大小代表招聘数量,颜色代表工资,红色最高,黄色次之,蓝最少)", fontweight='bold',color='#111111', fontsize=13) 104 | #cn_map.drawcoastlines() #绘制海岸线 105 | #cn_map.drawcountries(linewidth=1.5) #绘制国家边界线 106 | plt.show() 107 | 108 | def draw_province_map(data_city,headcount_scale, title): 109 | 110 | data_location = pd.read_csv('../geo_data/provincial_capital_locations.csv', encoding='utf-8') 111 | data_location=data_location.set_index('province') 112 | 113 | #cities = [] 114 | scale = 5 115 | 116 | locations = [(116.407526, 39.90403),(120, 30)] 117 | #fig=plt.figure(figsize=(18, 16), dpi= 80, facecolor='w', edgecolor='k') 118 | plt.rcParams['figure.figsize'] = [13, 13] 119 | #plt.figure(figsize = (10,5)) 120 | fig, ax = plt.subplots() 121 | fig.title=title 122 | fig.figsize=(10,5) 123 | fig.dpi=80 124 | cn_map= Basemap(llcrnrlon=77, llcrnrlat=14, urcrnrlon=140, urcrnrlat=51, \ 125 | projection='lcc', lat_1=33, lat_2=45, lon_0=100) # ‘lcc'将投影方式设置为兰伯特投影 126 | 127 | # load the shapefile, use the name 'states' 128 | 
cn_map.readshapefile(r'D:/data/basemap/gadm36_CHN_shp/gadm36_CHN_1', name='china', drawbounds=True, color='gray') 129 | cn_map.readshapefile(r'D:/data/basemap/gadm36_TWN_shp/gadm36_TWN_1', name='taiwan', drawbounds=True, color='gray') 130 | #geolocator = Nominatim(user_agent="my-application") 131 | 132 | 133 | 134 | salary_min=data_city.salary_mean.min() 135 | salary_max=data_city.salary_mean.max() 136 | salary_middle = (salary_min+salary_max)/2 137 | salary_scale=salary_max-salary_min 138 | 139 | for index, row in data_city.iterrows(): 140 | province=row['province'] 141 | 142 | longitude = data_location.loc[province,'longitude'] 143 | latitude = data_location.loc[province,'latitude'] 144 | salary=row[1] 145 | headcount=row[5] 146 | #color 147 | color_red=0 148 | color_green=0 149 | color_blue=0 150 | if salary>salary_middle: 151 | color_red = 255 #int((salary - salary_middle) / (salary_scale/2)*255) 152 | color_green = int((salary_max - salary) / (salary_scale/2)*255) 153 | else: 154 | color_blue = int((salary_middle - salary) / (salary_scale/2)*255) 155 | color_green = int((salary - salary_min) / (salary_scale/2)*255) 156 | color_red = int((salary - salary_min) / (salary_scale/2)*255) 157 | 158 | color = '#{:02x}{:02x}{:02x}'.format(color_red,color_green,color_blue) 159 | 160 | 161 | x, y = cn_map(longitude,latitude) 162 | cn_map.plot(x,y,marker='o',color=color,markersize=int(math.sqrt(headcount/headcount_scale))+8) 163 | #ax.annotate(city, (x,y), xytext=(5, 5), textcoords='offset points', fontsize=15) 164 | #"{}{:.0f}".format(city_cn, salary) 165 | #ax.text(x+5, y+5,city , fontweight='bold', fontsize=int(headcount/2500+12)) 166 | fontsize=int(math.sqrt(headcount/headcount_scale))+13 167 | if province == '浙江': 168 | #x=x-400000 169 | y=y-100000 170 | 171 | elif province=='安徽': 172 | x=x-300000 173 | y=y+10000 174 | elif province=='江苏': 175 | x=x-150000 176 | elif province=='天津': 177 | y=y-50000 178 | elif province=='上海': 179 | x=x+50000 180 | elif province=='湖北': 
# Manual label nudges (in map projection units) for cities whose labels would
# otherwise collide with a neighbour's marker or label.
_CITY_LABEL_OFFSETS = {
    '杭州': (-400000, 10000),
    '广州': (-400000, 10000),
    '合肥': (-300000, 10000),
    '深圳': (0, -100000),
    '南京': (-100000, 0),
    '天津': (0, -50000),
    '上海': (50000, 0),
    '武汉': (0, -50000),
    '苏州': (0, -100000),
    '宁波': (0, -100000),
}


def _salary_color(salary, salary_min, salary_max):
    """Return a '#rrggbb' colour on a blue -> yellow -> red ramp for *salary*."""
    half_range = (salary_max - salary_min) / 2
    if half_range == 0:
        # Every salary is identical (e.g. a single row): use the ramp's
        # midpoint instead of dividing by zero.
        return '#ffff00'
    salary_middle = (salary_min + salary_max) / 2
    if salary > salary_middle:
        # Upper half: red stays saturated, green fades out towards the maximum.
        red = 255
        green = int((salary_max - salary) / half_range * 255)
        blue = 0
    else:
        # Lower half: blue fades out and red/green rise together towards yellow.
        blue = int((salary_middle - salary) / half_range * 255)
        red = green = int((salary - salary_min) / half_range * 255)
    return '#{:02x}{:02x}{:02x}'.format(red, green, blue)


def draw_city_map(data_city, headcount_scale, title,
                  locations_csv='../city_locations.csv',
                  shapefile_root='D:/data/basemap'):
    """Plot one marker and label per city on a map of China and show the figure.

    Args:
        data_city: DataFrame with the columns 'city', '平均工资' (used for the
            marker colour and the label) and '招聘人数' (used for the marker
            and font size).
        headcount_scale: divisor applied to '招聘人数' before taking the square
            root that sizes markers and labels.
        title: text drawn at the top of the map. It is passed through
            ``str.format`` with the last row's city and rounded salary (in
            thousands), so a title without placeholders is drawn unchanged.
        locations_csv: CSV with 'city', 'longitude' and 'latitude' columns.
        shapefile_root: directory containing the ``gadm36_CHN_shp`` and
            ``gadm36_TWN_shp`` shapefile folders.

    Raises:
        KeyError: if a city in ``data_city`` has no row in ``locations_csv``.
    """
    data_location = pd.read_csv(locations_csv).set_index('city')

    # The figure size is set through rcParams, so it also applies to figures
    # created later in the same session.
    plt.rcParams['figure.figsize'] = [13, 13]
    fig, ax = plt.subplots()
    fig.set_dpi(80)

    # 'lcc' selects the Lambert conformal conic projection.
    cn_map = Basemap(llcrnrlon=77, llcrnrlat=14, urcrnrlon=140, urcrnrlat=51,
                     projection='lcc', lat_1=33, lat_2=45, lon_0=100)
    for country, layer in (('CHN', 'china'), ('TWN', 'taiwan')):
        cn_map.readshapefile(
            '{0}/gadm36_{1}_shp/gadm36_{1}_1'.format(shapefile_root, country),
            name=layer, drawbounds=True, color='gray')

    salary_min = data_city['平均工资'].min()
    salary_max = data_city['平均工资'].max()

    title_text = title
    for _, row in data_city.iterrows():
        city = row['city']
        longitude = data_location.loc[city, 'longitude']
        latitude = data_location.loc[city, 'latitude']
        salary = row['平均工资']
        headcount = row['招聘人数']

        color = _salary_color(salary, salary_min, salary_max)
        size_boost = int(math.sqrt(headcount / headcount_scale))

        # The marker sits on the true location; only the label is nudged.
        x, y = cn_map(longitude, latitude)
        cn_map.plot(x, y, marker='o', color=color, markersize=size_boost + 8)

        dx, dy = _CITY_LABEL_OFFSETS.get(city, (0, 0))
        salary_k = np.round(salary / 1000)
        ax.text(x + dx, y + dy, "{}{:.0f}k".format(city, salary_k),
                fontweight='bold', fontsize=size_boost + 13,
                bbox={'facecolor': color, 'alpha': 0.3, 'pad': 0})
        title_text = title.format(city, salary_k)

    # Per-figure annotations are drawn once, after all cities are plotted.
    watermark_style = {
        'fontweight': 'bold',
        'color': '#999999',
        'fontsize': 20,
        'bbox': {'facecolor': '#eeeeee', 'alpha': 0.4, 'pad': 0},
    }
    for wx, wy in ((1100000, 1077845), (205805, 107845)):
        ax.text(wx, wy, "https://github.com/juwikuang/china_job_survey",
                **watermark_style)
    ax.text(805805, 4007845, title_text,
            fontweight='bold', color='#111111', fontsize=25)
    ax.text(805805, 3807845, "(城市大小代表招聘数量,颜色代表工资,红色最高,黄色次之,蓝最少)",
            fontweight='bold', color='#111111', fontsize=13)
    plt.show()
data_location=data_location.set_index('province') 112 | 113 | #cities = [] 114 | scale = 5 115 | 116 | locations = [(116.407526, 39.90403),(120, 30)] 117 | #fig=plt.figure(figsize=(18, 16), dpi= 80, facecolor='w', edgecolor='k') 118 | plt.rcParams['figure.figsize'] = [13, 13] 119 | #plt.figure(figsize = (10,5)) 120 | fig, ax = plt.subplots() 121 | fig.title=title 122 | fig.figsize=(10,5) 123 | fig.dpi=80 124 | cn_map= Basemap(llcrnrlon=77, llcrnrlat=14, urcrnrlon=140, urcrnrlat=51, \ 125 | projection='lcc', lat_1=33, lat_2=45, lon_0=100) # ‘lcc'将投影方式设置为兰伯特投影 126 | 127 | # load the shapefile, use the name 'states' 128 | cn_map.readshapefile(r'D:/data/basemap/gadm36_CHN_shp/gadm36_CHN_1', name='china', drawbounds=True, color='gray') 129 | cn_map.readshapefile(r'D:/data/basemap/gadm36_TWN_shp/gadm36_TWN_1', name='taiwan', drawbounds=True, color='gray') 130 | #geolocator = Nominatim(user_agent="my-application") 131 | 132 | 133 | 134 | salary_min=data_city.salary_mean.min() 135 | salary_max=data_city.salary_mean.max() 136 | salary_middle = (salary_min+salary_max)/2 137 | salary_scale=salary_max-salary_min 138 | 139 | for index, row in data_city.iterrows(): 140 | province=row['province'] 141 | 142 | longitude = data_location.loc[province,'longitude'] 143 | latitude = data_location.loc[province,'latitude'] 144 | salary=row[1] 145 | headcount=row[5] 146 | #color 147 | color_red=0 148 | color_green=0 149 | color_blue=0 150 | if salary>salary_middle: 151 | color_red = 255 #int((salary - salary_middle) / (salary_scale/2)*255) 152 | color_green = int((salary_max - salary) / (salary_scale/2)*255) 153 | else: 154 | color_blue = int((salary_middle - salary) / (salary_scale/2)*255) 155 | color_green = int((salary - salary_min) / (salary_scale/2)*255) 156 | color_red = int((salary - salary_min) / (salary_scale/2)*255) 157 | 158 | color = '#{:02x}{:02x}{:02x}'.format(color_red,color_green,color_blue) 159 | 160 | 161 | x, y = cn_map(longitude,latitude) 162 | 
def describe(data_city, career, report_year=None, report_month=None):
    """Print a one-sentence hiring and salary summary for each row of ``data_city``.

    Columns are read by position: 0 is the place name, 1 the mean salary,
    3 the median salary, 2 and 4 the lower/upper bounds of the range printed
    as covering 95% of salaries, and 5 the headcount.

    Args:
        data_city: DataFrame with at least six columns in the order above.
        career: job title inserted into each sentence.
        report_year: year to print; defaults to the module-level ``year``
            provided by ``config``.
        report_month: month to print; defaults to the module-level ``month``
            provided by ``config``.
    """
    if report_year is None:
        report_year = year
    if report_month is None:
        report_month = month

    for _, row in data_city.iterrows():
        # .iloc makes the positional access explicit; plain integer keys on a
        # label-indexed Series rely on a deprecated pandas fallback.
        place = row.iloc[0]
        mean_salary = row.iloc[1]
        lower = row.iloc[2]
        median_salary = row.iloc[3]
        upper = row.iloc[4]
        headcount = row.iloc[5]
        # Both sentences use the report's own year (the second one used to be
        # pinned to 2019).
        print(f"{report_year}年{report_month}月{place}招收{career}{headcount}人。{report_year}年{report_month}月{place}{career}平均工资{mean_salary:.0f}元,工资中位数{median_salary:.0f}元,其中95%的人的工资介于{lower:.0f}元到{upper:.0f}元。\r\n")
import os

# Reports are run from inside a directory named YYYYMM (e.g. "202008"), so the
# reporting period is derived from the current working directory's name.
pwd = os.getcwd()
# os.path.basename honours the platform's path separators; splitting on a
# literal backslash only isolates the directory name on Windows.
year_month = os.path.basename(os.path.normpath(pwd))

# int() raises ValueError when the directory name is not of the form YYYYMM.
year = int(year_month[:4])
month = int(year_month[4:])
data_location.loc[city,'latitude'] 46 | salary=row['平均工资'] 47 | headcount=row['招聘人数'] 48 | #color 49 | color_red=0 50 | color_green=0 51 | color_blue=0 52 | if salary>salary_middle: 53 | color_red = 255 #int((salary - salary_middle) / (salary_scale/2)*255) 54 | color_green = int((salary_max - salary) / (salary_scale/2)*255) 55 | else: 56 | color_blue = int((salary_middle - salary) / (salary_scale/2)*255) 57 | color_green = int((salary - salary_min) / (salary_scale/2)*255) 58 | color_red = int((salary - salary_min) / (salary_scale/2)*255) 59 | 60 | color = '#{:02x}{:02x}{:02x}'.format(color_red,color_green,color_blue) 61 | 62 | 63 | x, y = cn_map(longitude,latitude) 64 | cn_map.plot(x,y,marker='o',color=color,markersize=int(math.sqrt(headcount/headcount_scale))+8) 65 | #ax.annotate(city, (x,y), xytext=(5, 5), textcoords='offset points', fontsize=15) 66 | #"{}{:.0f}".format(city_cn, salary) 67 | #ax.text(x+5, y+5,city , fontweight='bold', fontsize=int(headcount/2500+12)) 68 | fontsize=int(math.sqrt(headcount/headcount_scale))+13 69 | if city == '杭州': 70 | x=x-400000 71 | y=y+10000 72 | elif city=='广州': 73 | x=x-400000 74 | y=y+10000 75 | elif city=='合肥': 76 | x=x-300000 77 | y=y+10000 78 | elif city=='深圳': 79 | y=y-100000 80 | elif city=='南京': 81 | x=x-100000 82 | elif city=='天津': 83 | y=y-50000 84 | elif city=='上海': 85 | x=x+50000 86 | elif city=='武汉': 87 | y=y-50000 88 | elif city=='厦门': 89 | pass 90 | elif city=='福州': 91 | pass 92 | elif city=='苏州': 93 | y=y-100000 94 | pass 95 | elif city=='宁波': 96 | y=y-100000 97 | pass 98 | 99 | ax.text(x, y, "{}{:.0f}k".format(city, np.round(salary/1000)), fontweight='bold', fontsize=fontsize, bbox={'facecolor':color, 'alpha':0.3, 'pad':0}) 100 | ax.text(1100000, 1077845, "https://github.com/juwikuang/china_job_survey".format(city, np.round(salary/1000)), fontweight='bold',color='#999999', fontsize=20, bbox={'facecolor':'#eeeeee', 'alpha':0.4, 'pad':0}) 101 | ax.text(205805, 107845, 
"https://github.com/juwikuang/china_job_survey".format(city, np.round(salary/1000)), fontweight='bold',color='#999999', fontsize=20, bbox={'facecolor':'#eeeeee', 'alpha':0.4, 'pad':0}) 102 | ax.text(805805, 4007845, title.format(city, np.round(salary/1000)), fontweight='bold',color='#111111', fontsize=25) 103 | ax.text(805805, 3807845, "(城市大小代表招聘数量,颜色代表工资,红色最高,黄色次之,蓝最少)", fontweight='bold',color='#111111', fontsize=13) 104 | #cn_map.drawcoastlines() #绘制海岸线 105 | #cn_map.drawcountries(linewidth=1.5) #绘制国家边界线 106 | plt.show() 107 | 108 | def draw_province_map(data_city,headcount_scale, title): 109 | 110 | data_location = pd.read_csv('../geo_data/provincial_capital_locations.csv', encoding='utf-8') 111 | data_location=data_location.set_index('province') 112 | 113 | #cities = [] 114 | scale = 5 115 | 116 | locations = [(116.407526, 39.90403),(120, 30)] 117 | #fig=plt.figure(figsize=(18, 16), dpi= 80, facecolor='w', edgecolor='k') 118 | plt.rcParams['figure.figsize'] = [13, 13] 119 | #plt.figure(figsize = (10,5)) 120 | fig, ax = plt.subplots() 121 | fig.title=title 122 | fig.figsize=(10,5) 123 | fig.dpi=80 124 | cn_map= Basemap(llcrnrlon=77, llcrnrlat=14, urcrnrlon=140, urcrnrlat=51, \ 125 | projection='lcc', lat_1=33, lat_2=45, lon_0=100) # ‘lcc'将投影方式设置为兰伯特投影 126 | 127 | # load the shapefile, use the name 'states' 128 | cn_map.readshapefile(r'D:/data/basemap/gadm36_CHN_shp/gadm36_CHN_1', name='china', drawbounds=True, color='gray') 129 | cn_map.readshapefile(r'D:/data/basemap/gadm36_TWN_shp/gadm36_TWN_1', name='taiwan', drawbounds=True, color='gray') 130 | #geolocator = Nominatim(user_agent="my-application") 131 | 132 | 133 | 134 | salary_min=data_city.salary_mean.min() 135 | salary_max=data_city.salary_mean.max() 136 | salary_middle = (salary_min+salary_max)/2 137 | salary_scale=salary_max-salary_min 138 | 139 | for index, row in data_city.iterrows(): 140 | province=row['province'] 141 | 142 | longitude = data_location.loc[province,'longitude'] 143 | latitude = 
data_location.loc[province,'latitude'] 144 | salary=row[1] 145 | headcount=row[5] 146 | #color 147 | color_red=0 148 | color_green=0 149 | color_blue=0 150 | if salary>salary_middle: 151 | color_red = 255 #int((salary - salary_middle) / (salary_scale/2)*255) 152 | color_green = int((salary_max - salary) / (salary_scale/2)*255) 153 | else: 154 | color_blue = int((salary_middle - salary) / (salary_scale/2)*255) 155 | color_green = int((salary - salary_min) / (salary_scale/2)*255) 156 | color_red = int((salary - salary_min) / (salary_scale/2)*255) 157 | 158 | color = '#{:02x}{:02x}{:02x}'.format(color_red,color_green,color_blue) 159 | 160 | 161 | x, y = cn_map(longitude,latitude) 162 | cn_map.plot(x,y,marker='o',color=color,markersize=int(math.sqrt(headcount/headcount_scale))+8) 163 | #ax.annotate(city, (x,y), xytext=(5, 5), textcoords='offset points', fontsize=15) 164 | #"{}{:.0f}".format(city_cn, salary) 165 | #ax.text(x+5, y+5,city , fontweight='bold', fontsize=int(headcount/2500+12)) 166 | fontsize=int(math.sqrt(headcount/headcount_scale))+13 167 | if province == '浙江': 168 | #x=x-400000 169 | y=y-100000 170 | 171 | elif province=='安徽': 172 | x=x-300000 173 | y=y+10000 174 | elif province=='江苏': 175 | x=x-150000 176 | elif province=='天津': 177 | y=y-50000 178 | elif province=='上海': 179 | x=x+50000 180 | elif province=='湖北': 181 | y=y-50000 182 | 183 | ax.text(x, y, "{}{:.0f}k".format(province, np.round(salary/1000)), fontweight='bold', fontsize=fontsize, bbox={'facecolor':color, 'alpha':0.3, 'pad':0}) 184 | ax.text(2053805, 1077845, "https://github.com/juwikuang/china_job_survey".format(province, np.round(salary/1000)), fontweight='bold',color='#999999', fontsize=20, bbox={'facecolor':'#eeeeee', 'alpha':0.4, 'pad':0}) 185 | ax.text(205805, 107845, "https://github.com/juwikuang/china_job_survey".format(province, np.round(salary/1000)), fontweight='bold',color='#999999', fontsize=20, bbox={'facecolor':'#eeeeee', 'alpha':0.4, 'pad':0}) 186 | ax.text(805805, 4007845, 
def describe(data_city, career, report_year=None, report_month=None):
    """Print a one-sentence hiring and salary summary for each row of ``data_city``.

    Columns are read by position: 0 is the place name, 1 the mean salary,
    3 the median salary, 2 and 4 the lower/upper bounds of the range printed
    as covering 95% of salaries, and 5 the headcount.

    Args:
        data_city: DataFrame with at least six columns in the order above.
        career: job title inserted into each sentence.
        report_year: year to print; defaults to the module-level ``year``
            provided by ``config``.
        report_month: month to print; defaults to the module-level ``month``
            provided by ``config``.
    """
    if report_year is None:
        report_year = year
    if report_month is None:
        report_month = month

    for _, row in data_city.iterrows():
        # .iloc makes the positional access explicit; plain integer keys on a
        # label-indexed Series rely on a deprecated pandas fallback.
        place = row.iloc[0]
        mean_salary = row.iloc[1]
        lower = row.iloc[2]
        median_salary = row.iloc[3]
        upper = row.iloc[4]
        headcount = row.iloc[5]
        # Both sentences use the report's own year (the second one used to be
        # pinned to 2019).
        print(f"{report_year}年{report_month}月{place}招收{career}{headcount}人。{report_year}年{report_month}月{place}{career}平均工资{mean_salary:.0f}元,工资中位数{median_salary:.0f}元,其中95%的人的工资介于{lower:.0f}元到{upper:.0f}元。\r\n")
shapefile, use the name 'states' 30 | cn_map.readshapefile(r'D:/data/basemap/gadm36_CHN_shp/gadm36_CHN_1', name='china', drawbounds=True, color='gray') 31 | cn_map.readshapefile(r'D:/data/basemap/gadm36_TWN_shp/gadm36_TWN_1', name='taiwan', drawbounds=True, color='gray') 32 | #geolocator = Nominatim(user_agent="my-application") 33 | 34 | 35 | 36 | salary_min=data_city['平均工资'].min() 37 | salary_max=data_city['平均工资'].max() 38 | salary_middle = (salary_min+salary_max)/2 39 | salary_scale=salary_max-salary_min 40 | 41 | for index, row in data_city.iterrows(): 42 | city=row['city'] 43 | 44 | longitude = data_location.loc[city,'longitude'] 45 | latitude = data_location.loc[city,'latitude'] 46 | salary=row['平均工资'] 47 | headcount=row['招聘人数'] 48 | #color 49 | color_red=0 50 | color_green=0 51 | color_blue=0 52 | if salary>salary_middle: 53 | color_red = 255 #int((salary - salary_middle) / (salary_scale/2)*255) 54 | color_green = int((salary_max - salary) / (salary_scale/2)*255) 55 | else: 56 | color_blue = int((salary_middle - salary) / (salary_scale/2)*255) 57 | color_green = int((salary - salary_min) / (salary_scale/2)*255) 58 | color_red = int((salary - salary_min) / (salary_scale/2)*255) 59 | 60 | color = '#{:02x}{:02x}{:02x}'.format(color_red,color_green,color_blue) 61 | 62 | 63 | x, y = cn_map(longitude,latitude) 64 | cn_map.plot(x,y,marker='o',color=color,markersize=int(math.sqrt(headcount/headcount_scale))+8) 65 | #ax.annotate(city, (x,y), xytext=(5, 5), textcoords='offset points', fontsize=15) 66 | #"{}{:.0f}".format(city_cn, salary) 67 | #ax.text(x+5, y+5,city , fontweight='bold', fontsize=int(headcount/2500+12)) 68 | fontsize=int(math.sqrt(headcount/headcount_scale))+13 69 | if city == '杭州': 70 | x=x-400000 71 | y=y+10000 72 | elif city=='广州': 73 | x=x-400000 74 | y=y+10000 75 | elif city=='合肥': 76 | x=x-300000 77 | y=y+10000 78 | elif city=='深圳': 79 | y=y-100000 80 | elif city=='南京': 81 | x=x-100000 82 | elif city=='天津': 83 | y=y-50000 84 | elif city=='上海': 85 | 
# Manual label nudges (in map projection units) for provinces whose labels
# would otherwise collide with a neighbour's marker or label.
_PROVINCE_LABEL_OFFSETS = {
    '浙江': (0, -100000),
    '安徽': (-300000, 10000),
    '江苏': (-150000, 0),
    '天津': (0, -50000),
    '上海': (50000, 0),
    '湖北': (0, -50000),
}


def _salary_color(salary, salary_min, salary_max):
    """Return a '#rrggbb' colour on a blue -> yellow -> red ramp for *salary*."""
    half_range = (salary_max - salary_min) / 2
    if half_range == 0:
        # Every salary is identical (e.g. a single row): use the ramp's
        # midpoint instead of dividing by zero.
        return '#ffff00'
    salary_middle = (salary_min + salary_max) / 2
    if salary > salary_middle:
        # Upper half: red stays saturated, green fades out towards the maximum.
        red = 255
        green = int((salary_max - salary) / half_range * 255)
        blue = 0
    else:
        # Lower half: blue fades out and red/green rise together towards yellow.
        blue = int((salary_middle - salary) / half_range * 255)
        red = green = int((salary - salary_min) / half_range * 255)
    return '#{:02x}{:02x}{:02x}'.format(red, green, blue)


def draw_province_map(data_city, headcount_scale, title,
                      locations_csv='../geo_data/provincial_capital_locations.csv',
                      shapefile_root='D:/data/basemap'):
    """Plot one marker and label per province on a map of China and show it.

    Args:
        data_city: DataFrame with a 'province' column and a 'salary_mean'
            column (used for the colour range). The plotted salary and the
            headcount are read by position from columns 1 and 5.
        headcount_scale: divisor applied to the headcount before taking the
            square root that sizes markers and labels.
        title: text drawn at the top of the map. It is passed through
            ``str.format`` with the last row's province and rounded salary (in
            thousands), so a title without placeholders is drawn unchanged.
        locations_csv: UTF-8 CSV with 'province', 'longitude' and 'latitude'
            columns.
        shapefile_root: directory containing the ``gadm36_CHN_shp`` and
            ``gadm36_TWN_shp`` shapefile folders.

    Raises:
        KeyError: if a province in ``data_city`` has no row in
            ``locations_csv``.
    """
    data_location = pd.read_csv(locations_csv, encoding='utf-8').set_index('province')

    # The figure size is set through rcParams, so it also applies to figures
    # created later in the same session.
    plt.rcParams['figure.figsize'] = [13, 13]
    fig, ax = plt.subplots()
    fig.set_dpi(80)

    # 'lcc' selects the Lambert conformal conic projection.
    cn_map = Basemap(llcrnrlon=77, llcrnrlat=14, urcrnrlon=140, urcrnrlat=51,
                     projection='lcc', lat_1=33, lat_2=45, lon_0=100)
    for country, layer in (('CHN', 'china'), ('TWN', 'taiwan')):
        cn_map.readshapefile(
            '{0}/gadm36_{1}_shp/gadm36_{1}_1'.format(shapefile_root, country),
            name=layer, drawbounds=True, color='gray')

    salary_min = data_city['salary_mean'].min()
    salary_max = data_city['salary_mean'].max()

    title_text = title
    for _, row in data_city.iterrows():
        province = row['province']
        longitude = data_location.loc[province, 'longitude']
        latitude = data_location.loc[province, 'latitude']
        # .iloc makes the positional access explicit; plain integer keys on a
        # label-indexed Series rely on a deprecated pandas fallback.
        salary = row.iloc[1]
        headcount = row.iloc[5]

        color = _salary_color(salary, salary_min, salary_max)
        size_boost = int(math.sqrt(headcount / headcount_scale))

        # The marker sits on the true location; only the label is nudged.
        x, y = cn_map(longitude, latitude)
        cn_map.plot(x, y, marker='o', color=color, markersize=size_boost + 8)

        dx, dy = _PROVINCE_LABEL_OFFSETS.get(province, (0, 0))
        salary_k = np.round(salary / 1000)
        ax.text(x + dx, y + dy, "{}{:.0f}k".format(province, salary_k),
                fontweight='bold', fontsize=size_boost + 13,
                bbox={'facecolor': color, 'alpha': 0.3, 'pad': 0})
        title_text = title.format(province, salary_k)

    # Per-figure annotations are drawn once, after all provinces are plotted.
    watermark_style = {
        'fontweight': 'bold',
        'color': '#999999',
        'fontsize': 20,
        'bbox': {'facecolor': '#eeeeee', 'alpha': 0.4, 'pad': 0},
    }
    for wx, wy in ((2053805, 1077845), (205805, 107845)):
        ax.text(wx, wy, "https://github.com/juwikuang/china_job_survey",
                **watermark_style)
    ax.text(805805, 4007845, title_text,
            fontweight='bold', color='#111111', fontsize=25)
    ax.text(805805, 3807845, "(城市大小代表招聘数量,颜色代表工资,红色最高,黄色次之,蓝最少)",
            fontweight='bold', color='#111111', fontsize=13)
    plt.show()
title): 9 | 10 | 11 | 12 | data_location = pd.read_csv('../city_locations.csv') 13 | data_location=data_location.set_index('city') 14 | 15 | #cities = [] 16 | scale = 5 17 | 18 | locations = [(116.407526, 39.90403),(120, 30)] 19 | #fig=plt.figure(figsize=(18, 16), dpi= 80, facecolor='w', edgecolor='k') 20 | plt.rcParams['figure.figsize'] = [13, 13] 21 | #plt.figure(figsize = (10,5)) 22 | fig, ax = plt.subplots() 23 | fig.title=title 24 | fig.figsize=(10,5) 25 | fig.dpi=80 26 | cn_map= Basemap(llcrnrlon=77, llcrnrlat=14, urcrnrlon=140, urcrnrlat=51, \ 27 | projection='lcc', lat_1=33, lat_2=45, lon_0=100) # ‘lcc'将投影方式设置为兰伯特投影 28 | 29 | # load the shapefile, use the name 'states' 30 | cn_map.readshapefile(r'D:/data/basemap/gadm36_CHN_shp/gadm36_CHN_1', name='china', drawbounds=True, color='gray') 31 | cn_map.readshapefile(r'D:/data/basemap/gadm36_TWN_shp/gadm36_TWN_1', name='taiwan', drawbounds=True, color='gray') 32 | #geolocator = Nominatim(user_agent="my-application") 33 | 34 | 35 | 36 | salary_min=data_city['平均工资'].min() 37 | salary_max=data_city['平均工资'].max() 38 | salary_middle = (salary_min+salary_max)/2 39 | salary_scale=salary_max-salary_min 40 | 41 | for index, row in data_city.iterrows(): 42 | city=row['city'] 43 | 44 | longitude = data_location.loc[city,'longitude'] 45 | latitude = data_location.loc[city,'latitude'] 46 | salary=row['平均工资'] 47 | headcount=row['招聘人数'] 48 | #color 49 | color_red=0 50 | color_green=0 51 | color_blue=0 52 | if salary>salary_middle: 53 | color_red = 255 #int((salary - salary_middle) / (salary_scale/2)*255) 54 | color_green = int((salary_max - salary) / (salary_scale/2)*255) 55 | else: 56 | color_blue = int((salary_middle - salary) / (salary_scale/2)*255) 57 | color_green = int((salary - salary_min) / (salary_scale/2)*255) 58 | color_red = int((salary - salary_min) / (salary_scale/2)*255) 59 | 60 | color = '#{:02x}{:02x}{:02x}'.format(color_red,color_green,color_blue) 61 | 62 | 63 | x, y = cn_map(longitude,latitude) 64 | 
cn_map.plot(x,y,marker='o',color=color,markersize=int(math.sqrt(headcount/headcount_scale))+8) 65 | #ax.annotate(city, (x,y), xytext=(5, 5), textcoords='offset points', fontsize=15) 66 | #"{}{:.0f}".format(city_cn, salary) 67 | #ax.text(x+5, y+5,city , fontweight='bold', fontsize=int(headcount/2500+12)) 68 | fontsize=int(math.sqrt(headcount/headcount_scale))+13 69 | if city == '杭州': 70 | x=x-400000 71 | y=y+10000 72 | elif city=='广州': 73 | x=x-400000 74 | y=y+10000 75 | elif city=='合肥': 76 | x=x-300000 77 | y=y+10000 78 | elif city=='深圳': 79 | y=y-100000 80 | elif city=='南京': 81 | x=x-100000 82 | elif city=='天津': 83 | y=y-50000 84 | elif city=='上海': 85 | x=x+50000 86 | elif city=='武汉': 87 | y=y-50000 88 | elif city=='厦门': 89 | pass 90 | elif city=='福州': 91 | pass 92 | elif city=='苏州': 93 | y=y-100000 94 | pass 95 | elif city=='宁波': 96 | y=y-100000 97 | pass 98 | 99 | ax.text(x, y, "{}{:.0f}k".format(city, np.round(salary/1000)), fontweight='bold', fontsize=fontsize, bbox={'facecolor':color, 'alpha':0.3, 'pad':0}) 100 | ax.text(1100000, 1077845, "https://github.com/juwikuang/china_job_survey".format(city, np.round(salary/1000)), fontweight='bold',color='#999999', fontsize=20, bbox={'facecolor':'#eeeeee', 'alpha':0.4, 'pad':0}) 101 | ax.text(205805, 107845, "https://github.com/juwikuang/china_job_survey".format(city, np.round(salary/1000)), fontweight='bold',color='#999999', fontsize=20, bbox={'facecolor':'#eeeeee', 'alpha':0.4, 'pad':0}) 102 | ax.text(805805, 4007845, title.format(city, np.round(salary/1000)), fontweight='bold',color='#111111', fontsize=25) 103 | ax.text(805805, 3807845, "(城市大小代表招聘数量,颜色代表工资,红色最高,黄色次之,蓝最少)", fontweight='bold',color='#111111', fontsize=13) 104 | #cn_map.drawcoastlines() #绘制海岸线 105 | #cn_map.drawcountries(linewidth=1.5) #绘制国家边界线 106 | plt.show() 107 | 108 | def draw_province_map(data_city,headcount_scale, title): 109 | 110 | data_location = pd.read_csv('../geo_data/provincial_capital_locations.csv', encoding='utf-8') 111 | 
data_location=data_location.set_index('province') 112 | 113 | #cities = [] 114 | scale = 5 115 | 116 | locations = [(116.407526, 39.90403),(120, 30)] 117 | #fig=plt.figure(figsize=(18, 16), dpi= 80, facecolor='w', edgecolor='k') 118 | plt.rcParams['figure.figsize'] = [13, 13] 119 | #plt.figure(figsize = (10,5)) 120 | fig, ax = plt.subplots() 121 | fig.title=title 122 | fig.figsize=(10,5) 123 | fig.dpi=80 124 | cn_map= Basemap(llcrnrlon=77, llcrnrlat=14, urcrnrlon=140, urcrnrlat=51, \ 125 | projection='lcc', lat_1=33, lat_2=45, lon_0=100) # ‘lcc'将投影方式设置为兰伯特投影 126 | 127 | # load the shapefile, use the name 'states' 128 | cn_map.readshapefile(r'D:/data/basemap/gadm36_CHN_shp/gadm36_CHN_1', name='china', drawbounds=True, color='gray') 129 | cn_map.readshapefile(r'D:/data/basemap/gadm36_TWN_shp/gadm36_TWN_1', name='taiwan', drawbounds=True, color='gray') 130 | #geolocator = Nominatim(user_agent="my-application") 131 | 132 | 133 | 134 | salary_min=data_city.salary_mean.min() 135 | salary_max=data_city.salary_mean.max() 136 | salary_middle = (salary_min+salary_max)/2 137 | salary_scale=salary_max-salary_min 138 | 139 | for index, row in data_city.iterrows(): 140 | province=row['province'] 141 | 142 | longitude = data_location.loc[province,'longitude'] 143 | latitude = data_location.loc[province,'latitude'] 144 | salary=row[1] 145 | headcount=row[5] 146 | #color 147 | color_red=0 148 | color_green=0 149 | color_blue=0 150 | if salary>salary_middle: 151 | color_red = 255 #int((salary - salary_middle) / (salary_scale/2)*255) 152 | color_green = int((salary_max - salary) / (salary_scale/2)*255) 153 | else: 154 | color_blue = int((salary_middle - salary) / (salary_scale/2)*255) 155 | color_green = int((salary - salary_min) / (salary_scale/2)*255) 156 | color_red = int((salary - salary_min) / (salary_scale/2)*255) 157 | 158 | color = '#{:02x}{:02x}{:02x}'.format(color_red,color_green,color_blue) 159 | 160 | 161 | x, y = cn_map(longitude,latitude) 162 | 
cn_map.plot(x,y,marker='o',color=color,markersize=int(math.sqrt(headcount/headcount_scale))+8) 163 | #ax.annotate(city, (x,y), xytext=(5, 5), textcoords='offset points', fontsize=15) 164 | #"{}{:.0f}".format(city_cn, salary) 165 | #ax.text(x+5, y+5,city , fontweight='bold', fontsize=int(headcount/2500+12)) 166 | fontsize=int(math.sqrt(headcount/headcount_scale))+13 167 | if province == '浙江': 168 | #x=x-400000 169 | y=y-100000 170 | 171 | elif province=='安徽': 172 | x=x-300000 173 | y=y+10000 174 | elif province=='江苏': 175 | x=x-150000 176 | elif province=='天津': 177 | y=y-50000 178 | elif province=='上海': 179 | x=x+50000 180 | elif province=='湖北': 181 | y=y-50000 182 | 183 | ax.text(x, y, "{}{:.0f}k".format(province, np.round(salary/1000)), fontweight='bold', fontsize=fontsize, bbox={'facecolor':color, 'alpha':0.3, 'pad':0}) 184 | ax.text(2053805, 1077845, "https://github.com/juwikuang/china_job_survey".format(province, np.round(salary/1000)), fontweight='bold',color='#999999', fontsize=20, bbox={'facecolor':'#eeeeee', 'alpha':0.4, 'pad':0}) 185 | ax.text(205805, 107845, "https://github.com/juwikuang/china_job_survey".format(province, np.round(salary/1000)), fontweight='bold',color='#999999', fontsize=20, bbox={'facecolor':'#eeeeee', 'alpha':0.4, 'pad':0}) 186 | ax.text(805805, 4007845, title.format(province, np.round(salary/1000)), fontweight='bold',color='#111111', fontsize=25) 187 | ax.text(805805, 3807845, "(城市大小代表招聘数量,颜色代表工资,红色最高,黄色次之,蓝最少)", fontweight='bold',color='#111111', fontsize=13) 188 | #cn_map.drawcoastlines() #绘制海岸线 189 | #cn_map.drawcountries(linewidth=1.5) #绘制国家边界线 190 | plt.show() 191 | 192 | 193 | def describe(data_city, career): 194 | 195 | for index, row in data_city.iterrows(): 196 | print(f"{year}年{month}月{row[0]}招收{career}{row[5]}人。2019年{month}月{row[0]}{career}平均工资{row[1]:.0f}元,工资中位数{row[3]:.0f}元,其中95%的人的工资介于{row[2]:.0f}元到{row[4]:.0f}元。\r\n") 197 | -------------------------------------------------------------------------------- 
/reports/202010/provinces_basemap.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [ 8 | { 9 | "ename": "ModuleNotFoundError", 10 | "evalue": "No module named 'mpl_toolkits.basemap'", 11 | "output_type": "error", 12 | "traceback": [ 13 | "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", 14 | "\u001b[1;31mModuleNotFoundError\u001b[0m Traceback (most recent call last)", 15 | "\u001b[1;32m\u001b[0m in \u001b[0;36m\u001b[1;34m\u001b[0m\n\u001b[0;32m 1\u001b[0m \u001b[1;32mfrom\u001b[0m \u001b[0mconfig\u001b[0m \u001b[1;32mimport\u001b[0m \u001b[1;33m*\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m----> 2\u001b[1;33m \u001b[1;32mfrom\u001b[0m \u001b[0mmap_wrapper\u001b[0m \u001b[1;32mimport\u001b[0m \u001b[1;33m*\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m", 16 | "\u001b[1;32mD:\\projects\\51job_survey\\51job_survey_py\\reports\\202010\\map_wrapper.py\u001b[0m in \u001b[0;36m\u001b[1;34m\u001b[0m\n\u001b[0;32m 4\u001b[0m \u001b[1;32mfrom\u001b[0m \u001b[0mconfig\u001b[0m \u001b[1;32mimport\u001b[0m \u001b[1;33m*\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 5\u001b[0m \u001b[1;32mimport\u001b[0m \u001b[0mmatplotlib\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mpyplot\u001b[0m \u001b[1;32mas\u001b[0m \u001b[0mplt\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m----> 6\u001b[1;33m \u001b[1;32mfrom\u001b[0m \u001b[0mmpl_toolkits\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mbasemap\u001b[0m \u001b[1;32mimport\u001b[0m \u001b[0mBasemap\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 7\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 8\u001b[0m \u001b[1;32mdef\u001b[0m 
\u001b[0mdraw_city_map\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mdata_city\u001b[0m\u001b[1;33m,\u001b[0m\u001b[0mheadcount_scale\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mtitle\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", 17 | "\u001b[1;31mModuleNotFoundError\u001b[0m: No module named 'mpl_toolkits.basemap'" 18 | ] 19 | } 20 | ], 21 | "source": [ 22 | "from config import *\n", 23 | "from map_wrapper import *" 24 | ] 25 | }, 26 | { 27 | "cell_type": "code", 28 | "execution_count": null, 29 | "metadata": {}, 30 | "outputs": [], 31 | "source": [ 32 | "print(f'{year}年{month}月')" 33 | ] 34 | }, 35 | { 36 | "cell_type": "code", 37 | "execution_count": null, 38 | "metadata": {}, 39 | "outputs": [], 40 | "source": [ 41 | "import pandas as pd\n", 42 | "import sys\n", 43 | "sys.path.append('../../py')\n", 44 | "import db\n", 45 | "import weighted\n", 46 | "import inspect\n", 47 | "import matplotlib.pyplot as plt\n", 48 | "plt.rcParams['font.sans-serif']=['SimHei'] #用来正常显示中文标签\n", 49 | "plt.rcParams['axes.unicode_minus']=False #用来正常显示负号\n", 50 | "%matplotlib inline\n", 51 | "from mpl_toolkits.basemap import Basemap\n", 52 | "import seaborn as sns\n", 53 | "import scipy.stats as stats\n", 54 | "import numpy as np\n", 55 | "import math\n", 56 | "from matplotlib.font_manager import _rebuild\n", 57 | "\n", 58 | "_rebuild() #reload一下" 59 | ] 60 | }, 61 | { 62 | "cell_type": "code", 63 | "execution_count": null, 64 | "metadata": {}, 65 | "outputs": [], 66 | "source": [ 67 | "conn=db.get_conn()\n", 68 | "data_original=pd.read_sql(sql=f\"select * from _{year}{month:02} where monthly_salary>0 and monthly_salary<80000\", con=conn)\n", 69 | "\n", 70 | "data=data_original[~data_original.job_id.isin(error_job_ids)]\n", 71 | "\n", 72 | "del data['publish_date']\n", 73 | "del data['published_on_weekend']\n", 74 | "del data['title']\n", 75 | "#del data['company_title']\n", 76 | "#del data['company_description']\n", 77 | "del 
data['job_description']\n", 78 | "del data['job_id']\n", 79 | "\n", 80 | "\n" 81 | ] 82 | }, 83 | { 84 | "cell_type": "code", 85 | "execution_count": null, 86 | "metadata": {}, 87 | "outputs": [], 88 | "source": [ 89 | "\n", 90 | "join_counts=[conn.execute(f\"select COUNT(1) from _{year}{month:02}\").fetchall()[0][0]]\n", 91 | "percents=[]\n", 92 | "for i in range(1,month-6+1):\n", 93 | " sql=f\"select COUNT(1) from _{year}{month:02} a join _{year}{month-i:02} b on a.job_id = b.job_id\"\n", 94 | " #print(sql)\n", 95 | " count=conn.execute(sql).fetchall()[0][0]\n", 96 | "\n", 97 | " join_counts.append(count)\n", 98 | " subtract = join_counts[i-1]-join_counts[i]\n", 99 | " percents.append(subtract*1.0/join_counts[i])\n", 100 | "\n", 101 | "percents.append(join_counts[-1]/join_counts[0])" 102 | ] 103 | }, 104 | { 105 | "cell_type": "code", 106 | "execution_count": null, 107 | "metadata": {}, 108 | "outputs": [], 109 | "source": [ 110 | "join_counts" 111 | ] 112 | }, 113 | { 114 | "cell_type": "code", 115 | "execution_count": null, 116 | "metadata": {}, 117 | "outputs": [], 118 | "source": [ 119 | "percents" 120 | ] 121 | }, 122 | { 123 | "cell_type": "code", 124 | "execution_count": null, 125 | "metadata": {}, 126 | "outputs": [], 127 | "source": [ 128 | "#plt.pie(percents, labels=['1','2','3','4','5','6','7','7+'])\n", 129 | "#plt.show()" 130 | ] 131 | }, 132 | { 133 | "cell_type": "code", 134 | "execution_count": null, 135 | "metadata": {}, 136 | "outputs": [], 137 | "source": [ 138 | "data.shape[0]" 139 | ] 140 | }, 141 | { 142 | "cell_type": "code", 143 | "execution_count": null, 144 | "metadata": {}, 145 | "outputs": [], 146 | "source": [ 147 | "conn.close()" 148 | ] 149 | }, 150 | { 151 | "cell_type": "code", 152 | "execution_count": null, 153 | "metadata": {}, 154 | "outputs": [], 155 | "source": [ 156 | "#Common Functions\n", 157 | "def get_sub_stats_by_col(data, col):\n", 158 | " categories=data[col].unique()\n", 159 | " salary_mean=[]\n", 160 | " 
salary_95_min=[]\n", 161 | " salary_95_max=[]\n", 162 | " salary_median=[]\n", 163 | "\n", 164 | " count=[]\n", 165 | " \n", 166 | " categorys_out=[]\n", 167 | " for category in categories:\n", 168 | " #print(feature)\n", 169 | " idata=data[data[col]==category]\n", 170 | " headcount=idata.headcount.sum()\n", 171 | " values = idata.monthly_salary.values\n", 172 | " weights = idata.headcount.values\n", 173 | " #print(str(headcount))\n", 174 | " if headcount==0:\n", 175 | " continue\n", 176 | " \n", 177 | " salary_mean.append(np.average(values, weights=weights))\n", 178 | " \n", 179 | "\n", 180 | " q = weighted.weighted_quantile(values,[0.025,0.5,0.975],weights)\n", 181 | " salary_95_min.append(q[0])\n", 182 | " salary_median.append(q[1])\n", 183 | " salary_95_max.append(q[2])\n", 184 | " count.append(idata.headcount.sum())\n", 185 | " categorys_out.append(category)\n", 186 | " sub_data=pd.DataFrame()\n", 187 | " sub_data[col]=[c for c in categorys_out]\n", 188 | " sub_data['salary_mean']=salary_mean\n", 189 | " sub_data['salary_95_min']=salary_95_min\n", 190 | " sub_data['salary_median']=salary_median\n", 191 | " sub_data['salary_95_max']=salary_95_max\n", 192 | " sub_data['head_count']=count\n", 193 | " sub_data['percentage']=count/np.sum(count)\n", 194 | " sub_data=sub_data.sort_values(by='salary_mean', ascending=False)\n", 195 | "\n", 196 | " return sub_data\n", 197 | "\n", 198 | "data_format={\"percentage\":\"{:.2%}\",\"salary_mean\":\"{:.0f}\",\"salary_median\":\"{:.0f}\",\"salary_95_min\":\"{:.0f}\",\"salary_95_max\":\"{:.0f}\"}\n", 199 | "\n" 200 | ] 201 | }, 202 | { 203 | "cell_type": "code", 204 | "execution_count": null, 205 | "metadata": {}, 206 | "outputs": [], 207 | "source": [ 208 | "data_career=get_sub_stats_by_col(data,'career')\n", 209 | "data_career.style.format(data_format)" 210 | ] 211 | }, 212 | { 213 | "cell_type": "markdown", 214 | "metadata": {}, 215 | "source": [ 216 | "# 程序员工资" 217 | ] 218 | }, 219 | { 220 | "cell_type": "code", 221 | 
"execution_count": null, 222 | "metadata": {}, 223 | "outputs": [], 224 | "source": [ 225 | "data_city=get_sub_stats_by_col(data,'province')\n", 226 | "#data_city.city=data_city.city.map(translate_dict)\n", 227 | "data_city.style.hide_index().format(data_format)" 228 | ] 229 | }, 230 | { 231 | "cell_type": "code", 232 | "execution_count": null, 233 | "metadata": {}, 234 | "outputs": [], 235 | "source": [] 236 | }, 237 | { 238 | "cell_type": "code", 239 | "execution_count": null, 240 | "metadata": {}, 241 | "outputs": [], 242 | "source": [ 243 | "describe(data_city,'程序员')" 244 | ] 245 | }, 246 | { 247 | "cell_type": "code", 248 | "execution_count": null, 249 | "metadata": {}, 250 | "outputs": [], 251 | "source": [] 252 | }, 253 | { 254 | "cell_type": "code", 255 | "execution_count": null, 256 | "metadata": {}, 257 | "outputs": [], 258 | "source": [ 259 | "draw_province_map(data_city,2000,'2019年5月中国大陆各省程序员工资')" 260 | ] 261 | } 262 | ], 263 | "metadata": { 264 | "kernelspec": { 265 | "display_name": "Python 3", 266 | "language": "python", 267 | "name": "python3" 268 | }, 269 | "language_info": { 270 | "codemirror_mode": { 271 | "name": "ipython", 272 | "version": 3 273 | }, 274 | "file_extension": ".py", 275 | "mimetype": "text/x-python", 276 | "name": "python", 277 | "nbconvert_exporter": "python", 278 | "pygments_lexer": "ipython3", 279 | "version": "3.8.3" 280 | } 281 | }, 282 | "nbformat": 4, 283 | "nbformat_minor": 4 284 | } 285 | -------------------------------------------------------------------------------- /reports/202011/config.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | pwd=os.getcwd() 4 | year_month=pwd.split('\\')[-1] 5 | 6 | year=int(year_month[:4]) 7 | month=int(year_month[4:]) 8 | -------------------------------------------------------------------------------- /reports/202011/map_wrapper.py: -------------------------------------------------------------------------------- 1 | import math 2 | 
# Text stamped on every map: repository watermark and the colour/size legend.
_WATERMARK = "https://github.com/juwikuang/china_job_survey"
_MAP_LEGEND = "(城市大小代表招聘数量,颜色代表工资,红色最高,黄色次之,蓝最少)"

# Per-label nudges (dx, dy), expressed in the coordinates returned by calling
# the Basemap instance, that keep neighbouring labels from overlapping.
_CITY_LABEL_OFFSETS = {
    '杭州': (-400000, 10000),
    '广州': (-400000, 10000),
    '合肥': (-300000, 10000),
    '深圳': (0, -100000),
    '南京': (-100000, 0),
    '天津': (0, -50000),
    '上海': (50000, 0),
    '武汉': (0, -50000),
    '苏州': (0, -100000),
    '宁波': (0, -100000),
}
_PROVINCE_LABEL_OFFSETS = {
    '浙江': (0, -100000),
    '安徽': (-300000, 10000),
    '江苏': (-150000, 0),
    '天津': (0, -50000),
    '上海': (50000, 0),
    '湖北': (0, -50000),
}


def _salary_color(salary, salary_min, salary_max):
    """Return a '#rrggbb' colour for *salary* on a blue -> yellow -> red scale.

    The lowest salary maps to blue, the midpoint to yellow and the highest to
    red. When every salary is identical (zero range) the midpoint colour is
    returned instead of dividing by zero.
    """
    half_range = (salary_max - salary_min) / 2
    if half_range <= 0:
        return '#ffff00'

    salary_middle = (salary_min + salary_max) / 2
    if salary > salary_middle:
        red = 255
        green = int((salary_max - salary) / half_range * 255)
        blue = 0
    else:
        blue = int((salary_middle - salary) / half_range * 255)
        green = int((salary - salary_min) / half_range * 255)
        red = green

    # Clamp so a salary outside [salary_min, salary_max] still yields a valid colour.
    red, green, blue = (min(255, max(0, channel)) for channel in (red, green, blue))
    return '#{:02x}{:02x}{:02x}'.format(red, green, blue)


def _draw_salary_map(entries, data_location, salary_min, salary_max,
                     headcount_scale, title, label_offsets, watermark_xy):
    """Plot one marker and one label per entry on a map of China and show it.

    entries: iterable of (name, salary, headcount) tuples.
    data_location: DataFrame indexed by name with 'longitude'/'latitude' columns.
    label_offsets: dict mapping name -> (dx, dy) label nudge.
    watermark_xy: position of the first watermark (the second one is fixed).

    Raises KeyError if an entry's name is missing from data_location.
    """
    plt.rcParams['figure.figsize'] = [13, 13]
    fig, ax = plt.subplots()
    # The original also assigned fig.title / fig.figsize; those are not Figure
    # properties and had no effect, so only the dpi assignment is kept.
    fig.dpi = 80

    # 'lcc' selects the Lambert conformal conic projection.
    cn_map = Basemap(llcrnrlon=77, llcrnrlat=14, urcrnrlon=140, urcrnrlat=51,
                     projection='lcc', lat_1=33, lat_2=45, lon_0=100)
    cn_map.readshapefile(r'D:/data/basemap/gadm36_CHN_shp/gadm36_CHN_1',
                         name='china', drawbounds=True, color='gray')
    cn_map.readshapefile(r'D:/data/basemap/gadm36_TWN_shp/gadm36_TWN_1',
                         name='taiwan', drawbounds=True, color='gray')

    for name, salary, headcount in entries:
        longitude = data_location.loc[name, 'longitude']
        latitude = data_location.loc[name, 'latitude']
        color = _salary_color(salary, salary_min, salary_max)
        # Marker and font size both grow with the square root of the headcount.
        size = int(math.sqrt(headcount / headcount_scale))

        x, y = cn_map(longitude, latitude)
        cn_map.plot(x, y, marker='o', color=color, markersize=size + 8)

        dx, dy = label_offsets.get(name, (0, 0))
        ax.text(x + dx, y + dy,
                "{}{:.0f}k".format(name, np.round(salary / 1000)),
                fontweight='bold', fontsize=size + 13,
                bbox={'facecolor': color, 'alpha': 0.3, 'pad': 0})

    # Watermarks, title and legend are drawn once, after all markers, and the
    # title is rendered verbatim (no str.format pass over it).
    watermark_box = {'facecolor': '#eeeeee', 'alpha': 0.4, 'pad': 0}
    ax.text(watermark_xy[0], watermark_xy[1], _WATERMARK, fontweight='bold',
            color='#999999', fontsize=20, bbox=watermark_box)
    ax.text(205805, 107845, _WATERMARK, fontweight='bold',
            color='#999999', fontsize=20, bbox=watermark_box)
    ax.text(805805, 4007845, title, fontweight='bold', color='#111111', fontsize=25)
    ax.text(805805, 3807845, _MAP_LEGEND, fontweight='bold', color='#111111', fontsize=13)
    plt.show()


def draw_city_map(data_city, headcount_scale, title):
    """Draw the per-city salary map.

    data_city must have the columns 'city', '平均工资' (salary) and '招聘人数'
    (headcount). City coordinates are read from '../city_locations.csv'.
    headcount_scale divides the headcount before it is turned into a marker
    size; title is printed at the top of the map.
    """
    data_location = pd.read_csv('../city_locations.csv').set_index('city')
    salaries = data_city['平均工资']
    entries = (
        (row['city'], row['平均工资'], row['招聘人数'])
        for _, row in data_city.iterrows()
    )
    _draw_salary_map(entries, data_location, salaries.min(), salaries.max(),
                     headcount_scale, title, _CITY_LABEL_OFFSETS,
                     (1100000, 1077845))


def draw_province_map(data_city, headcount_scale, title):
    """Draw the per-province salary map.

    data_city must have a 'province' column and a 'salary_mean' column; the
    salary and headcount of each row are taken from column positions 1 and 5
    (presumably 'salary_mean' and 'head_count' as built by the notebooks'
    get_sub_stats_by_col -- confirm if the column order ever changes).
    Coordinates are read from '../geo_data/provincial_capital_locations.csv'.
    """
    data_location = pd.read_csv('../geo_data/provincial_capital_locations.csv',
                                encoding='utf-8').set_index('province')
    salaries = data_city.salary_mean
    # .iloc keeps the positional lookup explicit; plain row[1] on a
    # string-labelled Series is deprecated in pandas.
    entries = (
        (row['province'], row.iloc[1], row.iloc[5])
        for _, row in data_city.iterrows()
    )
    _draw_salary_map(entries, data_location, salaries.min(), salaries.max(),
                     headcount_scale, title, _PROVINCE_LABEL_OFFSETS,
                     (2053805, 1077845))


def describe(data_city, career, report_year=None, report_month=None):
    """Print one summary sentence pair per row of *data_city*.

    Columns are read by position: 0 name, 1 mean salary, 2 lower bound of the
    95% interval, 3 median salary, 4 upper bound of the 95% interval,
    5 headcount.

    report_year / report_month default to the module-level ``year`` and
    ``month`` (brought in by ``from config import *``). Both sentences now use
    the report year; the second one used to be hard-coded to 2019.
    """
    shown_year = year if report_year is None else report_year
    shown_month = month if report_month is None else report_month

    for _, row in data_city.iterrows():
        name, mean, low, median, high, headcount = row.iloc[:6]
        print(f"{shown_year}年{shown_month}月{name}招收{career}{headcount}人。{shown_year}年{shown_month}月{name}{career}平均工资{mean:.0f}元,工资中位数{median:.0f}元,其中95%的人的工资介于{low:.0f}元到{high:.0f}元。\r\n")
\u001b[0mBasemap\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 7\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 8\u001b[0m \u001b[1;32mdef\u001b[0m \u001b[0mdraw_city_map\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mdata_city\u001b[0m\u001b[1;33m,\u001b[0m\u001b[0mheadcount_scale\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mtitle\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", 17 | "\u001b[1;31mModuleNotFoundError\u001b[0m: No module named 'mpl_toolkits.basemap'" 18 | ] 19 | } 20 | ], 21 | "source": [ 22 | "from config import *\n", 23 | "from map_wrapper import *" 24 | ] 25 | }, 26 | { 27 | "cell_type": "code", 28 | "execution_count": null, 29 | "metadata": {}, 30 | "outputs": [], 31 | "source": [ 32 | "print(f'{year}年{month}月')" 33 | ] 34 | }, 35 | { 36 | "cell_type": "code", 37 | "execution_count": null, 38 | "metadata": {}, 39 | "outputs": [], 40 | "source": [ 41 | "import pandas as pd\n", 42 | "import sys\n", 43 | "sys.path.append('../../py')\n", 44 | "import db\n", 45 | "import weighted\n", 46 | "import inspect\n", 47 | "import matplotlib.pyplot as plt\n", 48 | "plt.rcParams['font.sans-serif']=['SimHei'] #用来正常显示中文标签\n", 49 | "plt.rcParams['axes.unicode_minus']=False #用来正常显示负号\n", 50 | "%matplotlib inline\n", 51 | "from mpl_toolkits.basemap import Basemap\n", 52 | "import seaborn as sns\n", 53 | "import scipy.stats as stats\n", 54 | "import numpy as np\n", 55 | "import math\n", 56 | "from matplotlib.font_manager import _rebuild\n", 57 | "\n", 58 | "_rebuild() #reload一下" 59 | ] 60 | }, 61 | { 62 | "cell_type": "code", 63 | "execution_count": null, 64 | "metadata": {}, 65 | "outputs": [], 66 | "source": [ 67 | "conn=db.get_conn()\n", 68 | "data_original=pd.read_sql(sql=f\"select * from _{year}{month:02} where monthly_salary>0 and monthly_salary<80000\", con=conn)\n", 69 | "\n", 70 | "data=data_original[~data_original.job_id.isin(error_job_ids)]\n", 71 | "\n", 72 | "del 
data['publish_date']\n", 73 | "del data['published_on_weekend']\n", 74 | "del data['title']\n", 75 | "#del data['company_title']\n", 76 | "#del data['company_description']\n", 77 | "del data['job_description']\n", 78 | "del data['job_id']\n", 79 | "\n", 80 | "\n" 81 | ] 82 | }, 83 | { 84 | "cell_type": "code", 85 | "execution_count": null, 86 | "metadata": {}, 87 | "outputs": [], 88 | "source": [ 89 | "\n", 90 | "join_counts=[conn.execute(f\"select COUNT(1) from _{year}{month:02}\").fetchall()[0][0]]\n", 91 | "percents=[]\n", 92 | "for i in range(1,month-6+1):\n", 93 | " sql=f\"select COUNT(1) from _{year}{month:02} a join _{year}{month-i:02} b on a.job_id = b.job_id\"\n", 94 | " #print(sql)\n", 95 | " count=conn.execute(sql).fetchall()[0][0]\n", 96 | "\n", 97 | " join_counts.append(count)\n", 98 | " subtract = join_counts[i-1]-join_counts[i]\n", 99 | " percents.append(subtract*1.0/join_counts[i])\n", 100 | "\n", 101 | "percents.append(join_counts[-1]/join_counts[0])" 102 | ] 103 | }, 104 | { 105 | "cell_type": "code", 106 | "execution_count": null, 107 | "metadata": {}, 108 | "outputs": [], 109 | "source": [ 110 | "join_counts" 111 | ] 112 | }, 113 | { 114 | "cell_type": "code", 115 | "execution_count": null, 116 | "metadata": {}, 117 | "outputs": [], 118 | "source": [ 119 | "percents" 120 | ] 121 | }, 122 | { 123 | "cell_type": "code", 124 | "execution_count": null, 125 | "metadata": {}, 126 | "outputs": [], 127 | "source": [ 128 | "#plt.pie(percents, labels=['1','2','3','4','5','6','7','7+'])\n", 129 | "#plt.show()" 130 | ] 131 | }, 132 | { 133 | "cell_type": "code", 134 | "execution_count": null, 135 | "metadata": {}, 136 | "outputs": [], 137 | "source": [ 138 | "data.shape[0]" 139 | ] 140 | }, 141 | { 142 | "cell_type": "code", 143 | "execution_count": null, 144 | "metadata": {}, 145 | "outputs": [], 146 | "source": [ 147 | "conn.close()" 148 | ] 149 | }, 150 | { 151 | "cell_type": "code", 152 | "execution_count": null, 153 | "metadata": {}, 154 | "outputs": 
[], 155 | "source": [ 156 | "#Common Functions\n", 157 | "def get_sub_stats_by_col(data, col):\n", 158 | " categories=data[col].unique()\n", 159 | " salary_mean=[]\n", 160 | " salary_95_min=[]\n", 161 | " salary_95_max=[]\n", 162 | " salary_median=[]\n", 163 | "\n", 164 | " count=[]\n", 165 | " \n", 166 | " categorys_out=[]\n", 167 | " for category in categories:\n", 168 | " #print(feature)\n", 169 | " idata=data[data[col]==category]\n", 170 | " headcount=idata.headcount.sum()\n", 171 | " values = idata.monthly_salary.values\n", 172 | " weights = idata.headcount.values\n", 173 | " #print(str(headcount))\n", 174 | " if headcount==0:\n", 175 | " continue\n", 176 | " \n", 177 | " salary_mean.append(np.average(values, weights=weights))\n", 178 | " \n", 179 | "\n", 180 | " q = weighted.weighted_quantile(values,[0.025,0.5,0.975],weights)\n", 181 | " salary_95_min.append(q[0])\n", 182 | " salary_median.append(q[1])\n", 183 | " salary_95_max.append(q[2])\n", 184 | " count.append(idata.headcount.sum())\n", 185 | " categorys_out.append(category)\n", 186 | " sub_data=pd.DataFrame()\n", 187 | " sub_data[col]=[c for c in categorys_out]\n", 188 | " sub_data['salary_mean']=salary_mean\n", 189 | " sub_data['salary_95_min']=salary_95_min\n", 190 | " sub_data['salary_median']=salary_median\n", 191 | " sub_data['salary_95_max']=salary_95_max\n", 192 | " sub_data['head_count']=count\n", 193 | " sub_data['percentage']=count/np.sum(count)\n", 194 | " sub_data=sub_data.sort_values(by='salary_mean', ascending=False)\n", 195 | "\n", 196 | " return sub_data\n", 197 | "\n", 198 | "data_format={\"percentage\":\"{:.2%}\",\"salary_mean\":\"{:.0f}\",\"salary_median\":\"{:.0f}\",\"salary_95_min\":\"{:.0f}\",\"salary_95_max\":\"{:.0f}\"}\n", 199 | "\n" 200 | ] 201 | }, 202 | { 203 | "cell_type": "code", 204 | "execution_count": null, 205 | "metadata": {}, 206 | "outputs": [], 207 | "source": [ 208 | "data_career=get_sub_stats_by_col(data,'career')\n", 209 | "data_career.style.format(data_format)" 
210 | ] 211 | }, 212 | { 213 | "cell_type": "markdown", 214 | "metadata": {}, 215 | "source": [ 216 | "# 程序员工资" 217 | ] 218 | }, 219 | { 220 | "cell_type": "code", 221 | "execution_count": null, 222 | "metadata": {}, 223 | "outputs": [], 224 | "source": [ 225 | "data_city=get_sub_stats_by_col(data,'province')\n", 226 | "#data_city.city=data_city.city.map(translate_dict)\n", 227 | "data_city.style.hide_index().format(data_format)" 228 | ] 229 | }, 230 | { 231 | "cell_type": "code", 232 | "execution_count": null, 233 | "metadata": {}, 234 | "outputs": [], 235 | "source": [] 236 | }, 237 | { 238 | "cell_type": "code", 239 | "execution_count": null, 240 | "metadata": {}, 241 | "outputs": [], 242 | "source": [ 243 | "describe(data_city,'程序员')" 244 | ] 245 | }, 246 | { 247 | "cell_type": "code", 248 | "execution_count": null, 249 | "metadata": {}, 250 | "outputs": [], 251 | "source": [] 252 | }, 253 | { 254 | "cell_type": "code", 255 | "execution_count": null, 256 | "metadata": {}, 257 | "outputs": [], 258 | "source": [ 259 | "draw_province_map(data_city,2000,'2019年5月中国大陆各省程序员工资')" 260 | ] 261 | } 262 | ], 263 | "metadata": { 264 | "kernelspec": { 265 | "display_name": "Python 3", 266 | "language": "python", 267 | "name": "python3" 268 | }, 269 | "language_info": { 270 | "codemirror_mode": { 271 | "name": "ipython", 272 | "version": 3 273 | }, 274 | "file_extension": ".py", 275 | "mimetype": "text/x-python", 276 | "name": "python", 277 | "nbconvert_exporter": "python", 278 | "pygments_lexer": "ipython3", 279 | "version": "3.8.3" 280 | } 281 | }, 282 | "nbformat": 4, 283 | "nbformat_minor": 4 284 | } 285 | -------------------------------------------------------------------------------- /reports/202012/config.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | pwd=os.getcwd() 4 | year_month=pwd.split('\\')[-1] 5 | 6 | year=int(year_month[:4]) 7 | month=int(year_month[4:]) 8 | 
def draw_city_map(data_city, headcount_scale, title, *,
                  locations_csv='../city_locations.csv',
                  shapefile_dir='D:/data/basemap'):
    """Plot one marker and salary label per city on a map of China.

    Marker and label size grow with the square root of the headcount; the
    colour runs from blue (lowest salary) through yellow (middle) to red
    (highest salary).

    Parameters
    ----------
    data_city : pandas.DataFrame
        Needs the columns ``city``, ``平均工资`` (mean salary) and
        ``招聘人数`` (headcount).
    headcount_scale : number
        Divisor applied to the headcount before taking the square root.
    title : str
        Heading drawn on the figure. It is rendered verbatim.
    locations_csv : str, keyword-only
        CSV with the columns ``city``, ``longitude`` and ``latitude``.
    shapefile_dir : str, keyword-only
        Directory that contains the ``gadm36_CHN_shp`` and ``gadm36_TWN_shp``
        folders.

    Raises
    ------
    KeyError
        If a city in ``data_city`` is missing from ``locations_csv``.
    """
    # Per-city label shifts, in map projection units, that keep labels of
    # neighbouring cities from overlapping. Cities not listed are not shifted.
    label_offsets = {
        '杭州': (-400000, 10000),
        '广州': (-400000, 10000),
        '合肥': (-300000, 10000),
        '深圳': (0, -100000),
        '南京': (-100000, 0),
        '天津': (0, -50000),
        '上海': (50000, 0),
        '武汉': (0, -50000),
        '苏州': (0, -100000),
        '宁波': (0, -100000),
    }
    watermark = "https://github.com/juwikuang/china_job_survey"
    watermark_box = {'facecolor':'#eeeeee', 'alpha':0.4, 'pad':0}

    data_location = pd.read_csv(locations_csv).set_index('city')

    plt.rcParams['figure.figsize'] = [13, 13]
    fig, ax = plt.subplots()
    fig.dpi = 80
    # 'lcc' selects the Lambert conformal conic projection.
    cn_map = Basemap(llcrnrlon=77, llcrnrlat=14, urcrnrlon=140, urcrnrlat=51,
                     projection='lcc', lat_1=33, lat_2=45, lon_0=100)

    # Province outlines for mainland China and Taiwan.
    cn_map.readshapefile(f'{shapefile_dir}/gadm36_CHN_shp/gadm36_CHN_1', name='china', drawbounds=True, color='gray')
    cn_map.readshapefile(f'{shapefile_dir}/gadm36_TWN_shp/gadm36_TWN_1', name='taiwan', drawbounds=True, color='gray')

    salary_min = data_city['平均工资'].min()
    salary_max = data_city['平均工资'].max()
    salary_middle = (salary_min + salary_max) / 2
    half_range = (salary_max - salary_min) / 2

    def salary_to_hex(salary):
        """Map a salary to a blue -> yellow -> red hex colour."""
        if half_range == 0:
            # All salaries are equal: use the mid-scale colour instead of
            # dividing by zero.
            return '#ffff00'
        if salary > salary_middle:
            red = 255
            green = int((salary_max - salary) / half_range * 255)
            blue = 0
        else:
            red = green = int((salary - salary_min) / half_range * 255)
            blue = int((salary_middle - salary) / half_range * 255)
        return '#{:02x}{:02x}{:02x}'.format(red, green, blue)

    for _, row in data_city.iterrows():
        city = row['city']
        salary = row['平均工资']
        headcount = row['招聘人数']
        color = salary_to_hex(salary)

        x, y = cn_map(data_location.loc[city, 'longitude'],
                      data_location.loc[city, 'latitude'])
        size = int(math.sqrt(headcount / headcount_scale))
        cn_map.plot(x, y, marker='o', color=color, markersize=size + 8)

        # The marker stays on the city; only the label is shifted.
        dx, dy = label_offsets.get(city, (0, 0))
        ax.text(x + dx, y + dy, "{}{:.0f}k".format(city, np.round(salary/1000)), fontweight='bold', fontsize=size + 13, bbox={'facecolor':color, 'alpha':0.3, 'pad':0})

    ax.text(1100000, 1077845, watermark, fontweight='bold',color='#999999', fontsize=20, bbox=watermark_box)
    ax.text(205805, 107845, watermark, fontweight='bold',color='#999999', fontsize=20, bbox=watermark_box)
    ax.text(805805, 4007845, title, fontweight='bold',color='#111111', fontsize=25)
    ax.text(805805, 3807845, "(城市大小代表招聘数量,颜色代表工资,红色最高,黄色次之,蓝最少)", fontweight='bold',color='#111111', fontsize=13)
    plt.show()
(salary_scale/2)*255) 153 | else: 154 | color_blue = int((salary_middle - salary) / (salary_scale/2)*255) 155 | color_green = int((salary - salary_min) / (salary_scale/2)*255) 156 | color_red = int((salary - salary_min) / (salary_scale/2)*255) 157 | 158 | color = '#{:02x}{:02x}{:02x}'.format(color_red,color_green,color_blue) 159 | 160 | 161 | x, y = cn_map(longitude,latitude) 162 | cn_map.plot(x,y,marker='o',color=color,markersize=int(math.sqrt(headcount/headcount_scale))+8) 163 | #ax.annotate(city, (x,y), xytext=(5, 5), textcoords='offset points', fontsize=15) 164 | #"{}{:.0f}".format(city_cn, salary) 165 | #ax.text(x+5, y+5,city , fontweight='bold', fontsize=int(headcount/2500+12)) 166 | fontsize=int(math.sqrt(headcount/headcount_scale))+13 167 | if province == '浙江': 168 | #x=x-400000 169 | y=y-100000 170 | 171 | elif province=='安徽': 172 | x=x-300000 173 | y=y+10000 174 | elif province=='江苏': 175 | x=x-150000 176 | elif province=='天津': 177 | y=y-50000 178 | elif province=='上海': 179 | x=x+50000 180 | elif province=='湖北': 181 | y=y-50000 182 | 183 | ax.text(x, y, "{}{:.0f}k".format(province, np.round(salary/1000)), fontweight='bold', fontsize=fontsize, bbox={'facecolor':color, 'alpha':0.3, 'pad':0}) 184 | ax.text(2053805, 1077845, "https://github.com/juwikuang/china_job_survey".format(province, np.round(salary/1000)), fontweight='bold',color='#999999', fontsize=20, bbox={'facecolor':'#eeeeee', 'alpha':0.4, 'pad':0}) 185 | ax.text(205805, 107845, "https://github.com/juwikuang/china_job_survey".format(province, np.round(salary/1000)), fontweight='bold',color='#999999', fontsize=20, bbox={'facecolor':'#eeeeee', 'alpha':0.4, 'pad':0}) 186 | ax.text(805805, 4007845, title.format(province, np.round(salary/1000)), fontweight='bold',color='#111111', fontsize=25) 187 | ax.text(805805, 3807845, "(城市大小代表招聘数量,颜色代表工资,红色最高,黄色次之,蓝最少)", fontweight='bold',color='#111111', fontsize=13) 188 | #cn_map.drawcoastlines() #绘制海岸线 189 | #cn_map.drawcountries(linewidth=1.5) #绘制国家边界线 190 | 
def describe(data_city, career, report_year=None, report_month=None):
    """Print a one-sentence salary summary for every row of ``data_city``.

    Parameters
    ----------
    data_city : pandas.DataFrame
        Columns are read by position: 0 = place name, 1 = mean salary,
        2 = lower bound of the 95% band, 3 = median salary, 4 = upper bound
        of the 95% band, 5 = headcount.
    career : str
        Job title inserted into each sentence.
    report_year, report_month : int, optional
        Period named in the sentences. When omitted, the module-level
        ``year`` and ``month`` imported from ``config`` are used.
    """
    if report_year is None:
        report_year = year
    if report_month is None:
        report_month = month

    for _, row in data_city.iterrows():
        # .iloc keeps the positional access explicit; plain row[0] on a
        # label-indexed Series is deprecated in pandas.
        place = row.iloc[0]
        salary_mean = row.iloc[1]
        salary_low = row.iloc[2]
        salary_median = row.iloc[3]
        salary_high = row.iloc[4]
        headcount = row.iloc[5]
        # Both sentences use the report year; the second one used to
        # hard-code 2019.
        print(f"{report_year}年{report_month}月{place}招收{career}{headcount}人。{report_year}年{report_month}月{place}{career}平均工资{salary_mean:.0f}元,工资中位数{salary_median:.0f}元,其中95%的人的工资介于{salary_low:.0f}元到{salary_high:.0f}元。\r\n")
Nominatim(user_agent="my-application") 33 | 34 | 35 | 36 | salary_min=data_city['平均工资'].min() 37 | salary_max=data_city['平均工资'].max() 38 | salary_middle = (salary_min+salary_max)/2 39 | salary_scale=salary_max-salary_min 40 | 41 | for index, row in data_city.iterrows(): 42 | city=row['city'] 43 | 44 | longitude = data_location.loc[city,'longitude'] 45 | latitude = data_location.loc[city,'latitude'] 46 | salary=row['平均工资'] 47 | headcount=row['招聘人数'] 48 | #color 49 | color_red=0 50 | color_green=0 51 | color_blue=0 52 | if salary>salary_middle: 53 | color_red = 255 #int((salary - salary_middle) / (salary_scale/2)*255) 54 | color_green = int((salary_max - salary) / (salary_scale/2)*255) 55 | else: 56 | color_blue = int((salary_middle - salary) / (salary_scale/2)*255) 57 | color_green = int((salary - salary_min) / (salary_scale/2)*255) 58 | color_red = int((salary - salary_min) / (salary_scale/2)*255) 59 | 60 | color = '#{:02x}{:02x}{:02x}'.format(color_red,color_green,color_blue) 61 | 62 | 63 | x, y = cn_map(longitude,latitude) 64 | cn_map.plot(x,y,marker='o',color=color,markersize=int(math.sqrt(headcount/headcount_scale))+8) 65 | #ax.annotate(city, (x,y), xytext=(5, 5), textcoords='offset points', fontsize=15) 66 | #"{}{:.0f}".format(city_cn, salary) 67 | #ax.text(x+5, y+5,city , fontweight='bold', fontsize=int(headcount/2500+12)) 68 | fontsize=int(math.sqrt(headcount/headcount_scale))+13 69 | if city == '杭州': 70 | x=x-400000 71 | y=y+10000 72 | elif city=='广州': 73 | x=x-400000 74 | y=y+10000 75 | elif city=='合肥': 76 | x=x-300000 77 | y=y+10000 78 | elif city=='深圳': 79 | y=y-100000 80 | elif city=='南京': 81 | x=x-100000 82 | elif city=='天津': 83 | y=y-50000 84 | elif city=='上海': 85 | x=x+50000 86 | elif city=='武汉': 87 | y=y-50000 88 | elif city=='厦门': 89 | pass 90 | elif city=='福州': 91 | pass 92 | elif city=='苏州': 93 | y=y-100000 94 | pass 95 | elif city=='宁波': 96 | y=y-100000 97 | pass 98 | 99 | ax.text(x, y, "{}{:.0f}k".format(city, np.round(salary/1000)), 
def draw_province_map(data_city, headcount_scale, title, *,
                      locations_csv='../geo_data/provincial_capital_locations.csv',
                      shapefile_dir='D:/data/basemap'):
    """Plot one marker and salary label per province on a map of China.

    Each province is drawn at the coordinates listed for it in
    ``locations_csv``. Marker and label size grow with the square root of the
    headcount; the colour runs from blue (lowest salary) through yellow
    (middle) to red (highest salary).

    Parameters
    ----------
    data_city : pandas.DataFrame
        Needs the columns ``province`` and ``salary_mean``; the headcount is
        read from the sixth column (position 5).
    headcount_scale : number
        Divisor applied to the headcount before taking the square root.
    title : str
        Heading drawn on the figure. It is rendered verbatim.
    locations_csv : str, keyword-only
        CSV with the columns ``province``, ``longitude`` and ``latitude``.
    shapefile_dir : str, keyword-only
        Directory that contains the ``gadm36_CHN_shp`` and ``gadm36_TWN_shp``
        folders.

    Raises
    ------
    KeyError
        If a province in ``data_city`` is missing from ``locations_csv``.
    """
    # Per-province label shifts, in map projection units, that keep labels of
    # neighbouring provinces from overlapping. Others are not shifted.
    label_offsets = {
        '浙江': (0, -100000),
        '安徽': (-300000, 10000),
        '江苏': (-150000, 0),
        '天津': (0, -50000),
        '上海': (50000, 0),
        '湖北': (0, -50000),
    }
    watermark = "https://github.com/juwikuang/china_job_survey"
    watermark_box = {'facecolor':'#eeeeee', 'alpha':0.4, 'pad':0}

    data_location = pd.read_csv(locations_csv, encoding='utf-8').set_index('province')

    plt.rcParams['figure.figsize'] = [13, 13]
    fig, ax = plt.subplots()
    fig.dpi = 80
    # 'lcc' selects the Lambert conformal conic projection.
    cn_map = Basemap(llcrnrlon=77, llcrnrlat=14, urcrnrlon=140, urcrnrlat=51,
                     projection='lcc', lat_1=33, lat_2=45, lon_0=100)

    # Province outlines for mainland China and Taiwan.
    cn_map.readshapefile(f'{shapefile_dir}/gadm36_CHN_shp/gadm36_CHN_1', name='china', drawbounds=True, color='gray')
    cn_map.readshapefile(f'{shapefile_dir}/gadm36_TWN_shp/gadm36_TWN_1', name='taiwan', drawbounds=True, color='gray')

    salary_min = data_city.salary_mean.min()
    salary_max = data_city.salary_mean.max()
    salary_middle = (salary_min + salary_max) / 2
    half_range = (salary_max - salary_min) / 2

    def salary_to_hex(salary):
        """Map a salary to a blue -> yellow -> red hex colour."""
        if half_range == 0:
            # All salaries are equal: use the mid-scale colour instead of
            # dividing by zero.
            return '#ffff00'
        if salary > salary_middle:
            red = 255
            green = int((salary_max - salary) / half_range * 255)
            blue = 0
        else:
            red = green = int((salary - salary_min) / half_range * 255)
            blue = int((salary_middle - salary) / half_range * 255)
        return '#{:02x}{:02x}{:02x}'.format(red, green, blue)

    for _, row in data_city.iterrows():
        province = row['province']
        # Read the salary by name so it matches the column the colour scale
        # was computed from; plain row[1] positional access is deprecated.
        salary = row['salary_mean']
        headcount = row.iloc[5]
        color = salary_to_hex(salary)

        x, y = cn_map(data_location.loc[province, 'longitude'],
                      data_location.loc[province, 'latitude'])
        size = int(math.sqrt(headcount / headcount_scale))
        cn_map.plot(x, y, marker='o', color=color, markersize=size + 8)

        # The marker stays on the capital; only the label is shifted.
        dx, dy = label_offsets.get(province, (0, 0))
        ax.text(x + dx, y + dy, "{}{:.0f}k".format(province, np.round(salary/1000)), fontweight='bold', fontsize=size + 13, bbox={'facecolor':color, 'alpha':0.3, 'pad':0})

    ax.text(2053805, 1077845, watermark, fontweight='bold',color='#999999', fontsize=20, bbox=watermark_box)
    ax.text(205805, 107845, watermark, fontweight='bold',color='#999999', fontsize=20, bbox=watermark_box)
    ax.text(805805, 4007845, title, fontweight='bold',color='#111111', fontsize=25)
    ax.text(805805, 3807845, "(城市大小代表招聘数量,颜色代表工资,红色最高,黄色次之,蓝最少)", fontweight='bold',color='#111111', fontsize=13)
    plt.show()
province,city 16 | 北京,北京 17 | 上海,上海 18 | 天津,天津 19 | 重庆,重庆 20 | 深圳,深圳 21 | 河北,保定 22 | 河北,沧州 23 | 河北,承德 24 | 河北,邯郸 25 | 河北,衡水 26 | 河北,廊坊 27 | 河北,秦皇岛 28 | 河北,石家庄 29 | 河北,唐山 30 | 河北,邢台 31 | 河北,张家口 32 | 山西,长治 33 | 山西,大同 34 | 山西,晋城 35 | 山西,晋中 36 | 山西,临汾 37 | 山西,吕梁 38 | 山西,朔州 39 | 山西,太原 40 | 山西,忻州 41 | 山西,阳泉 42 | 山西,运城 43 | 内蒙古,阿拉善盟 44 | 内蒙古,巴彦淖尔 45 | 内蒙古,包头 46 | 内蒙古,赤峰 47 | 内蒙古,鄂尔多斯 48 | 内蒙古,呼和浩特 49 | 内蒙古,呼伦贝尔 50 | 内蒙古,通辽 51 | 内蒙古,乌海 52 | 内蒙古,乌兰察布 53 | 内蒙古,锡林郭勒盟 54 | 内蒙古,兴安盟 55 | 辽宁,鞍山 56 | 辽宁,本溪 57 | 辽宁,朝阳 58 | 辽宁,大连 59 | 辽宁,丹东 60 | 辽宁,抚顺 61 | 辽宁,阜新 62 | 辽宁,葫芦岛 63 | 辽宁,锦州 64 | 辽宁,辽阳 65 | 辽宁,盘锦 66 | 辽宁,沈阳 67 | 辽宁,铁岭 68 | 辽宁,营口 69 | 吉林,白城 70 | 吉林,白山 71 | 吉林,长春 72 | 吉林,吉林 73 | 吉林,辽源 74 | 吉林,四平 75 | 吉林,松原 76 | 吉林,通化 77 | 吉林,延边 78 | 黑龙江,大庆 79 | 黑龙江,大兴安岭 80 | 黑龙江,哈尔滨 81 | 黑龙江,鹤岗 82 | 黑龙江,黑河 83 | 黑龙江,鸡西 84 | 黑龙江,佳木斯 85 | 黑龙江,牡丹江 86 | 黑龙江,七台河 87 | 黑龙江,齐齐哈尔 88 | 黑龙江,双鸭山 89 | 黑龙江,绥化 90 | 黑龙江,伊春 91 | 江苏,常州 92 | 江苏,淮安 93 | 江苏,连云港 94 | 江苏,南京 95 | 江苏,南通 96 | 江苏,苏州 97 | 江苏,宿迁 98 | 江苏,泰州 99 | 江苏,无锡 100 | 江苏,徐州 101 | 江苏,盐城 102 | 江苏,扬州 103 | 江苏,镇江 104 | 浙江,杭州 105 | 浙江,湖州 106 | 浙江,嘉兴 107 | 浙江,金华 108 | 浙江,丽水 109 | 浙江,宁波 110 | 浙江,衢州 111 | 浙江,绍兴 112 | 浙江,台州 113 | 浙江,温州 114 | 浙江,舟山 115 | 安徽,安庆 116 | 安徽,蚌埠 117 | 安徽,亳州 118 | 安徽,巢湖 119 | 安徽,池州 120 | 安徽,滁州 121 | 安徽,阜阳 122 | 安徽,合肥 123 | 安徽,淮北 124 | 安徽,淮南 125 | 安徽,黄山 126 | 安徽,六安 127 | 安徽,马鞍山 128 | 安徽,宿州 129 | 安徽,铜陵 130 | 安徽,芜湖 131 | 安徽,宣城 132 | 福建,福州 133 | 福建,龙岩 134 | 福建,南平 135 | 福建,宁德 136 | 福建,莆田 137 | 福建,泉州 138 | 福建,三明 139 | 福建,厦门 140 | 福建,漳州 141 | 江西,抚州 142 | 江西,赣州 143 | 江西,吉安 144 | 江西,景德镇 145 | 江西,九江 146 | 江西,南昌 147 | 江西,萍乡 148 | 江西,上饶 149 | 江西,新余 150 | 江西,宜春 151 | 江西,鹰潭 152 | 山东,滨州 153 | 山东,德州 154 | 山东,东营 155 | 山东,菏泽 156 | 山东,济南 157 | 山东,济宁 158 | 山东,莱芜 159 | 山东,聊城 160 | 山东,临沂 161 | 山东,青岛 162 | 山东,日照 163 | 山东,泰安 164 | 山东,威海 165 | 山东,潍坊 166 | 山东,烟台 167 | 山东,枣庄 168 | 山东,淄博 169 | 河南,安阳 170 | 河南,鹤壁 171 | 河南,焦作 172 | 河南,开封 173 | 河南,洛阳 174 | 河南,漯河 175 | 河南,南阳 176 | 河南,平顶山 177 | 河南,濮阳 178 | 河南,三门峡 179 | 河南,商丘 180 | 河南,新乡 181 | 河南,信阳 182 | 河南,许昌 183 | 
河南,郑州 184 | 河南,周口 185 | 河南,驻马店 186 | 湖北,鄂州 187 | 湖北,恩施 188 | 湖北,黄冈 189 | 湖北,黄石 190 | 湖北,荆门 191 | 湖北,荆州 192 | 湖北,十堰 193 | 湖北,随州 194 | 湖北,武汉 195 | 湖北,咸宁 196 | 湖北,襄樊 197 | 湖北,孝感 198 | 湖北,宜昌 199 | 湖南,长沙 200 | 湖南,常德 201 | 湖南,郴州 202 | 湖南,衡阳 203 | 湖南,怀化 204 | 湖南,娄底 205 | 湖南,邵阳 206 | 湖南,湘潭 207 | 湖南,湘西 208 | 湖南,益阳 209 | 湖南,永州 210 | 湖南,岳阳 211 | 湖南,张家界 212 | 湖南,株洲 213 | 广东,潮州 214 | 广东,东莞 215 | 广东,佛山 216 | 广东,广州 217 | 广东,河源 218 | 广东,惠州 219 | 广东,江门 220 | 广东,揭阳 221 | 广东,茂名 222 | 广东,梅州 223 | 广东,清远 224 | 广东,汕头 225 | 广东,汕尾 226 | 广东,韶关 227 | 广东,深圳 228 | 广东,阳江 229 | 广东,云浮 230 | 广东,湛江 231 | 广东,肇庆 232 | 广东,中山 233 | 广东,珠海 234 | 广西,百色 235 | 广西,北海 236 | 广西,崇左 237 | 广西,防城港 238 | 广西,贵港 239 | 广西,桂林 240 | 广西,河池 241 | 广西,贺州 242 | 广西,来宾 243 | 广西,柳州 244 | 广西,南宁 245 | 广西,钦州 246 | 广西,梧州 247 | 广西,玉林 248 | 海南,海口 249 | 海南,三亚 250 | 海南,直辖县级行政区划 251 | 四川,阿坝 252 | 四川,巴中 253 | 四川,成都 254 | 四川,达州 255 | 四川,德阳 256 | 四川,甘孜 257 | 四川,广安 258 | 四川,广元 259 | 四川,乐山 260 | 四川,凉山 261 | 四川,泸州 262 | 四川,眉山 263 | 四川,绵阳 264 | 四川,内江 265 | 四川,南充 266 | 四川,攀枝花 267 | 四川,遂宁 268 | 四川,雅安 269 | 四川,宜宾 270 | 四川,资阳 271 | 四川,自贡 272 | 贵州,安顺 273 | 贵州,毕节 274 | 贵州,贵阳 275 | 贵州,六盘水 276 | 贵州,黔东南 277 | 贵州,黔南 278 | 贵州,黔西南 279 | 贵州,铜仁 280 | 贵州,遵义 281 | 云南,保山 282 | 云南,楚雄 283 | 云南,大理 284 | 云南,德宏 285 | 云南,迪庆 286 | 云南,红河 287 | 云南,昆明 288 | 云南,丽江 289 | 云南,临沧 290 | 云南,怒江 291 | 云南,普洱 292 | 云南,曲靖 293 | 云南,文山 294 | 云南,西双版纳 295 | 云南,玉溪 296 | 云南,昭通 297 | 西藏,阿里 298 | 西藏,昌都 299 | 西藏,拉萨 300 | 西藏,林芝 301 | 西藏,那曲 302 | 西藏,日喀则 303 | 西藏,山南 304 | 陕西,安康 305 | 陕西,宝鸡 306 | 陕西,汉中 307 | 陕西,商洛 308 | 陕西,铜川 309 | 陕西,渭南 310 | 陕西,西安 311 | 陕西,咸阳 312 | 陕西,延安 313 | 陕西,榆林 314 | 甘肃,白银 315 | 甘肃,定西 316 | 甘肃,甘南 317 | 甘肃,嘉峪关 318 | 甘肃,金昌 319 | 甘肃,酒泉 320 | 甘肃,兰州 321 | 甘肃,临夏 322 | 甘肃,陇南 323 | 甘肃,平凉 324 | 甘肃,庆阳 325 | 甘肃,天水 326 | 甘肃,武威 327 | 甘肃,张掖 328 | 青海,果洛 329 | 青海,海北 330 | 青海,海东 331 | 青海,海南 332 | 青海,海西 333 | 青海,黄南 334 | 青海,西宁 335 | 青海,玉树 336 | 宁夏,固原 337 | 宁夏,石嘴山 338 | 宁夏,吴忠 339 | 宁夏,银川 340 | 宁夏,中卫 341 | 新疆,阿克苏 342 | 新疆,阿勒泰 343 | 新疆,巴音郭楞 344 | 新疆,博尔塔拉 345 | 新疆,昌吉 346 | 新疆,哈密 347 | 新疆,和田 
348 | 新疆,喀什 349 | 新疆,克拉玛依 350 | 新疆,克孜勒苏柯尔克孜 351 | 新疆,塔城 352 | 新疆,吐鲁番 353 | 新疆,乌鲁木齐 354 | 新疆,伊犁哈萨克 355 | 新疆,直辖县级行政区划 356 | -------------------------------------------------------------------------------- /reports/geo_data/provincial_capital_locations.csv: -------------------------------------------------------------------------------- 1 | province,capital,longitude,latitude 2 | 辽宁,沈阳市,123.429092,41.796768 3 | 吉林,长春市,125.324501,43.886841 4 | 黑龙江,哈尔滨市,126.642464,45.756966 5 | 北京,北京市,116.405289,39.904987 6 | 天津,天津市,117.190186,39.125595 7 | 内蒙古,呼和浩特市,111.75199,40.84149 8 | 宁夏,银川市,106.23248,38.48644 9 | 山西,太原市,112.549248,37.857014 10 | 河北,石家庄市,114.502464,38.045475 11 | 山东,济南市,117.000923,36.675808 12 | 河南,郑州市,113.665413,34.757977 13 | 陕西,西安市,108.948021,34.263161 14 | 湖北,武汉市,114.298569,30.584354 15 | 江苏,南京市,118.76741,32.041546 16 | 安徽,合肥市,117.283043,31.861191 17 | 上海,上海市,121.472641,31.231707 18 | 湖南,长沙市,112.982277,28.19409 19 | 江西,南昌市,115.892151,28.676493 20 | 浙江,杭州市,120.15358,30.287458 21 | 福建,福州市,119.306236,26.075302 22 | 广东,广州市,113.28064,23.125177 23 | 台湾,台北市,121.520076,25.030724 24 | 海南,海口市,110.19989,20.04422 25 | 广西,南宁市,108.320007,22.82402 26 | 重庆,重庆市,106.504959,29.533155 27 | 云南,昆明市,102.71225,25.040609 28 | 贵州,贵阳市,106.713478,26.578342 29 | 四川,成都市,104.065735,30.659462 30 | 甘肃,兰州市,103.83417,36.06138 31 | 青海,西宁市,101.77782,36.61729 32 | 西藏,拉萨市,91.1145,29.64415 33 | 新疆,乌鲁木齐市,87.61688,43.82663 34 | 香港,香港,114.16546,22.27534 35 | 澳门,澳门,113.54913,22.19875 36 | -------------------------------------------------------------------------------- /sql/create_city_stats.sql: -------------------------------------------------------------------------------- 1 | USE [it_jobs] 2 | GO 3 | 4 | /****** Object: Table [dbo].[city_stats] Script Date: 8/2/2020 5:13:39 PM ******/ 5 | SET ANSI_NULLS ON 6 | GO 7 | 8 | SET QUOTED_IDENTIFIER ON 9 | GO 10 | 11 | CREATE TABLE [dbo].[city_stats]( 12 | [yearmonth] [int] NOT NULL, 13 | [city] [nvarchar](50) NOT NULL, 14 | [salary] [int] NOT NULL, 15 | 
CONSTRAINT [PK_city_stats] PRIMARY KEY CLUSTERED 16 | ( 17 | [yearmonth] ASC, 18 | [city] ASC 19 | )WITH (PAD_INDEX = OFF, STATISTICS_NORECOMPUTE = OFF, IGNORE_DUP_KEY = OFF, ALLOW_ROW_LOCKS = ON, ALLOW_PAGE_LOCKS = ON, OPTIMIZE_FOR_SEQUENTIAL_KEY = OFF) ON [PRIMARY] 20 | ) ON [PRIMARY] 21 | GO 22 | 23 | 24 | -------------------------------------------------------------------------------- /sql/create_company.sql: -------------------------------------------------------------------------------- 1 | USE [it_jobs] 2 | GO 3 | 4 | /****** Object: Table [dbo].[companies] Script Date: 6/28/2020 9:24:57 PM ******/ 5 | SET ANSI_NULLS ON 6 | GO 7 | 8 | SET QUOTED_IDENTIFIER ON 9 | GO 10 | 11 | CREATE TABLE [dbo].[companies]( 12 | [company_id] [nvarchar](100) NOT NULL, 13 | [company_size] [nvarchar](100) NOT NULL, 14 | [company_name] [nvarchar](100) NOT NULL, 15 | [company_type] [nvarchar](100) NOT NULL, 16 | [company_description] [varchar](max) NOT NULL, 17 | [company_industry] [nvarchar](100) NOT NULL, 18 | CONSTRAINT [PK_companies] PRIMARY KEY CLUSTERED 19 | ( 20 | [company_id] ASC 21 | )WITH (PAD_INDEX = OFF, STATISTICS_NORECOMPUTE = OFF, IGNORE_DUP_KEY = OFF, ALLOW_ROW_LOCKS = ON, ALLOW_PAGE_LOCKS = ON, OPTIMIZE_FOR_SEQUENTIAL_KEY = OFF) ON [PRIMARY] 22 | ) ON [PRIMARY] TEXTIMAGE_ON [PRIMARY] 23 | GO 24 | 25 | 26 | -------------------------------------------------------------------------------- /sql/create_general_stats.sql: -------------------------------------------------------------------------------- 1 | USE [it_jobs] 2 | GO 3 | 4 | /****** Object: Table [dbo].[general_stats] Script Date: 8/2/2020 5:14:16 PM ******/ 5 | SET ANSI_NULLS ON 6 | GO 7 | 8 | SET QUOTED_IDENTIFIER ON 9 | GO 10 | 11 | CREATE TABLE [dbo].[general_stats]( 12 | [yearmonth] [int] NOT NULL, 13 | [salary_mean] [int] NOT NULL, 14 | [salary_median] [int] NOT NULL, 15 | [jd_count] [int] NOT NULL, 16 | [head_count] [int] NOT NULL, 17 | CONSTRAINT [PK_general_stats] PRIMARY KEY CLUSTERED 18 | ( 19 | 
[yearmonth] ASC 20 | )WITH (PAD_INDEX = OFF, STATISTICS_NORECOMPUTE = OFF, IGNORE_DUP_KEY = OFF, ALLOW_ROW_LOCKS = ON, ALLOW_PAGE_LOCKS = ON, OPTIMIZE_FOR_SEQUENTIAL_KEY = OFF) ON [PRIMARY] 21 | ) ON [PRIMARY] 22 | GO 23 | 24 | 25 | -------------------------------------------------------------------------------- /sql/create_table.sql: -------------------------------------------------------------------------------- 1 | USE [jobs] 2 | GO 3 | 4 | /****** Object: Table [dbo].[_51jobs] Script Date: 4/5/2019 2:13:00 PM ******/ 5 | SET ANSI_NULLS ON 6 | GO 7 | 8 | SET QUOTED_IDENTIFIER ON 9 | GO 10 | 11 | CREATE TABLE [dbo].[_51jobs]( 12 | [ageism] [bit] NOT NULL, 13 | [career_algorithm] [bit] NOT NULL, 14 | [career_architect] [bit] NOT NULL, 15 | [career_software_engineer] [bit] NOT NULL, 16 | [city_beijing] [bit] NOT NULL, 17 | [city_changchun] [bit] NOT NULL, 18 | [city_changsha] [bit] NOT NULL, 19 | [city_chengdu] [bit] NOT NULL, 20 | [city_chongqing] [bit] NOT NULL, 21 | [city_dalian] [bit] NOT NULL, 22 | [city_dongguan] [bit] NOT NULL, 23 | [city_fuzhou] [bit] NOT NULL, 24 | [city_guangzhou] [bit] NOT NULL, 25 | [city_hangzhou] [bit] NOT NULL, 26 | [city_harbin] [bit] NOT NULL, 27 | [city_hefei] [bit] NOT NULL, 28 | [city_jinan] [bit] NOT NULL, 29 | [city_kuming] [bit] NOT NULL, 30 | [city_nanjing] [bit] NOT NULL, 31 | [city_ningbo] [bit] NOT NULL, 32 | [city_qingdao] [bit] NOT NULL, 33 | [city_shanghai] [bit] NOT NULL, 34 | [city_shenyang] [bit] NOT NULL, 35 | [city_shenzhen] [bit] NOT NULL, 36 | [city_tianjin] [bit] NOT NULL, 37 | [city_wuhan] [bit] NOT NULL, 38 | [city_xian] [bit] NOT NULL, 39 | [city_zhengzhou] [bit] NOT NULL, 40 | [company_description] [varchar](max) NOT NULL, 41 | [company_size_10000] [bit] NOT NULL, 42 | [company_size_1000_5000] [bit] NOT NULL, 43 | [company_size_150_500] [bit] NOT NULL, 44 | [company_size_50] [bit] NOT NULL, 45 | [company_size_5000_10000] [bit] NOT NULL, 46 | [company_size_500_1000] [bit] NOT NULL, 47 | [company_size_50_150] 
[bit] NOT NULL, 48 | [company_title] [varchar](max) NOT NULL, 49 | [company_tpye_jv] [bit] NOT NULL, 50 | [company_type_foreign] [bit] NOT NULL, 51 | [company_type_foreign_gov] [bit] NOT NULL, 52 | [company_type_foreign_rep] [bit] NOT NULL, 53 | [company_type_listed] [bit] NOT NULL, 54 | [company_type_non_profit] [bit] NOT NULL, 55 | [company_type_private] [bit] NOT NULL, 56 | [company_type_public_institution] [bit] NOT NULL, 57 | [company_type_startup] [bit] NOT NULL, 58 | [company_type_state] [bit] NOT NULL, 59 | [company_type_us_eu] [bit] NOT NULL, 60 | [db_Apache_Hive] [bit] NOT NULL, 61 | [db_CouchBase] [bit] NOT NULL, 62 | [db_CouchDB] [bit] NOT NULL, 63 | [db_DB2] [bit] NOT NULL, 64 | [db_DynamoDB] [bit] NOT NULL, 65 | [db_Elasticsearch] [bit] NOT NULL, 66 | [db_FileMaker] [bit] NOT NULL, 67 | [db_Firebase] [bit] NOT NULL, 68 | [db_Firebird] [bit] NOT NULL, 69 | [db_Hbase] [bit] NOT NULL, 70 | [db_Informix] [bit] NOT NULL, 71 | [db_Ingres] [bit] NOT NULL, 72 | [db_MariaDB] [bit] NOT NULL, 73 | [db_Memcached] [bit] NOT NULL, 74 | [db_MongoDB] [bit] NOT NULL, 75 | [db_MySQL] [bit] NOT NULL, 76 | [db_Neo4j] [bit] NOT NULL, 77 | [db_Netezza] [bit] NOT NULL, 78 | [db_Oracle] [bit] NOT NULL, 79 | [db_PostgreSQL] [bit] NOT NULL, 80 | [db_Redis] [bit] NOT NULL, 81 | [db_Riak] [bit] NOT NULL, 82 | [db_SAP_HANA] [bit] NOT NULL, 83 | [db_SQL_Server] [bit] NOT NULL, 84 | [db_SQLite] [bit] NOT NULL, 85 | [db_Solr] [bit] NOT NULL, 86 | [db_Splunk] [bit] NOT NULL, 87 | [db_Sybase] [bit] NOT NULL, 88 | [db_Teradata] [bit] NOT NULL, 89 | [db_dBase] [bit] NOT NULL, 90 | [edu_associate] [bit] NOT NULL, 91 | [edu_bachelor] [bit] NOT NULL, 92 | [edu_high_school] [bit] NOT NULL, 93 | [edu_master] [bit] NOT NULL, 94 | [edu_middle_school] [bit] NOT NULL, 95 | [edu_phd] [bit] NOT NULL, 96 | [english] [bit] NOT NULL, 97 | [experience_10] [bit] NOT NULL, 98 | [experience_1_3] [bit] NOT NULL, 99 | [experience_3_5] [bit] NOT NULL, 100 | [experience_5_10] [bit] NOT NULL, 101 | 
[experience_no] [bit] NOT NULL, 102 | [icu_996] [bit] NOT NULL, 103 | [industry_ads] [bit] NOT NULL, 104 | [industry_computer] [bit] NOT NULL, 105 | [industry_edu] [bit] NOT NULL, 106 | [industry_energy] [bit] NOT NULL, 107 | [industry_finance] [bit] NOT NULL, 108 | [industry_gov] [bit] NOT NULL, 109 | [industry_logistic] [bit] NOT NULL, 110 | [industry_medical] [bit] NOT NULL, 111 | [industry_realestate] [bit] NOT NULL, 112 | [industry_service] [bit] NOT NULL, 113 | [industry_trade] [bit] NOT NULL, 114 | [japanese] [bit] NOT NULL, 115 | [job_description] [varchar](max) NOT NULL, 116 | [job_id] [varchar](max) NOT NULL, 117 | [monthly_salary] [float] NOT NULL, 118 | [non_996] [bit] NOT NULL, 119 | [phone_android] [bit] NOT NULL, 120 | [phone_app] [bit] NOT NULL, 121 | [phone_iso] [bit] NOT NULL, 122 | [pl_c_sharp] [bit] NOT NULL, 123 | [pl_cpp] [bit] NOT NULL, 124 | [pl_delphi] [bit] NOT NULL, 125 | [pl_go] [bit] NOT NULL, 126 | [pl_haskell] [bit] NOT NULL, 127 | [pl_java] [bit] NOT NULL, 128 | [pl_javascript] [bit] NOT NULL, 129 | [pl_julia] [bit] NOT NULL, 130 | [pl_kotlin] [bit] NOT NULL, 131 | [pl_lua] [bit] NOT NULL, 132 | [pl_matlab] [bit] NOT NULL, 133 | [pl_objective_c] [bit] NOT NULL, 134 | [pl_perl] [bit] NOT NULL, 135 | [pl_php] [bit] NOT NULL, 136 | [pl_python] [bit] NOT NULL, 137 | [pl_ruby] [bit] NOT NULL, 138 | [pl_rust] [bit] NOT NULL, 139 | [pl_scrala] [bit] NOT NULL, 140 | [pl_swift] [bit] NOT NULL, 141 | [pl_typescript] [bit] NOT NULL, 142 | [pl_vba] [bit] NOT NULL, 143 | [pl_visual_basic] [bit] NOT NULL, 144 | [publish_date] [datetime] NOT NULL, 145 | [published_on_weekend] [bit] NOT NULL, 146 | [tag_baby_care] [bit] NOT NULL, 147 | [tag_five_insurance] [bit] NOT NULL, 148 | [tag_flexible] [bit] NOT NULL, 149 | [tag_no_overtime] [bit] NOT NULL, 150 | [tag_rest_one_day] [bit] NOT NULL, 151 | [tag_rest_two_days] [bit] NOT NULL, 152 | [tag_stock] [bit] NOT NULL, 153 | [title] [varchar](max) NOT NULL 154 | ) ON [PRIMARY] TEXTIMAGE_ON [PRIMARY] 155 | 
GO
156 |
157 |
158 |
--------------------------------------------------------------------------------
/sql/create_table_v2.sql:
--------------------------------------------------------------------------------
1 | USE [jobs]
2 | GO
3 | -- Creates the monthly table [dbo].[_201904] with a clustered primary key on job_id.
4 | /****** Object: Table [dbo].[_201904] Script Date: 4/28/2019 10:26:07 PM ******/
5 | SET ANSI_NULLS ON
6 | GO
7 |
8 | SET QUOTED_IDENTIFIER ON
9 | GO
10 |
11 | CREATE TABLE [dbo].[_201904](
12 |     [job_id] [varchar](20) NOT NULL,
13 |     [monthly_salary] [float] NOT NULL,
14 |     [headcount] [bigint] NOT NULL,
15 |     [title] [varchar](max) NOT NULL,
16 |     [career] [nvarchar](100) NOT NULL,
17 |     [city] [nvarchar](20) NOT NULL,
18 |     [province] [nvarchar](20) NOT NULL,
19 |     [company_description] [nvarchar](max) NOT NULL,
20 |     [company_size] [nvarchar](100) NOT NULL,
21 |     [company_title] [nvarchar](100) NOT NULL,
22 |     [company_type] [nvarchar](100) NOT NULL,
23 |     [ageism] [bit] NOT NULL,
24 |     [db_Apache_Hive] [bit] NOT NULL,
25 |     [db_CouchBase] [bit] NOT NULL,
26 |     [db_CouchDB] [bit] NOT NULL,
27 |     [db_DB2] [bit] NOT NULL,
28 |     [db_DynamoDB] [bit] NOT NULL,
29 |     [db_Elasticsearch] [bit] NOT NULL,
30 |     [db_FileMaker] [bit] NOT NULL,
31 |     [db_Firebase] [bit] NOT NULL,
32 |     [db_Firebird] [bit] NOT NULL,
33 |     [db_Hbase] [bit] NOT NULL,
34 |     [db_Informix] [bit] NOT NULL,
35 |     [db_Ingres] [bit] NOT NULL,
36 |     [db_MariaDB] [bit] NOT NULL,
37 |     [db_Memcached] [bit] NOT NULL,
38 |     [db_MongoDB] [bit] NOT NULL,
39 |     [db_MySQL] [bit] NOT NULL,
40 |     [db_Neo4j] [bit] NOT NULL,
41 |     [db_Netezza] [bit] NOT NULL,
42 |     [db_Oracle] [bit] NOT NULL,
43 |     [db_PostgreSQL] [bit] NOT NULL,
44 |     [db_Redis] [bit] NOT NULL,
45 |     [db_Riak] [bit] NOT NULL,
46 |     [db_SAP_HANA] [bit] NOT NULL,
47 |     [db_SQL_Server] [bit] NOT NULL,
48 |     [db_SQLite] [bit] NOT NULL,
49 |     [db_Solr] [bit] NOT NULL,
50 |     [db_Splunk] [bit] NOT NULL,
51 |     [db_Sybase] [bit] NOT NULL,
52 |     [db_Teradata] [bit] NOT NULL,
53 |     [db_dBase] [bit] NOT NULL,
54 |     [edu] [nvarchar](100) NOT NULL,
55 |     [english] [bit] NOT NULL,
56 |     [experience] [nvarchar](100) NOT NULL,
57 |     [expert_adas] [bit] NOT NULL,
58 |     [expert_blockchain] [bit] NOT NULL,
59 |     [expert_embed] [bit] NOT NULL,
60 |     [expert_expert] [bit] NOT NULL,
61 |     [expert_gis] [bit] NOT NULL,
62 |     [_996_yes] [bit] NOT NULL,
63 |     [_996_no] [bit] NOT NULL,
64 |     [industry] [nvarchar](100) NOT NULL,
65 |     [japanese] [bit] NOT NULL,
66 |     [job_description] [nvarchar](max) NOT NULL,
67 |     [job_summary] [nvarchar](100) NOT NULL,
68 |     [job_tags] [nvarchar](100) NOT NULL,
69 |     [phone_android] [bit] NOT NULL,
70 |     [phone_app] [bit] NOT NULL,
71 |     [phone_iso] [bit] NOT NULL,
72 |     [pl_c_sharp] [bit] NOT NULL,
73 |     [pl_cpp] [bit] NOT NULL,
74 |     [pl_delphi] [bit] NOT NULL,
75 |     [pl_go] [bit] NOT NULL,
76 |     [pl_haskell] [bit] NOT NULL,
77 |     [pl_java] [bit] NOT NULL,
78 |     [pl_javascript] [bit] NOT NULL,
79 |     [pl_julia] [bit] NOT NULL,
80 |     [pl_kotlin] [bit] NOT NULL,
81 |     [pl_lua] [bit] NOT NULL,
82 |     [pl_matlab] [bit] NOT NULL,
83 |     [pl_objective_c] [bit] NOT NULL,
84 |     [pl_perl] [bit] NOT NULL,
85 |     [pl_php] [bit] NOT NULL,
86 |     [pl_python] [bit] NOT NULL,
87 |     [pl_ruby] [bit] NOT NULL,
88 |     [pl_rust] [bit] NOT NULL,
89 |     [pl_scrala] [bit] NOT NULL, -- NOTE(review): spelled [pl_scala] in create_table_v3.sql; name kept to avoid a schema change
90 |     [pl_swift] [bit] NOT NULL,
91 |     [pl_typescript] [bit] NOT NULL,
92 |     [pl_vba] [bit] NOT NULL,
93 |     [pl_visual_basic] [bit] NOT NULL,
94 |     [publish_date] [datetime] NOT NULL,
95 |     [published_on_weekend] [bit] NOT NULL,
96 |     [tag_baby_care] [bit] NOT NULL,
97 |     [tag_five_insurance] [bit] NOT NULL,
98 |     [tag_flexible] [bit] NOT NULL,
99 |     [tag_no_overtime] [bit] NOT NULL,
100 |     [tag_rest_one_day] [bit] NOT NULL,
101 |     [tag_rest_two_days] [bit] NOT NULL,
102 |     [tag_stock] [bit] NOT NULL,
103 |     [ml_tensorflow] [bit] NOT NULL,
104 |     [ml_caffe] [bit] NOT NULL,
105 |     [ml_cntk] [bit] NOT NULL,
106 |     [ml_chainer] [bit] NOT NULL,
107 |     [ml_mxnet] [bit] NOT NULL,
108 |     [ml_keras] [bit] NOT NULL,
109 |     [ml_deeplearning4j] [bit] NOT NULL,
110 |     [ml_theano] [bit] NOT NULL,
111 |     [ml_sklearn] [bit] NOT NULL,
112 |     [ml_mahout] [bit] NOT NULL,
113 |     [ml_paddlepaddle] [bit] NOT NULL,
114 |     -- table-level primary key; the comma after the last column separates it from the column list
115 |     CONSTRAINT [PK__201904v22] PRIMARY KEY CLUSTERED
116 |     (
117 |         [job_id] ASC
118 |     )WITH (PAD_INDEX = OFF, STATISTICS_NORECOMPUTE = OFF, IGNORE_DUP_KEY = OFF, ALLOW_ROW_LOCKS = ON, ALLOW_PAGE_LOCKS = ON) ON [PRIMARY]
119 | ) ON [PRIMARY] TEXTIMAGE_ON [PRIMARY]
120 | GO
121 |
122 |
--------------------------------------------------------------------------------
/sql/create_table_v3.sql:
--------------------------------------------------------------------------------
1 | USE [it_jobs]
2 | GO
3 | -- Creates [dbo].[jobs] keyed by (job_id, yearmonth), then adds DEFAULT 0 constraints to the listed bit columns.
4 | /****** Object: Table [dbo].[jobs] Script Date: 6/28/2020 9:23:04 PM ******/
5 | SET ANSI_NULLS ON
6 | GO
7 |
8 | SET QUOTED_IDENTIFIER ON
9 | GO
10 |
11 | CREATE TABLE [dbo].[jobs](
12 |     [job_id] [varchar](20) NOT NULL,
13 |     [yearmonth] [int] NOT NULL,
14 |     [monthly_salary] [int] NOT NULL,
15 |     [headcount] [bigint] NOT NULL,
16 |     [title] [nvarchar](max) NOT NULL,
17 |     [page_title] [nvarchar](max) NOT NULL,
18 |     [zhinengleibie] [nvarchar](100) NOT NULL,
19 |     [career] [nvarchar](100) NOT NULL,
20 |     [city] [nvarchar](20) NOT NULL,
21 |     [province] [nvarchar](20) NOT NULL,
22 |     [company_id] [nvarchar](max) NOT NULL,
23 |     [ageism] [bit] NOT NULL,
24 |     [db_Apache_Hive] [bit] NOT NULL,
25 |     [db_CouchBase] [bit] NOT NULL,
26 |     [db_CouchDB] [bit] NOT NULL,
27 |     [db_DB2] [bit] NOT NULL,
28 |     [db_DynamoDB] [bit] NOT NULL,
29 |     [db_Elasticsearch] [bit] NOT NULL,
30 |     [db_FileMaker] [bit] NOT NULL,
31 |     [db_Firebase] [bit] NOT NULL,
32 |     [db_Firebird] [bit] NOT NULL,
33 |     [db_Hbase] [bit] NOT NULL,
34 |     [db_Informix] [bit] NOT NULL,
35 |     [db_Ingres] [bit] NOT NULL,
36 |     [db_MariaDB] [bit] NOT NULL,
37 |     [db_Memcached] [bit] NOT NULL,
38 |     [db_MongoDB] [bit] NOT NULL,
39 |     [db_MySQL] [bit] NOT NULL,
40 |     [db_Neo4j] [bit] NOT NULL,
41 |     [db_Netezza] [bit] NOT NULL,
42 |     [db_Oracle] [bit] NOT NULL,
43 |     [db_PostgreSQL] [bit] NOT NULL,
44 |     [db_Redis] [bit] NOT NULL,
45 |     [db_Riak] [bit] NOT NULL,
46 |     [db_SAP_HANA] [bit] NOT NULL,
47 |     [db_SQL_Server] [bit] NOT NULL,
48 |     [db_SQLite] [bit] NOT NULL,
49 |     [db_Solr] [bit] NOT NULL,
50 |     [db_Splunk] [bit] NOT NULL,
51 |     [db_Sybase] [bit] NOT NULL,
52 |     [db_Teradata] [bit] NOT NULL,
53 |     [db_dBase] [bit] NOT NULL,
54 |     [edu] [nvarchar](100) NOT NULL,
55 |     [experience] [nvarchar](100) NOT NULL,
56 |     [expert_adas] [bit] NOT NULL,
57 |     [expert_blockchain] [bit] NOT NULL,
58 |     [expert_embed] [bit] NOT NULL,
59 |     [expert_expert] [bit] NOT NULL,
60 |     [expert_gis] [bit] NOT NULL,
61 |     [_996_yes] [bit] NOT NULL,
62 |     [_996_no] [bit] NOT NULL,
63 |     [lang_english] [bit] NOT NULL,
64 |     [lang_japanese] [bit] NOT NULL,
65 |     [job_description] [nvarchar](max) NOT NULL,
66 |     [job_summary] [nvarchar](100) NOT NULL,
67 |     [job_tags] [nvarchar](100) NOT NULL,
68 |     [phone_android] [bit] NOT NULL,
69 |     [phone_app] [bit] NOT NULL,
70 |     [phone_iso] [bit] NOT NULL,
71 |     [pl_c_sharp] [bit] NOT NULL,
72 |     [pl_cpp] [bit] NOT NULL,
73 |     [pl_delphi] [bit] NOT NULL,
74 |     [pl_go] [bit] NOT NULL,
75 |     [pl_haskell] [bit] NOT NULL,
76 |     [pl_java] [bit] NOT NULL,
77 |     [pl_javascript] [bit] NOT NULL,
78 |     [pl_julia] [bit] NOT NULL,
79 |     [pl_kotlin] [bit] NOT NULL,
80 |     [pl_lua] [bit] NOT NULL,
81 |     [pl_matlab] [bit] NOT NULL,
82 |     [pl_objective_c] [bit] NOT NULL,
83 |     [pl_perl] [bit] NOT NULL,
84 |     [pl_php] [bit] NOT NULL,
85 |     [pl_python] [bit] NOT NULL,
86 |     [pl_ruby] [bit] NOT NULL,
87 |     [pl_rust] [bit] NOT NULL,
88 |     [pl_swift] [bit] NOT NULL,
89 |     [pl_typescript] [bit] NOT NULL,
90 |     [pl_vba] [bit] NOT NULL,
91 |     [pl_visual_basic] [bit] NOT NULL,
92 |     [pl_r] [bit] NOT NULL,
93 |     [pl_scala] [bit] NOT NULL,
94 |     [publish_date] [datetime] NOT NULL,
95 |     [published_on_weekend] [bit] NOT NULL,
96 |     [tag_baby_care] [bit] NOT NULL,
97 |     [tag_five_insurance] [bit] NOT NULL,
98 |     [tag_flexible] [bit] NOT NULL,
99 |     [tag_no_overtime] [bit] NOT NULL,
100 |     [tag_rest_one_day] [bit] NOT NULL,
101 |     [tag_rest_two_days] [bit] NOT NULL,
102 |     [tag_stock] [bit] NOT NULL,
103 |     [ml_tensorflow] [bit] NOT NULL,
104 |     [ml_caffe] [bit] NOT NULL,
105 |     [ml_cntk] [bit] NOT NULL,
106 |     [ml_chainer] [bit] NOT NULL,
107 |     [ml_mxnet] [bit] NOT NULL,
108 |     [ml_keras] [bit] NOT NULL,
109 |     [ml_deeplearning4j] [bit] NOT NULL,
110 |     [ml_theano] [bit] NOT NULL,
111 |     [ml_sklearn] [bit] NOT NULL,
112 |     [ml_mahout] [bit] NOT NULL,
113 |     [ml_paddlepaddle] [bit] NOT NULL,
114 |     [bd_hadoop] [bit] NOT NULL,
115 |     [bd_spark] [bit] NOT NULL,
116 |     [bd_hive] [bit] NOT NULL,
117 |     [bd_mapReduce] [bit] NOT NULL,
118 |     [bd_kafka] [bit] NOT NULL,
119 |     [bd_hbase] [bit] NOT NULL,
120 |     [bd_storm] [bit] NOT NULL,
121 |     [bd_pig] [bit] NOT NULL,
122 |     [bd_mahout] [bit] NOT NULL,
123 |     [bd_impala] [bit] NOT NULL,
124 |     [bd_yarn] [bit] NOT NULL,
125 |     [bd_alluxio] [bit] NOT NULL,
126 |     [bd_flink] [bit] NOT NULL,
127 |     [bd_presto] [bit] NOT NULL,
128 |     [bd_heron] [bit] NOT NULL,
129 |     CONSTRAINT [PK_jobs] PRIMARY KEY CLUSTERED
130 |     (
131 |         [job_id] ASC,
132 |         [yearmonth] ASC
133 |     )WITH (PAD_INDEX = OFF, STATISTICS_NORECOMPUTE = OFF, IGNORE_DUP_KEY = OFF, ALLOW_ROW_LOCKS = ON, ALLOW_PAGE_LOCKS = ON, OPTIMIZE_FOR_SEQUENTIAL_KEY = OFF) ON [PRIMARY]
134 | ) ON [PRIMARY] TEXTIMAGE_ON [PRIMARY]
135 | GO
136 |
137 | ALTER TABLE [dbo].[jobs] ADD CONSTRAINT [DF_jobs_expert_adas] DEFAULT ((0)) FOR [expert_adas]
138 | GO
139 |
140 | ALTER TABLE [dbo].[jobs] ADD CONSTRAINT [DF_jobs_expert_blockchain] DEFAULT ((0)) FOR [expert_blockchain]
141 | GO
142 |
143 | ALTER TABLE [dbo].[jobs] ADD CONSTRAINT [DF_jobs_expert_embed] DEFAULT ((0)) FOR [expert_embed]
144 | GO
145 |
146 | ALTER TABLE [dbo].[jobs] ADD CONSTRAINT [DF_jobs_expert_expert] DEFAULT ((0)) FOR [expert_expert]
147 | GO
148 |
149 | ALTER TABLE [dbo].[jobs] ADD CONSTRAINT [DF_jobs_expert_gis] DEFAULT ((0)) FOR [expert_gis]
150 | GO
151 |
152 | ALTER TABLE [dbo].[jobs] ADD CONSTRAINT [DF_jobs_pl_r] DEFAULT ((0)) FOR [pl_r]
153 | GO
154 |
155 | ALTER TABLE [dbo].[jobs] ADD CONSTRAINT [DF_jobs_pl_scala] DEFAULT ((0)) FOR [pl_scala]
156 | GO
157 |
158 | ALTER TABLE [dbo].[jobs] ADD CONSTRAINT [DF_jobs_ml_tensorflow] DEFAULT ((0)) FOR [ml_tensorflow]
159 | GO
160 |
161 | ALTER TABLE [dbo].[jobs] ADD CONSTRAINT [DF_jobs_ml_caffe] DEFAULT ((0)) FOR [ml_caffe]
162 | GO
163 |
164 | ALTER TABLE [dbo].[jobs] ADD CONSTRAINT [DF_jobs_ml_cntk] DEFAULT ((0)) FOR [ml_cntk]
165 | GO
166 |
167 | ALTER TABLE [dbo].[jobs] ADD CONSTRAINT [DF_jobs_ml_chainer] DEFAULT ((0)) FOR [ml_chainer]
168 | GO
169 |
170 | ALTER TABLE [dbo].[jobs] ADD CONSTRAINT [DF_jobs_ml_mxnet] DEFAULT ((0)) FOR [ml_mxnet]
171 | GO
172 |
173 | ALTER TABLE [dbo].[jobs] ADD CONSTRAINT [DF_jobs_ml_keras] DEFAULT ((0)) FOR [ml_keras]
174 | GO
175 |
176 | ALTER TABLE [dbo].[jobs] ADD CONSTRAINT [DF_jobs_ml_deeplearning4j] DEFAULT ((0)) FOR [ml_deeplearning4j]
177 | GO
178 |
179 | ALTER TABLE [dbo].[jobs] ADD CONSTRAINT [DF_jobs_ml_theano] DEFAULT ((0)) FOR [ml_theano]
180 | GO
181 |
182 | ALTER TABLE [dbo].[jobs] ADD CONSTRAINT [DF_jobs_ml_sklearn] DEFAULT ((0)) FOR [ml_sklearn]
183 | GO
184 |
185 | ALTER TABLE [dbo].[jobs] ADD CONSTRAINT [DF_jobs_ml_mahout] DEFAULT ((0)) FOR [ml_mahout]
186 | GO
187 |
188 | ALTER TABLE [dbo].[jobs] ADD CONSTRAINT [DF_jobs_ml_paddlepaddle] DEFAULT ((0)) FOR [ml_paddlepaddle]
189 | GO
190 |
191 | ALTER TABLE [dbo].[jobs] ADD CONSTRAINT [DF_jobs_bd_hadoop] DEFAULT ((0)) FOR [bd_hadoop]
192 | GO
193 |
194 | ALTER TABLE [dbo].[jobs] ADD CONSTRAINT [DF_jobs_bd_spark] DEFAULT ((0)) FOR [bd_spark]
195 | GO
196 |
197 | ALTER TABLE [dbo].[jobs] ADD CONSTRAINT [DF_jobs_bd_hive] DEFAULT ((0)) FOR [bd_hive]
198 | GO
199 |
200 | ALTER TABLE [dbo].[jobs] ADD CONSTRAINT [DF_jobs_bd_mapReduce] DEFAULT ((0)) FOR [bd_mapReduce]
201 | GO
202 |
203 | ALTER TABLE [dbo].[jobs] ADD CONSTRAINT [DF_jobs_bd_kafka] DEFAULT ((0)) FOR [bd_kafka]
204 | GO
205 |
206 | ALTER TABLE [dbo].[jobs] ADD CONSTRAINT [DF_jobs_bd_hbase] DEFAULT ((0)) FOR [bd_hbase]
207 | GO
208 |
209 | ALTER TABLE [dbo].[jobs] ADD CONSTRAINT [DF_jobs_bd_storm] DEFAULT ((0)) FOR [bd_storm]
210 | GO
211 |
212 | ALTER TABLE [dbo].[jobs] ADD CONSTRAINT [DF_jobs_bd_pig] DEFAULT ((0)) FOR [bd_pig]
213 | GO
214 |
215 | ALTER TABLE [dbo].[jobs] ADD CONSTRAINT [DF_jobs_bd_mahout] DEFAULT ((0)) FOR [bd_mahout]
216 | GO
217 |
218 | ALTER TABLE [dbo].[jobs] ADD CONSTRAINT [DF_jobs_bd_impala] DEFAULT ((0)) FOR [bd_impala]
219 | GO
220 |
221 | ALTER TABLE [dbo].[jobs] ADD CONSTRAINT [DF_jobs_bd_yarn] DEFAULT ((0)) FOR [bd_yarn]
222 | GO
223 |
224 | ALTER TABLE [dbo].[jobs] ADD CONSTRAINT [DF_jobs_bd_alluxio] DEFAULT ((0)) FOR [bd_alluxio]
225 | GO
226 |
227 | ALTER TABLE [dbo].[jobs] ADD CONSTRAINT [DF_jobs_bd_flink] DEFAULT ((0)) FOR [bd_flink]
228 | GO
229 |
230 | ALTER TABLE [dbo].[jobs] ADD CONSTRAINT [DF_jobs_bd_presto] DEFAULT ((0)) FOR [bd_presto]
231 | GO
232 |
233 | ALTER TABLE [dbo].[jobs] ADD CONSTRAINT [DF_jobs_bd_heron] DEFAULT ((0)) FOR [bd_heron]
234 | GO
235 |
236 |
--------------------------------------------------------------------------------
/sql/feature_engineering.sql:
--------------------------------------------------------------------------------
1 | --career: row clean-up and career / feature tagging on the jobs table; the month is hard-coded as 202101 in every statement
2 | --delete from jobs where year_month=202101 and monthly_salary>0 and monthly_salary<3000
3 | delete from jobs where year_month=202101 and title like '%赴日%' and not title like '%机会%'
4 | --delete from jobs where year_month=202101 and monthly_salary>0 and monthly_salary<1000
5 | -- NOTE(review): create_table_v3.sql declares [yearmonth] and has no [ml_pytorch] column; confirm the live table really has year_month / ml_pytorch
6 | delete [jobs] where year_month=202101 and title like '%技工%'
7 | delete [jobs] where year_month=202101 and title like '%技术员%'
8 | delete [jobs] where year_month=202101 and city ='杭州' and title like '00%(职位编号:%)'
9 | delete [jobs] where year_month=202101 and province ='异地招聘'
10 | delete [jobs] where year_month=202101 and job_summary like '%应届生%'
11 | delete [jobs] where year_month=202101 and title like '%应届%'
12 | delete [jobs] where year_month=202101 and title like '%校招%'
13 | delete [jobs] where year_month=202101 and title like '%校园招聘%'
14 | -- career labelling: statements run in order, so a later matching update overwrites an earlier label
15 | update jobs set career='软件工程师' where year_month=202101 and zhinengleibie in ('软件工程师', '高级软件工程师', 'PHP开发工程师', 'Java开发工程师', 'C开发工程师', 'Python开发工程师', '.NET开发工程师', '脚本开发工程师', 'Ruby开发工程师', 'Go开发工程师')
16 | update jobs set career='软件工程师' where year_month=202101 and career='一般程序员'
17 | update jobs set career='Android开发工程师' where year_month=202101 and (title like '%Android%' or title like '%安卓%')
18 | -- OR alternatives are parenthesised throughout: AND binds tighter than OR, so without the parentheses the extra branches would ignore the month filter
19 | update jobs set career='信号处理' where year_month=202101 and title like '%信号处理%'
20 | update jobs set career='爬虫开发工程师' where year_month=202101 and title like '%爬虫%'
21 | update jobs set career='ADAS' where year_month=202101 and title like '%adas%'
22 | update jobs set career='机器人' where year_month=202101 and (title like '%机器人%' or title like '%ROS%')
23 | update jobs set career='GIS' where year_month=202101 and title like '%GIS%'
24 | update jobs set career='CAE' where year_month=202101 and title like '%CAE%'
25 | update jobs set career='光学算法' where year_month=202101 and title like '%光学算法工程师%'
26 | update jobs set career='ETL' where year_month=202101 and title like '%ETL%'
27 | update jobs set career='Unity3D' where year_month=202101 and title like '%Unity3D%'
28 | update jobs set career='遥感' where year_month=202101 and title like '%遥感%'
29 | update jobs set career='规划算法' where year_month=202101 and title like '%规划算法工程师%'
30 | update jobs set career='视觉软件工程师' where year_month=202101 and title like '%三维重建%'
31 | update jobs set career='视觉软件工程师' where year_month=202101 and title like '%视觉软件工程师%'
32 |
33 |
34 | update jobs set career='大数据' where year_month=202101 and title like '%大数据%'
35 | update jobs set career='CT重建' where year_month=202101 and title like '%CT重建%'
36 | update jobs set career='SLAM' where year_month=202101 and title like '%SLAM%'
37 | update jobs set career='DSP' where year_month=202101 and title like '%DSP%'
38 | update jobs set career='生物信息' where year_month=202101 and title like '%生物信息%'
39 | update jobs set career='编译器开发工程师' where year_month=202101 and title like '%编译器%'
40 | update jobs set career='算法工程师' where year_month=202101 and (title like '%算法%' or zhinengleibie='算法工程师')
41 | update jobs set career='自然语言处理(NLP)' where year_month=202101 and (title like '%自然语言处理%' or title like '%NLP%')
42 |
43 | delete from jobs where year_month=202101 and zhinengleibie='推荐算法工程师' and not title like '%推荐%'
44 | update jobs set career='推荐算法工程师' where year_month=202101 and title like '%推荐算法%'
45 |
46 | delete from jobs where year_month=202101 and zhinengleibie='搜索算法工程师' and not title like '%搜索%'
47 | update jobs set career='搜索算法工程师' where year_month=202101 and (title like '%搜索算法%' or title like '%Search Algorithm%')
48 | update jobs set career='反作弊算法工程师' where year_month=202101 and title like '%反作弊%'
49 |
50 | update jobs set career='图像处理工程师' where year_month=202101 and title like '%图像处理%'
51 | update jobs set career='图像算法工程师' where year_month=202101 and (title like '%图像算法%' or zhinengleibie='图像算法工程师')
52 | update jobs set career='人工智能' where year_month=202101 and (title like '%AI%' or title like '%人工智能%' or title like '%神经网络%')
53 | update jobs set career='区块链开发' where year_month=202101 and (title like '%区块链%' or zhinengleibie='区块链开发')
54 | update jobs set career='CTO' where year_month=202101 and (title like '%CTO%' or title like '%首席技术官%' or title like '%智慧研究院院长%')
55 | update jobs set career='芯片' where year_month=202101 and (title like '%芯片%' or title like '%SOC设计%')
56 | update jobs set career='驱动工程师' where year_month=202101 and (title like '%driver%' or title like '%驱动%')
57 | update jobs set career='机器学习' where year_month=202101 and (title like '%机器学习%' or zhinengleibie='机器学习工程师')
58 | update jobs set career='深度学习工程师' where year_month=202101 and title like '%深度学习%'
59 | update jobs set career='数据科学家' where year_month=202101 and (title like '%Data Scientist%' or title like '%数据科学家%')
60 |
61 |
62 | update jobs set career='架构师' where year_month=202101 and (title like '%系统架构师%' or title like '%架构师%' or title like '%架构专家%' or title like '%architect%' or title like '%架构研发%')
63 | update jobs set career='技术主管' where year_month=202101 and (title like '%主管%' or title like '%leader%')
64 |
65 | update jobs set career='分布式' where year_month=202101 and career='软件工程师' and title like '%分布式%'
66 |
67 | update jobs set career='敏捷教练' where year_month=202101 and (title like '%敏捷教练%' or title like '%agile coach%' or title like '%Scrum Master%')
68 |
69 | update jobs set career='Cocos2d-x开发工程师' where year_month=202101 and career='软件工程师' and title like '%Cocos2d-x%'
70 |
71 | update jobs set career='MES' where year_month=202101 and career='软件工程师' and title like '%MES%'
72 |
73 | update jobs set career='Hadoop工程师' where year_month=202101 and title like '%Hadoop%'
74 |
75 | update jobs set career='嵌入式软件开发' where year_month=202101 and (title like '%嵌入式%' or title like '%FPGA%')
76 |
77 | delete from jobs where year_month=202101 and career='人工智能' and not title like '%人工智能%'
78 |
79 | -- set the ageism flag when the description contains '岁' (years of age)
80 | update jobs set ageism=1 where year_month=202101 and job_description like '%岁%'
81 | -- set ml_* flags from framework names found in the job description
82 | update jobs set ml_paddlepaddle=1 where year_month=202101 and job_description like '%paddlepaddle%'
83 | update jobs set ml_mahout=1 where year_month=202101 and job_description like '%mahout%'
84 | update jobs set ml_sklearn=1 where year_month=202101 and (job_description like '%scikit-learn%' or job_description like '%scikitlearn%' or job_description like '%sklearn%')
85 | update jobs set ml_theano=1 where year_month=202101 and job_description like '%theano%'
86 | update jobs set ml_keras=1 where year_month=202101 and job_description like '%keras%'
87 | update jobs set ml_mxnet=1 where year_month=202101 and job_description like '%mxnet%'
88 | update jobs set ml_cntk=1 where year_month=202101 and job_description like '%cntk%'
89 | update jobs set ml_caffe=1 where year_month=202101 and job_description like '%caffe%'
90 | update jobs set ml_tensorflow=1 where year_month=202101 and job_description like '%tensorflow%'
91 | update jobs set ml_pytorch=1 where year_month=202101 and job_description like '%pytorch%'
92 |
93 |
94 |
--------------------------------------------------------------------------------
/sql/update.sql:
--------------------------------------------------------------------------------
1 | --update _51jobs set career_software_engineer=0 where career_algorithm=1 or career_architect=1
2 | --convert the salary to units of 10,000 yuan
3 | --update _51jobs set monthly_salary=monthly_salary/10000
4 |
5 |
6 | --R language statistics; keywords: R语言 "R Studio" R编程 '%,R,%''%,R,%'
7 | --update _51jobs set pl_r=1 where job_description like '%、R、%'
8 | --or job_description like '%,R,%'
9 | --or job_description like '%,R,%'
10 | --or job_description like '%R语言%'
11 | --or job_description like '%R Studio%'
12 | --or job_description like '%R编程%'
13 | --or job_description like '%R语言%'
14 | --vb.net
15 | --update _51jobs set pl_visual_basic_net=1 where job_description like '%vb.net%'
16 | --or job_description like '%visual basic.net%'
17 | --select COUNT(1) from _51jobs where job_description like '%vb.net%'
18 | --select COUNT(1) from _51jobs where job_description like '%Vb.net%'
19 | --Groovy
20 |
21 | --update _51jobs set pl_groovy=1 where job_description like '%groovy%'
22 | --87
23 | --
24 | --update _51jobs set pl_scala=1 where job_description like '%scala%'
25 | --(1639 rows affected)
26 | --Assembly language (keyword: 汇编)
27 | --update _51jobs set pl_assembly=1 where job_description like '%Assembly language%' or job_description like '%汇编%'
28 | --(1147 rows affected)
29 | --Linux Linux CentOS Ubuntu redhat
30 |
31 | --select * from _201904 where title like '%爬虫%'
32 | --ALTER TABLE _201904 ADD career_spider bit DEFAULT 0 NOT NULL;
33 | --update _201904 set career_spider=1 where title like '%爬虫%'
34 | --update _201904 set career_software_engineer=0 where career_spider=1
35 |
36 |
37 | --update _201904 set city='zhengzhou' where city_zhengzhou=1
38 |
39 | --update _201904 set career='algorithm' where career_algorithm=1
40 | --update _201904 set career='architect' where career_architect=1
41 | --update _201904 set career='software' where career_software_engineer=1
42 | --update _201904 set career='spider' where career_spider=1
43 |
44 |
45 |
--------------------------------------------------------------------------------
/sql/update_ml.sql:
--------------------------------------------------------------------------------
1 |
2 | --ALTER TABLE _201904 ADD ml_tensorflow bit DEFAULT 0 NOT NULL;
3 | --
4 |
5 | --ALTER TABLE _201904 ADD ml_caffe bit DEFAULT 0 NOT NULL;
6 | --
7 |
8 | --ALTER TABLE _201904 ADD ml_cntk bit DEFAULT 0 NOT NULL;
9 | --
10 |
11 | --ALTER TABLE _201904 ADD ml_chainer bit DEFAULT 0 NOT NULL;
12 | --
13 |
14 | --ALTER TABLE _201904 ADD ml_mxnet bit DEFAULT 0 NOT NULL;
15 | --
16 |
17 | --ALTER TABLE _201904 ADD ml_keras bit DEFAULT 0 NOT NULL;
18 | --
19 |
20 | --ALTER TABLE _201904 ADD ml_deeplearning4j bit DEFAULT 0 NOT NULL;
21 | --
22 |
23 | --ALTER TABLE _201904 ADD ml_theano bit DEFAULT 0 NOT NULL;
24 | --
25 |
26 | --ALTER TABLE _201904 ADD ml_sklearn bit DEFAULT 0 NOT NULL;
27 | --
28 |
29 | --ALTER TABLE _201904 ADD ml_mahout bit DEFAULT 0 NOT NULL;
30 | --
31 |
32 | --ALTER TABLE _201904 ADD ml_paddlepaddle bit DEFAULT 0 NOT NULL;
33 | --
34 |
35 |
--------------------------------------------------------------------------------
/sql/update_v2.sql:
--------------------------------------------------------------------------------
1 | --update _201904v2 set career = '系统架构师' where career='架构设计师'
2 |
3 | select * from _201904v2 where career like '%爬虫%'
4 |
5 |
6 | update _201905 set career='爬虫工程师' where title like '%爬虫%'
7 | update _201905 set career='生物信息工程师' where title like '%生物信息%'
8 |
9 |
10 | update _201903 set expert_blockchain=1 where title like '%blockchain%'
11 | update _201903 set expert_blockchain=1 where title like '%区块链%'
--------------------------------------------------------------------------------