├── .gitattributes ├── .gitignore ├── .landscape.yml ├── .travis.yml ├── LICENSE ├── Makefile ├── README.md ├── job-chart.jpg ├── requirements.txt ├── setup.cfg ├── setup.py ├── tests ├── __init__.py ├── base.py ├── fixture │ ├── city.sql │ ├── company.sql │ ├── industry.sql │ ├── job.sql │ ├── job_keyword.sql │ ├── jobs_count.sql │ ├── keyword.sql │ └── keyword_statistic.sql ├── schema.sql ├── test_controllers │ ├── __init__.py │ ├── test_city_ctl.py │ ├── test_industry_ctl.py │ ├── test_job_ctl.py │ ├── test_job_keyword_ctl.py │ ├── test_keyword_ctl.py │ └── test_keyword_statistic_ctl.py ├── test_models │ └── test_job.py ├── test_utils │ ├── test_cache.py │ ├── test_classproperty.py │ ├── test_common.py │ ├── test_convert.py │ ├── test_http_tools.py │ ├── test_pagination.py │ ├── test_text.py │ └── test_time_tools.py ├── test_web │ ├── base.py │ ├── test_formatter.py │ └── test_keyword_statistic.py └── util.py └── webspider ├── __init__.py ├── constants.py ├── controllers ├── __init__.py ├── city_ctl.py ├── industry_ctl.py ├── job_ctl.py ├── job_keyword_ctl.py ├── keyword_ctl.py └── keyword_statistic_ctl.py ├── crawlers ├── __init__.py ├── lagou_cites.py ├── lagou_companies.py ├── lagou_jobs.py └── lagou_jobs_count.py ├── exceptions.py ├── models ├── __init__.py ├── base.py ├── city.py ├── company.py ├── company_industry.py ├── industry.py ├── job.py ├── job_keyword.py ├── jobs_count.py ├── keyword.py └── keyword_statistic.py ├── quickly_cmd.py ├── setting.py ├── tasks ├── __init__.py ├── actor │ ├── __init__.py │ ├── keyword_statistic.py │ ├── lagou_data.py │ └── lagou_jobs_count.py ├── celery_app.py └── celery_config.py ├── utils ├── __init__.py ├── cache.py ├── classproperty.py ├── common.py ├── convert.py ├── http_tools.py ├── log.py ├── pagination.py ├── sql.py ├── text.py └── time_tools.py └── web ├── __init__.py ├── app.py ├── formatter ├── __init__.py ├── base.py ├── jobs_count.py └── keyword_statistic.py ├── handlers ├── __init__.py ├── base.py └── keyword_statistics.py ├── static ├── __init__.py ├── bootstrap │ ├── css │ │ ├── bootstrap-theme.css │ │ ├── bootstrap-theme.css.map │ │ ├── bootstrap-theme.min.css │ │ ├── bootstrap-theme.min.css.map │ │ ├── bootstrap.css │ │ ├── bootstrap.css.map │ │ ├── bootstrap.min.css │ │ └── bootstrap.min.css.map │ ├── fonts │ │ ├── glyphicons-halflings-regular.eot │ │ ├── glyphicons-halflings-regular.svg │ │ ├── glyphicons-halflings-regular.ttf │ │ ├── glyphicons-halflings-regular.woff │ │ └── glyphicons-halflings-regular.woff2 │ └── js │ │ ├── bootstrap.js │ │ ├── bootstrap.min.js │ │ └── npm.js ├── css │ └── mystyle.css ├── img │ └── favicon.ico └── js │ ├── echarts.js │ ├── echarts.min.js │ └── jquery.min.js ├── templates ├── 404.html ├── 500.html ├── base.html ├── city-jobs-count-chart-module.html ├── education-chart-module.html ├── finance-stage-chart-module.html ├── pagination-module.html ├── per-day-jobs-count-chart-module.html ├── salary-chart-module.html ├── statistics.html └── work-year-chart-module.html └── urls.py /.gitattributes: -------------------------------------------------------------------------------- 1 | *.js linguist-language=python 2 | *.css linguist-language=python 3 | *.html linguist-language=python 4 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution 
/ packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | 27 | # PyInstaller 28 | # Usually these files are written by a python script from a template 29 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 30 | *.manifest 31 | *.spec 32 | 33 | # Installer logs 34 | pip-log.txt 35 | pip-delete-this-directory.txt 36 | 37 | # Unit test / coverage reports 38 | htmlcov/ 39 | .tox/ 40 | .coverage 41 | .coverage.* 42 | .cache 43 | nosetests.xml 44 | coverage.xml 45 | *,cover 46 | .hypothesis/ 47 | 48 | # Translations 49 | *.mo 50 | *.pot 51 | 52 | # Django stuff: 53 | *.log 54 | local_settings.py 55 | 56 | # Flask stuff: 57 | instance/ 58 | .webassets-cache 59 | 60 | # Scrapy stuff: 61 | .scrapy 62 | 63 | # Sphinx documentation 64 | docs/_build/ 65 | 66 | # PyBuilder 67 | target/ 68 | 69 | # IPython Notebook 70 | .ipynb_checkpoints 71 | 72 | # pyenv 73 | .python-version 74 | 75 | # celery beat schedule file 76 | celerybeat-schedule 77 | 78 | # dotenv 79 | .env 80 | 81 | # virtualenv 82 | venv/ 83 | ENV/ 84 | 85 | # Spyder project settings 86 | .spyderproject 87 | 88 | # Rope project settings 89 | .ropeproject 90 | .idea/ 91 | bin/ 92 | spider_log.txt 93 | dump.rdb 94 | .DS_Store 95 | cover/ 96 | celerybeat.pid 97 | oj.py 98 | /webspider/log 99 | /webspider/security_constants.py 100 | celerybeat-schedule 101 | cove 102 | nohup.out 103 | -------------------------------------------------------------------------------- /.landscape.yml: -------------------------------------------------------------------------------- 1 | autodetect: yes 2 | test-warnings: true 3 | doc-warnings: true 4 | strictness: veryhigh 5 | max-line-length: 120 6 | python-targets: 3 7 | 8 | uses: 9 | - celery 10 | 11 | ignore-paths: 12 | - .git 13 | - coverage 14 | - env 15 | - test 16 | - webspider/web/templates 17 | - webspider/web/static 18 | 19 | pep8: 20 | run: true 21 | disable: 22 | - W291 23 | - E501 24 | 25 | pyflakes: 26 | run: true 27 | 28 | inherits: [flake8] 29 | 30 | requirements: 31 | - requirements.txt 32 | 33 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | 3 | sudo: false 4 | 5 | python: 6 | - "3.6" 7 | 8 | services: 9 | - mysql 10 | - redis-server 11 | 12 | before_install: 13 | - mysql -e 'CREATE DATABASE spider CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci;' 14 | 15 | install: 16 | - make 17 | 18 | script: 19 | - make test 20 | 21 | after_success: 22 | - env/bin/codecov -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2017 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the 
Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 |
-------------------------------------------------------------------------------- /Makefile: --------------------------------------------------------------------------------
1 | PYTHON:=$(shell which python3) 2 | 3 | all: python 4 | 5 | .PHONY: clean python test flake8 6 | 7 | python: setup.py requirements.txt 8 | pip install virtualenv 9 | echo "\n Creating python virtual environment......\n" 10 | virtualenv -p $(PYTHON) env 11 | echo "\n Use python virtual environment to install required packages......\n" 12 | env/bin/pip install -e . 13 | mkdir -p webspider/log 14 | touch webspider/log/spider_log.txt 15 | 16 | test: flake8 17 | env/bin/nosetests -vd 18 | 19 | flake8: 20 | env/bin/flake8 21 | 22 | clean: 23 | -rm -rf env cover *eggs *.egg-info *.egg webspider/log 24 | @find . -type f -name "*.py[co]" -delete 25 | @find . -type d -name "__pycache__" -delete 26 |
-------------------------------------------------------------------------------- /README.md: --------------------------------------------------------------------------------
1 | # webspider 2 | 3 | [![Build Status](https://travis-ci.org/JustForFunnnn/webspider.svg)](https://travis-ci.org/JustForFunnnn/webspider) 4 | [![codecov](https://codecov.io/gh/JustForFunnnn/webspider/branch/master/graph/badge.svg)](https://codecov.io/gh/JustForFunnnn/webspider) 5 | [![Code Health](https://landscape.io/github/JustForFunnnn/webspider/master/landscape.svg?style=flat)](https://landscape.io/github/JustForFunnnn/webspider/master) 6 | [![License](https://img.shields.io/github/license/JustForFunnnn/webspider.svg)](https://github.com/JustForFunnnn/webspider/blob/master/LICENSE) 7 | [![Python](https://img.shields.io/badge/python-3-ff69b4.svg)](https://github.com/JustForFunnnn/webspider) 8 | 9 | | Item | Value | 10 | | -------- | ------------------------------------------ | 11 | | Version | 1.0.1 | 12 | | Website | http://119.23.223.90:8000 | 13 | | Source | https://github.com/JustForFunnnn/webspider | 14 | | Keywords | `Python3`, `Tornado`, `Celery`, `Requests` | 15 | 16 | ## Introduction 17 | 18 | This project crawls job and company data from job-seeking websites, then cleans, models, converts, and stores the data in a database. [Echarts](https://echarts.apache.org/en/index.html) and [Bootstrap](https://getbootstrap.com/) are then used to build a front-end page that displays the IT job statistics and shows the newest requirements and trends of the IT job market. 19 | 20 | ## Demo 21 | 22 | Enter a keyword you are interested in, such as "Python", into the search box, then click the search button; the statistics for that keyword will be displayed.
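The same statistics are also exposed as JSON by the web API. A minimal sketch of querying it with `requests` (the endpoint path comes from this repo's test suite; the local port 8000 is an assumption based on the demo site above):

```python
import requests

# fetch the aggregated statistics for a keyword as JSON
# (assumes the web service started by `env/bin/web` listens on port 8000)
resp = requests.get('http://localhost:8000/api/statistics', params={'keyword_name': 'python'})
print(resp.json())
```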
23 | 24 | * The first chart, `Years of Working (工作年限要求)`, shows the experience requirements for `Python` jobs; according to the data, `3 ~ 5 years` is the most frequent requirement, followed by `1 ~ 3 years` ([Chart Source Code](https://github.com/JustForFunnnn/webspider/blob/8664fdd135d0ee4322169e484ba9a35bc46032bf/webspider/web/templates/work-year-chart-module.html)) 25 | 26 | * The second chart, `Salary Range (薪水分布)`, shows the salary distribution for `Python` jobs; according to the data, `11k ~ 20k` is the most frequently offered range, followed by `21k ~ 35k` ([Chart Source Code](https://github.com/JustForFunnnn/webspider/blob/8664fdd135d0ee4322169e484ba9a35bc46032bf/webspider/web/templates/salary-chart-module.html)) 27 | 28 | There are also charts for: 29 | * [Education Requirement Data Chart](https://github.com/JustForFunnnn/webspider/blob/8664fdd135d0ee4322169e484ba9a35bc46032bf/webspider/web/templates/education-chart-module.html) 30 | * [City Job Count Chart](https://github.com/JustForFunnnn/webspider/blob/8664fdd135d0ee4322169e484ba9a35bc46032bf/webspider/web/templates/city-jobs-count-chart-module.html) 31 | * [Job Count Change Chart](https://github.com/JustForFunnnn/webspider/blob/8664fdd135d0ee4322169e484ba9a35bc46032bf/webspider/web/templates/per-day-jobs-count-chart-module.html) 32 | * [Company Finance Stage Chart](https://github.com/JustForFunnnn/webspider/blob/8664fdd135d0ee4322169e484ba9a35bc46032bf/webspider/web/templates/finance-stage-chart-module.html) 33 | 34 | Example charts for `Python`: 35 | 36 | ![Python job statistics charts](job-chart.jpg) 37 | 38 | ## Quick Start 39 | > This tutorial is based on `Linux - Ubuntu`; for other systems, please use the corresponding commands 40 | 41 | * Clone the project 42 | 43 | ```bash 44 | git clone git@github.com:JustForFunnnn/webspider.git 45 | ``` 46 | 47 | * Install `MySQL`, `Redis`, `Python3` 48 | 49 | ```bash 50 | # install Redis 51 | apt-get install redis-server 52 | 53 | # run Redis in background 54 | nohup redis-server & 55 | 56 | # install Python3 57 | apt-get install python3 58 | 59 | # install MySQL 60 | apt-get install mysql-server 61 | 62 | # start MySQL 63 | sudo service mysql start 64 | ``` 65 | 66 | * Configure the database and tables 67 | ```sql 68 | # create database 69 | CREATE DATABASE `spider` CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci; 70 | ``` 71 | We still need to create the tables: copy the table-definition SQL from `tests/schema.sql` and run it in MySQL 72 | 73 | * Build the project 74 | ```bash 75 | # after a successful build, executable commands will be generated under env/bin 76 | make 77 | ``` 78 | 79 | * Run the unit tests 80 | ```bash 81 | make test 82 | ``` 83 | 84 | * Run the code-style check 85 | ```bash 86 | make flake8 87 | ``` 88 | 89 | * Start the web service 90 | ```bash 91 | env/bin/web 92 | ``` 93 | 94 | * Start the crawlers 95 | ```bash 96 | # run the task scheduler/dispatcher 97 | env/bin/celery_beat 98 | # run the celery worker for job data 99 | env/bin/celery_lg_jobs_data_worker 100 | # run the celery worker for job counts 101 | env/bin/celery_lg_jobs_count_worker 102 | ``` 103 | 104 | * Other commands 105 | ```bash 106 | # crawl job counts immediately 107 | env/bin/crawl_lg_jobs_count 108 | # crawl job data immediately 109 | env/bin/crawl_lg_data 110 | # start celery monitoring 111 | env/bin/celery_flower 112 | ``` 113 | 114 | * Clean 115 | ```bash 116 | # clean the existing build artifacts 117 | make clean 118 | ``` 119 | -------------------------------------------------------------------------------- /job-chart.jpg: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/JustForFunnnn/webspider/e41e94fb4f269ecab61cc19c76331c7667151cfb/job-chart.jpg
-------------------------------------------------------------------------------- /requirements.txt: --------------------------------------------------------------------------------
1 | # All requirements put in this file 2 | # You **MUST** specify the package version in this file 3 | 4 | tornado == 4.5.3 5 | gevent == 1.2.2 6 | gunicorn == 19.7.1 7 | lxml == 4.1.0 8 | requests == 2.18.4 9 | mysqlclient == 1.3.12 10 | sqlalchemy == 1.2.2 11 | redis == 2.10.6 12 | python-redis == 0.1.7 13 | retrying == 1.3.3 14 | celery == 4.0.2 15 | 16 | flower == 0.9.2 17 | ipython == 6.2.1 18 | nose == 1.3.7 19 | coverage == 4.4.2 20 | flake8 == 3.5.0 21 | codecov == 2.0.15
-------------------------------------------------------------------------------- /setup.cfg: --------------------------------------------------------------------------------
1 | [flake8] 2 | ignore = W291 3 | max-line-length = 120 4 | exclude = 5 | .git, 6 | eggs, 7 | env, 8 | tests 9 | 10 | [nosetests] 11 | logging-clear-handlers = 1 12 | with-coverage = 1 13 | cover-package = webspider 14 | cover-erase = 1 15 | logging-level = DEBUG 16 | cover-xml = 1 17 | cover-xml-file = coverage.xml 18 |
-------------------------------------------------------------------------------- /setup.py: --------------------------------------------------------------------------------
1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | import os 4 | from setuptools import find_packages, setup 5 | 6 | from webspider import __version__ 7 | 8 | # get the dependencies and installs 9 | here = os.path.abspath(os.path.dirname(__file__)) 10 | with open(os.path.join(here, 'requirements.txt')) as f: 11 | all_requirements = f.read().split('\n') 12 | 13 | setup( 14 | name='webspider', 15 | version=__version__, 16 | license='MIT', 17 | author='JustForFunnn', 18 | author_email='', 19 | description='web spider', 20 | url='https://github.com/JustForFunnnn/webspider', 21 | packages=find_packages(exclude=['tests']), 22 | package_data={'webspider': ['README.md']}, 23 | zip_safe=False, 24 | install_requires=all_requirements, 25 | entry_points={ 26 | 'console_scripts': [ 27 | 'web = webspider.web.app:main', 28 | 'production_web = webspider.quickly_cmd:run_web_app_by_gunicorn', 29 | 'crawl_lg_data = webspider.tasks.actor.lg_data:crawl_lg_data_task', 30 | 'crawl_lg_jobs_count = webspider.tasks.actor.lg_jobs_count:crawl_lg_jobs_count_task', 31 | # beat 32 | 'celery_beat = webspider.quickly_cmd:run_celery_beat', 33 | 'celery_flower = webspider.quickly_cmd:run_celery_flower', 34 | # worker 35 | 'celery_default_worker = webspider.quickly_cmd:run_celery_default_worker', 36 | 'celery_lg_data_worker = webspider.quickly_cmd:run_celery_lg_data_worker', 37 | 'celery_lg_jobs_data_worker = webspider.quickly_cmd:run_celery_lg_jobs_data_worker', 38 | 'celery_lg_jobs_count_worker = webspider.quickly_cmd:run_celery_lg_jobs_count_worker', 39 | ], 40 | } 41 | )
-------------------------------------------------------------------------------- /tests/__init__.py: --------------------------------------------------------------------------------
1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | import os 4 | from unittest import TestCase 5 | 6 | from webspider.utils.sql import get_session 7 | from tests.util import execute_sql_file, get_current_database_name 8 | 9 | here_dir = 
os.path.dirname(__file__) 10 | 11 | 12 | class BaseTestCase(TestCase): 13 | session = get_session() 14 | 15 | def setUp(self): 16 | test_db_name = 'test_spider' 17 | # drop the test database if it already exists 18 | self.session.execute("DROP DATABASE IF EXISTS {db_name};".format(db_name=test_db_name)) 19 | # create a fresh test database 20 | self.session.execute("CREATE DATABASE {db_name} CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci;".format( 21 | db_name=test_db_name)) 22 | # switch to the test database test_spider 23 | self.session.execute("USE {db_name};".format(db_name=test_db_name)) 24 | 25 | path = os.path.dirname(__file__) 26 | # create the tables 27 | execute_sql_file( 28 | file_paths=[os.path.join(path, "schema.sql"), ], 29 | db_session=self.session, 30 | predictive_db_name=test_db_name 31 | ) 32 | fixture_path = os.path.join(path, 'fixture') 33 | # load the fixture data 34 | fixture_file_paths = [os.path.join(fixture_path, file) for file in os.listdir(fixture_path)] 35 | execute_sql_file( 36 | file_paths=fixture_file_paths, 37 | db_session=self.session, 38 | predictive_db_name=test_db_name 39 | ) 40 | assert get_current_database_name(self.session) == test_db_name 41 | 42 | def tearDown(self): 43 | # drop the test database once the test finishes 44 | self.session.execute('DROP DATABASE test_spider;') 45 |
-------------------------------------------------------------------------------- /tests/base.py: --------------------------------------------------------------------------------
1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | import os 4 | from unittest import TestCase 5 | 6 | from webspider.utils.sql import get_session 7 | from tests.util import create_test_db, drop_test_db 8 | 9 | here_dir = os.path.dirname(__file__) 10 | 11 | 12 | class BaseTestCase(TestCase): 13 | session = get_session() 14 | 15 | def setUp(self): 16 | create_test_db(session=self.session) 17 | 18 | def tearDown(self): 19 | # drop the test database once the test finishes 20 | drop_test_db(session=self.session) 21 |
-------------------------------------------------------------------------------- /tests/fixture/city.sql: --------------------------------------------------------------------------------
1 | INSERT INTO `test_spider`.`city`(`id`, `name`) 2 | VALUE (2, '北京'), 3 | (3, '上海'), 4 | (4, '广州');
-------------------------------------------------------------------------------- /tests/fixture/company.sql: --------------------------------------------------------------------------------
1 | INSERT INTO `company` (`id`, `lg_company_id`, `city_id`, `shortname`, `fullname`, `finance_stage`, `size`, `address`, `features`, `process_rate`, `introduce`, `advantage`, `created_at`, `updated_at`) 2 | VALUES 3 | (1, 168219, 2, '贝壳金控', '贝壳金控控股集团有限公司', 1, 5, '2017年5月,贝壳正式独立运作,是国内首家聚焦于居住领域的消费金融服务平台','不知道', 100, '我是简介', '[\"\\u4e13\\u9879\\u5956\\u91d1\", \"\\u5e74\\u7ec8\\u5206\\u7ea2\", \"\\u5b9a\\u671f\\u4f53\\u68c0\", \"\\u7ee9\\u6548\\u5956\\u91d1\", \"\\u5348\\u9910\\u8865\\u52a9\", \"\\u4ea4\\u901a\\u8865\\u52a9\"]', '2018-01-28 15:26:13', '2018-01-28 15:35:19'), 4 | (2, 142800, 2, '猫眼电影', '北京猫眼文化传媒有限公司', 1, 5, '北京朝阳区望京东路4号恒电大厦B座8层', '一网打尽好电影', 100, '猫眼电影简介\n猫眼电影(网站经营者:北京猫眼文化传媒有限公司)是美团。。。', '[]', '2018-01-28 15:26:13', '2018-01-28 15:35:19'), 5 | (3, 107435, 2, '熊猫直播', '上海熊猫互娱文化有限公司北京分公司', 3, 5, '北京朝阳区望京soho塔3,A座18层', '王思聪任CEO的视频直播平台', 100, '熊猫直播成立于2015年7月,由王思聪先生亲任CEO,并聚集了国内众多一线视频主播资源。', '[\"\\u5e74\\u5e95\\u53cc\\u85aa\", \"\\u5e26\\u85aa\\u5e74\\u5047\", \"\\u5348\\u9910\\u8865\\u52a9\", \"\\u7ee9\\u6548\\u5956\\u91d1\", \"\\u80a1\\u7968\\u671f\\u6743\"]', '2018-01-28 15:26:13', '2018-01-28 15:35:19'); 6 |
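A side note on the fixture format: the `advantage` column above stores a JSON-encoded list whose `\uXXXX` escapes decode back to readable Chinese. A quick illustrative snippet (not part of the test suite):

```python
import json

# one of the escaped `advantage` values from company.sql above
raw = '["\\u4e13\\u9879\\u5956\\u91d1", "\\u5e74\\u7ec8\\u5206\\u7ea2"]'
print(json.loads(raw))  # ['专项奖金', '年终分红'] -- bonus/perk labels
```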
-------------------------------------------------------------------------------- /tests/fixture/industry.sql: --------------------------------------------------------------------------------
1 | INSERT INTO `test_spider`.`industry` (`id`, `name`, `created_at`, `updated_at`) 2 | VALUES 3 | (1000001, '开网吧', '2018-01-29 19:07:52', '2018-01-29 19:07:52'), 4 | (1000002, '开餐厅', '2018-01-29 19:07:52', '2018-01-29 19:07:52'); 5 |
-------------------------------------------------------------------------------- /tests/fixture/job.sql: --------------------------------------------------------------------------------
1 | INSERT INTO `test_spider`.`job` (`id`, `lg_job_id`, `city_id`, `company_id`, `title`, `work_year`, `department`, `salary`, `education`, `nature`, `description`, `advantage`, `created_at`, `updated_at`) 2 | VALUES 3 | (1, 10001, 2, 1, '高级前端开发工程师', 5, '贝壳金控交易研发部-交易前端组招聘', '15k-30k', 3, 1, '职位介绍A', '15薪,工作居住证,六险一金,双休', '2018-01-29 19:11:33', '2018-01-30 17:22:30'), 4 | (2, 10002, 4, 2, '前端开发工程师', 6, '贝壳金控技术产品中心招聘', '20k-40k', 3, 1, '职位介绍B', '高薪,大牛,六险一金,成长空间大', '2018-01-29 19:11:33', '2018-01-30 17:22:30'), 5 | (3, 10003, 4, 3, 'DBA工程师', 5, '贝壳金控运维技术部招聘', '15k-30k', 3, 1, '职位介绍C', '大牛,高薪,成长空间大,团队氛围好', '2018-01-29 19:11:33', '2018-01-30 17:22:30'); 6 |
-------------------------------------------------------------------------------- /tests/fixture/job_keyword.sql: --------------------------------------------------------------------------------
1 | INSERT INTO `job_keyword` (`id`, `job_id`, `keyword_id`, `created_at`, `updated_at`) 2 | VALUES 3 | (1, 1, 100, '2018-01-28 15:36:12', '2018-01-28 15:36:12'), 4 | (2, 1, 101, '2018-01-28 15:36:12', '2018-01-28 15:36:12'), 5 | (3, 2, 100, '2018-01-28 15:36:12', '2018-01-28 15:36:12'), 6 | (4, 2, 101, '2018-01-28 15:36:12', '2018-01-28 15:36:12'), 7 | (5, 2, 102, '2018-01-28 15:36:12', '2018-01-28 15:36:12'), 8 | (6, 3, 100, '2018-01-28 15:36:12', '2018-01-28 15:36:12'); 9 |
-------------------------------------------------------------------------------- /tests/fixture/jobs_count.sql: --------------------------------------------------------------------------------
1 | INSERT INTO `jobs_count` (`id`, `date`, `keyword_id`, `all_city`, `beijing`, `guangzhou`, `shenzhen`, 2 | `shanghai`, `hangzhou`, `chengdu`, `created_at`, `updated_at`) 3 | VALUES 4 | (1, 20180128, 100, 576, 198, 35, 93, 80, 41, 26, '2018-01-28 17:01:04', '2018-01-28 17:01:04'), 5 | (2, 20180129, 100, 580, 200, 36, 100, 82, 44, 30, '2018-01-28 17:01:04', '2018-01-28 17:01:04'); 6 |
-------------------------------------------------------------------------------- /tests/fixture/keyword.sql: --------------------------------------------------------------------------------
1 | INSERT INTO `test_spider`.`keyword` (`id`, `name`) 2 | VALUES 3 | (100, 'python'), 4 | (101, 'java'), 5 | (102, 'qt'), 6 | (103, '前端'); 7 |
-------------------------------------------------------------------------------- /tests/fixture/keyword_statistic.sql: --------------------------------------------------------------------------------
1 | INSERT INTO `keyword_statistic` (`id`, `keyword_id`, `educations`, `city_jobs_count`, `salary`, `financing_stage`, `work_years`, `created_at`, `updated_at`) 2 | VALUES 3 | (1, 100, 4 | '{"不限": 1, "大专": 2, "本科": 4, "硕士": 5, "博士": 6, "unknown": 7}', 5 | '{"北京": 8, "深圳": 9, "广州": 10}', 6 | '{"10k以下": 11, "11k-20k": 12, "21k-35k": 13, "36k-60k": 14, "61k以上": 15}', 7 | '{"未融资": 16, "天使轮": 17, "A轮": 18, "B轮": 19, "C轮": 20, "D轮及以上": 21, "上市公司": 22, "不需要融资": 23, "unknown": 24}', 8
| '{"不限": 25, "应届毕业生": 26, "1年以下": 27, "1-3年": 28, "3-5年": 29, "5-10年": 30, "10年以上": 31, "unknown": 32}', 9 | '2018-02-01 19:01:44', '2018-02-05 01:01:48'); 10 | -------------------------------------------------------------------------------- /tests/schema.sql: -------------------------------------------------------------------------------- 1 | -- CREATE DATABASE `spider` CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci; 2 | 3 | CREATE TABLE IF NOT EXISTS `job` ( 4 | `id` INT UNSIGNED NOT NULL PRIMARY KEY AUTO_INCREMENT, 5 | `lg_job_id`INT UNSIGNED NOT NULL COMMENT '所使用的职位id', 6 | `city_id` INT UNSIGNED NOT NULL COMMENT '城市 id', 7 | `company_id` INT UNSIGNED NOT NULL COMMENT '公司 id', 8 | `title` VARCHAR(64) NOT NULL COMMENT '职位标题', 9 | `work_year` TINYINT NOT NULL DEFAULT 0 COMMENT '工作年限要求', 10 | `department` VARCHAR(64) NOT NULL DEFAULT '' COMMENT '招聘部门', 11 | `salary` VARCHAR(32) NOT NULL DEFAULT '' COMMENT '薪水', 12 | `education` TINYINT NOT NULL DEFAULT 0 COMMENT '教育背景要求', 13 | `nature` TINYINT NOT NULL DEFAULT 0 COMMENT '工作性质', 14 | `description` VARCHAR(2048) NOT NULL DEFAULT '' COMMENT '额外描述', 15 | `advantage` VARCHAR(256) NOT NULL DEFAULT '' COMMENT '职位优势', 16 | `created_at` TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP COMMENT '创建时间', 17 | `updated_at` TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP COMMENT '最后更新时间', 18 | UNIQUE KEY (`lg_job_id`), 19 | KEY `idx_company_id` (`company_id`) 20 | ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COMMENT='职位表'; 21 | 22 | 23 | CREATE TABLE IF NOT EXISTS `company` ( 24 | `id` INT UNSIGNED NOT NULL PRIMARY KEY AUTO_INCREMENT, 25 | `lg_company_id` INT UNSIGNED NOT NULL COMMENT '所使用的公司id', 26 | `city_id` INT UNSIGNED NOT NULL COMMENT '所在城市 id', 27 | `shortname` VARCHAR(64) NOT NULL COMMENT '公司名称', 28 | `fullname` VARCHAR(128) NOT NULL COMMENT '公司全称', 29 | `finance_stage` TINYINT NOT NULL DEFAULT 0 COMMENT '融资阶段', 30 | `size` TINYINT NOT NULL DEFAULT 0 COMMENT '公司规模', 31 | `address` VARCHAR(128) NOT NULL DEFAULT '' COMMENT '公司地址', 32 | `features` VARCHAR(128) NOT NULL DEFAULT '' COMMENT '公司特点', 33 | `process_rate` TINYINT NOT NULL DEFAULT 0 COMMENT '简历处理率', 34 | `introduce` VARCHAR(2048) NOT NULL DEFAULT '' COMMENT '公司简介', 35 | `advantage` VARCHAR(256) NOT NULL DEFAULT '' COMMENT '公司优势', 36 | `created_at` TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP COMMENT '创建时间', 37 | `updated_at` TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP COMMENT '最后更新时间', 38 | UNIQUE KEY (`lg_company_id`), 39 | KEY `idx_city_id` (`city_id`) 40 | ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COMMENT='公司表'; 41 | 42 | 43 | CREATE TABLE IF NOT EXISTS `city` ( 44 | `id` INT UNSIGNED NOT NULL PRIMARY KEY AUTO_INCREMENT, 45 | `name` VARCHAR(64) NOT NULL COMMENT '城市名', 46 | `created_at` TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP COMMENT '创建时间', 47 | `updated_at` TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP COMMENT '最后更新时间', 48 | UNIQUE KEY (`name`) 49 | ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COMMENT='城市表'; 50 | 51 | 52 | CREATE TABLE IF NOT EXISTS `industry` ( 53 | `id` INT UNSIGNED NOT NULL PRIMARY KEY AUTO_INCREMENT, 54 | `name` VARCHAR(64) NOT NULL COMMENT '行业名称', 55 | `created_at` TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP COMMENT '创建时间', 56 | `updated_at` TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP COMMENT '最后更新时间', 57 | UNIQUE KEY (`name`) 58 | ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COMMENT='行业表'; 59 | 60 | 61 | CREATE TABLE IF NOT EXISTS `company_industry` ( 62 | `id` INT 
UNSIGNED NOT NULL PRIMARY KEY AUTO_INCREMENT, 63 | `company_id` INT UNSIGNED NOT NULL COMMENT '公司 id', 64 | `industry_id` INT UNSIGNED NOT NULL COMMENT '行业 id', 65 | `created_at` TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP COMMENT '创建时间', 66 | `updated_at` TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP COMMENT '最后更新时间', 67 | UNIQUE KEY(`company_id`, `industry_id`) 68 | ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COMMENT='公司行业表'; 69 | 70 | 71 | -- 预置行业类型 72 | INSERT INTO `industry` (`id`, `name`) 73 | VALUES 74 | (24,'移动互联网'), 75 | (25,'电子商务'), 76 | (26,'社交网络'), 77 | (27,'企业服务'), 78 | (28,'O2O'), 79 | (29,'教育'), 80 | (31,'游戏'), 81 | (32,'旅游'), 82 | (33,'金融'), 83 | (34,'医疗健康'), 84 | (35,'生活服务'), 85 | (38,'信息安全'), 86 | (41,'数据服务'), 87 | (43,'广告营销'), 88 | (45,'文化娱乐'), 89 | (47,'硬件'), 90 | (48,'分类信息'), 91 | (49,'招聘'), 92 | (10594,'其他'); 93 | 94 | 95 | CREATE TABLE IF NOT EXISTS `keyword` ( 96 | `id` INT UNSIGNED NOT NULL PRIMARY KEY AUTO_INCREMENT, 97 | `name` VARCHAR(64) NOT NULL COMMENT '关键词名称', 98 | `created_at` TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP COMMENT '创建时间', 99 | `updated_at` TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP COMMENT '最后更新时间', 100 | UNIQUE KEY (`name`) 101 | ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COMMENT='关键词'; 102 | 103 | 104 | CREATE TABLE IF NOT EXISTS `job_keyword` ( 105 | `id` INT UNSIGNED NOT NULL PRIMARY KEY AUTO_INCREMENT, 106 | `job_id` INT NOT NULL COMMENT '工作 id', 107 | `keyword_id` INT NOT NULL COMMENT '关键词 id', 108 | `created_at` TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP COMMENT '创建时间', 109 | `updated_at` TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP COMMENT '最后更新时间', 110 | UNIQUE KEY(`job_id`, `keyword_id`), 111 | KEY `idx_keyword_id` (`keyword_id`) 112 | ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COMMENT='职位关键词'; 113 | 114 | 115 | CREATE TABLE IF NOT EXISTS `jobs_count` ( 116 | `id` INT UNSIGNED NOT NULL PRIMARY KEY AUTO_INCREMENT, 117 | `date` INT NOT NULL COMMENT '日期', 118 | `keyword_id` INT NOT NULL COMMENT '关键词 id', 119 | `all_city` INT NOT NULL DEFAULT 0 COMMENT '全国岗位数量', 120 | `beijing` INT NOT NULL DEFAULT 0 COMMENT '北京岗位数量', 121 | `guangzhou` INT NOT NULL DEFAULT 0 COMMENT '广州岗位数量', 122 | `shenzhen` INT NOT NULL DEFAULT 0 COMMENT '深圳岗位数量', 123 | `shanghai` INT NOT NULL DEFAULT 0 COMMENT '上海岗位数量', 124 | `hangzhou` INT NOT NULL DEFAULT 0 COMMENT '杭州岗位数量', 125 | `chengdu` INT NOT NULL DEFAULT 0 COMMENT '成都岗位数量', 126 | `created_at` TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP COMMENT '创建时间', 127 | `updated_at` TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP COMMENT '最后更新时间', 128 | UNIQUE KEY(`date`, `keyword_id`), 129 | KEY `idx_keyword_id` (`keyword_id`) 130 | ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COMMENT='职位每日数量统计'; 131 | 132 | 133 | CREATE TABLE IF NOT EXISTS `keyword_statistic` ( 134 | `id` INT UNSIGNED NOT NULL PRIMARY KEY AUTO_INCREMENT, 135 | `keyword_id` INT UNSIGNED NOT NULL COMMENT '关键词 id', 136 | `educations` VARCHAR(2048) NOT NULL DEFAULT '' COMMENT '教育背景要求情况', 137 | `city_jobs_count`VARCHAR(2048) NOT NULL DEFAULT '' COMMENT '城市职位数量情况', 138 | `salary` VARCHAR(2048) NOT NULL DEFAULT '' COMMENT '薪水分布情况', 139 | `financing_stage` VARCHAR(2048) NOT NULL DEFAULT '' COMMENT '公司融资阶段情况', 140 | `work_years` VARCHAR(2048) NOT NULL DEFAULT '' COMMENT '要求的工作年限情况', 141 | `created_at` TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP COMMENT '创建时间', 142 | `updated_at` TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP COMMENT 
'最后更新时间', 143 | UNIQUE KEY(`keyword_id`) 144 | ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COMMENT='关键词分析表';
-------------------------------------------------------------------------------- /tests/test_controllers/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JustForFunnnn/webspider/e41e94fb4f269ecab61cc19c76331c7667151cfb/tests/test_controllers/__init__.py
-------------------------------------------------------------------------------- /tests/test_controllers/test_city_ctl.py: --------------------------------------------------------------------------------
1 | # coding=utf-8 2 | from tests import BaseTestCase 3 | from webspider.controllers import city_ctl 4 | from webspider.models import CityModel 5 | 6 | 7 | class TestCityController(BaseTestCase): 8 | def test_get_city_id_by_name(self): 9 | city_id = city_ctl.get_city_id_by_name(name='北京') 10 | self.assertEqual(city_id, 2) 11 | 12 | with self.assertRaises(ValueError): 13 | city_ctl.get_city_id_by_name(name='通利福尼亚') 14 | 15 | def test_insert_city_if_not_exist(self): 16 | city_id = city_ctl.insert_city_if_not_exist('湛江') 17 | self.assertTrue(city_id > 0) 18 | city = CityModel.get_by_pk(pk=city_id) 19 | self.assertEqual(city.name, '湛江') 20 | 21 | self.assertIsNone(city_ctl.insert_city_if_not_exist('湛江')) 22 | 23 | def test_get_city_name_dict(self): 24 | city_name_dict = city_ctl.get_city_name_dict() 25 | self.assertDictEqual(city_name_dict, {'北京': 2, '上海': 3, '广州': 4}) 26 |
-------------------------------------------------------------------------------- /tests/test_controllers/test_industry_ctl.py: --------------------------------------------------------------------------------
1 | # coding=utf-8 2 | from tests import BaseTestCase 3 | from webspider.controllers import industry_ctl 4 | from webspider.models import IndustryModel 5 | 6 | 7 | class TestIndustryController(BaseTestCase): 8 | def test_get_industry_id_by_name(self): 9 | industry_id = industry_ctl.get_industry_id_by_name(name='开网吧') 10 | self.assertEqual(industry_id, 1000001) 11 | 12 | with self.assertRaises(ValueError): 13 | industry_ctl.get_industry_id_by_name(name='开飞机') 14 | 15 | def test_insert_industry_if_not_exist(self): 16 | industry_name = '开飞机' 17 | industry_id = industry_ctl.insert_industry_if_not_exist(industry_name) 18 | self.assertTrue(industry_id > 0) 19 | industry = IndustryModel.get_by_pk(pk=industry_id) 20 | self.assertEqual(industry.name, industry_name) 21 | 22 | self.assertIsNone(industry_ctl.insert_industry_if_not_exist(industry_name)) 23 |
-------------------------------------------------------------------------------- /tests/test_controllers/test_job_ctl.py: --------------------------------------------------------------------------------
1 | # coding=utf-8 2 | from unittest import TestCase 3 | 4 | from webspider.controllers import job_ctl 5 | 6 | 7 | class TestJobController(TestCase): 8 | 9 | def test_get_salary_section(self): 10 | salary = '15k-25k' 11 | left, right = job_ctl.get_salary_section(salary) 12 | self.assertEqual(left, 15) 13 | self.assertEqual(right, 25) 14 | 15 | salary = '15k以上' 16 | left, right = job_ctl.get_salary_section(salary) 17 | self.assertEqual(left, 15) 18 | self.assertEqual(right, 20) 19 | 20 | salary = '15k以下' 21 | left, right = job_ctl.get_salary_section(salary) 22 | self.assertEqual(left, 10) 23 | self.assertEqual(right, 15) 24 | 25 | with self.assertRaises(ValueError): 26 | left, right = job_ctl.get_salary_section('15k30k') 27 | 
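The `get_salary_section` implementation in `webspider/controllers/job_ctl.py` is not included in this dump; a minimal sketch consistent with the assertions above (the 5k offsets used for the open-ended `以上`/`以下` forms are inferred from the expected values, so treat them as an assumption):

```python
import re

def get_salary_section(salary):
    """Parse a salary string such as '15k-25k', '15k以上' or '15k以下' into a (low, high) pair in k."""
    numbers = [int(n) for n in re.findall(r'\d+', salary)]
    if salary.endswith('以上') and len(numbers) == 1:   # 'above 15k' -> (15, 20)
        return numbers[0], numbers[0] + 5
    if salary.endswith('以下') and len(numbers) == 1:   # 'below 15k' -> (10, 15)
        return max(numbers[0] - 5, 0), numbers[0]
    if '-' in salary and len(numbers) == 2:             # '15k-25k' -> (15, 25)
        return numbers[0], numbers[1]
    raise ValueError('cannot parse salary string: {}'.format(salary))
```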
-------------------------------------------------------------------------------- /tests/test_controllers/test_job_keyword_ctl.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | from tests import BaseTestCase 3 | from webspider.controllers import job_keyword_ctl 4 | 5 | 6 | class TestJobKeywordController(BaseTestCase): 7 | def test_get_most_frequently_keyword_ids(self): 8 | keyword_ids = job_keyword_ctl.get_most_frequently_keyword_ids() 9 | self.assertEqual(keyword_ids, [100, 101, 102]) 10 | 11 | keyword_ids = job_keyword_ctl.get_most_frequently_keyword_ids(limit=2) 12 | self.assertEqual(keyword_ids, [100, 101]) 13 | 14 | keyword_ids = job_keyword_ctl.get_most_frequently_keyword_ids(offset=1) 15 | self.assertEqual(keyword_ids, [101, 102]) 16 | 17 | keyword_ids = job_keyword_ctl.get_most_frequently_keyword_ids(limit=1, offset=1) 18 | self.assertEqual(keyword_ids, [101]) 19 | -------------------------------------------------------------------------------- /tests/test_controllers/test_keyword_ctl.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | from tests import BaseTestCase 3 | from webspider.controllers import keyword_ctl 4 | from webspider.models import KeywordModel 5 | 6 | 7 | class TestKeywordController(BaseTestCase): 8 | def test_get_keyword_name_by_id(self): 9 | keyword_name = keyword_ctl.get_keyword_name_by_id(keyword_id=100) 10 | self.assertEqual(keyword_name, 'python') 11 | 12 | with self.assertRaises(ValueError): 13 | keyword_ctl.get_keyword_name_by_id(keyword_id=10001) 14 | 15 | def test_get_keyword_id_by_name(self): 16 | keyword_id = keyword_ctl.get_keyword_id_by_name(name='python') 17 | self.assertEqual(keyword_id, 100) 18 | 19 | with self.assertRaises(ValueError): 20 | keyword_ctl.get_keyword_id_by_name(name='go') 21 | 22 | def test_insert_keyword_if_not_exist(self): 23 | keyword_name = 'C--' 24 | keyword_id = keyword_ctl.insert_keyword_if_not_exist(keyword_name) 25 | self.assertTrue(keyword_id > 0) 26 | keyword = KeywordModel.get_by_pk(pk=keyword_id) 27 | self.assertEqual(keyword.name, keyword_name) 28 | 29 | self.assertIsNone(keyword_ctl.insert_keyword_if_not_exist(keyword_name)) 30 | -------------------------------------------------------------------------------- /tests/test_controllers/test_keyword_statistic_ctl.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | from tests import BaseTestCase 3 | from webspider.controllers import keyword_statistic_ctl 4 | from webspider.models import JobModel 5 | from webspider.constants import EDUCATION_REQUEST_DICT, WORK_YEARS_REQUEST_DICT 6 | 7 | 8 | class TestKeywordStatisticController(BaseTestCase): 9 | def test_get_salary_statistic(self): 10 | test_jobs_model = [JobModel(salary='5k-9k'), JobModel(salary='10-15k'), JobModel(salary='15k-20k'), 11 | JobModel(salary='16-18k'), JobModel(salary='20k-30k'), JobModel(salary='30k-35k'), 12 | JobModel(salary='20k以上'), JobModel(salary='60k-100k'), JobModel(salary='40k-42k')] 13 | salary_statistic = keyword_statistic_ctl.get_salary_statistic(test_jobs_model) 14 | self.assertDictEqual(salary_statistic, { 15 | '10k及以下': 2, 16 | '11k-20k': 5, 17 | '21k-35k': 3, 18 | '36k-60k': 2, 19 | '61k以上': 1, 20 | }) 21 | 22 | def test_get_finance_stage_statistic(self): 23 | test_jobs_model = [JobModel(company_id=1), JobModel(company_id=2), JobModel(company_id=3)] 24 | finance_stage_statistic = 
keyword_statistic_ctl.get_finance_stage_statistic(test_jobs_model) 25 | self.assertDictEqual(finance_stage_statistic, { 26 | '未融资': 2, 27 | 'A轮': 1, 28 | }) 29 | 30 | def test_get_educations_statistic(self): 31 | test_jobs_model = [JobModel(education=EDUCATION_REQUEST_DICT['大专']), 32 | JobModel(education=EDUCATION_REQUEST_DICT['本科']), 33 | JobModel(education=EDUCATION_REQUEST_DICT['本科'])] 34 | educations_statistic = keyword_statistic_ctl.get_educations_statistic(test_jobs_model) 35 | self.assertDictEqual(educations_statistic, { 36 | '本科': 2, 37 | '大专': 1, 38 | }) 39 | 40 | def test_get_work_years_statistic(self): 41 | test_jobs_model = [JobModel(work_year=WORK_YEARS_REQUEST_DICT['应届毕业生']), 42 | JobModel(work_year=WORK_YEARS_REQUEST_DICT['应届毕业生']), 43 | JobModel(work_year=WORK_YEARS_REQUEST_DICT['1-3年'])] 44 | work_years_statistic = keyword_statistic_ctl.get_work_years_statistic(test_jobs_model) 45 | self.assertDictEqual(work_years_statistic, { 46 | '应届毕业生': 2, 47 | '1-3年': 1, 48 | }) 49 | 50 | def test_get_city_jobs_count_statistic(self): 51 | test_jobs_model = [JobModel(city_id=2), JobModel(city_id=2), JobModel(city_id=2), JobModel(city_id=2), 52 | JobModel(city_id=3), JobModel(city_id=3), JobModel(city_id=3), 53 | JobModel(city_id=4), JobModel(city_id=4)] 54 | sorted_city_jobs_count_statistic = keyword_statistic_ctl.get_city_jobs_count_statistic(test_jobs_model) 55 | self.assertDictEqual(sorted_city_jobs_count_statistic, { 56 | '北京': 4, 57 | '上海': 3, 58 | '广州': 2, 59 | }) 60 | 61 | sorted_city_jobs_count_statistic = keyword_statistic_ctl.get_city_jobs_count_statistic(test_jobs_model, 2) 62 | self.assertDictEqual(sorted_city_jobs_count_statistic, { 63 | '北京': 4, 64 | '上海': 3 65 | }) 66 | -------------------------------------------------------------------------------- /tests/test_models/test_job.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | from datetime import datetime 3 | 4 | from sqlalchemy import and_ 5 | 6 | from tests import BaseTestCase 7 | from webspider.models import JobModel, CityModel 8 | 9 | test_job_dict = dict(id=1, 10 | lg_job_id=10001, 11 | city_id=2, 12 | company_id=1, 13 | title='高级前端开发工程师', 14 | work_year=5, 15 | department='贝壳金控交易研发部-交易前端组招聘', 16 | salary='15k-30k', 17 | education=3, 18 | nature=1, 19 | description='职位介绍A', 20 | advantage='15薪,工作居住证,六险一金,双休', 21 | created_at=datetime.strptime('2018-01-29 19:11:33', '%Y-%m-%d %H:%M:%S'), 22 | updated_at=datetime.strptime('2018-01-30 17:22:30', '%Y-%m-%d %H:%M:%S')) 23 | 24 | 25 | class TestJobModel(BaseTestCase): 26 | def test_pk_name(self): 27 | self.assertEqual(JobModel.pk_name, 'id') 28 | 29 | def test_pk(self): 30 | self.assertEqual(JobModel.pk, JobModel.id) 31 | 32 | def test_model_instance_to_dict(self): 33 | job = JobModel.get_by_pk(pk=1).dict() 34 | self.assertTrue(isinstance(job, dict)) 35 | self.assertDictEqual(job, test_job_dict) 36 | 37 | def test_get_by_pk(self): 38 | job = JobModel.get_by_pk(pk=1) 39 | self.assertDictEqual(job.dict(), test_job_dict) 40 | 41 | def test_count(self): 42 | jobs_count = JobModel.count() 43 | self.assertEqual(jobs_count, 3) 44 | 45 | jobs_count = JobModel.count(filter_by={'city_id': 4}) 46 | self.assertEqual(jobs_count, 2) 47 | 48 | jobs_count = JobModel.count(filter=(and_(JobModel.city_id == 4, JobModel.company_id == 3))) 49 | self.assertEqual(jobs_count, 1) 50 | 51 | jobs_count = JobModel.count(filter=(JobModel.id == 1)) 52 | self.assertEqual(jobs_count, 1) 53 | 54 | def test_is_exist(self): 55 | is_exist = 
JobModel.is_exist(filter=(JobModel.id == 1)) 56 | self.assertEqual(is_exist, True) 57 | 58 | def test_add(self): 59 | to_add_data_dict = dict(lg_job_id=10004, 60 | city_id=3, 61 | company_id=1, 62 | title='Python 开发工程师', 63 | work_year=5, 64 | department='吖吖项目组', 65 | salary='15k-35k', 66 | education=2, 67 | nature=1, 68 | description='职位介绍D', 69 | advantage='16薪,工作居住证,六十八险一金,双休', ) 70 | job_id = JobModel.add(**to_add_data_dict) 71 | self.assertTrue(job_id > 0) 72 | job = JobModel.get_by_pk(pk=job_id) 73 | self.assertDictContainsSubset(to_add_data_dict, job.dict()) 74 | 75 | def test_get_one(self): 76 | job = JobModel.get_one(filter_by={'id': 1}) 77 | self.assertDictEqual(job.dict(), test_job_dict) 78 | 79 | job = JobModel.get_one(filter=(JobModel.id == 1)) 80 | self.assertDictEqual(job.dict(), test_job_dict) 81 | 82 | def test_list(self): 83 | # test list 84 | jobs = JobModel.list() 85 | self.assertEqual(len(jobs), 3) 86 | self.assertDictEqual(jobs[0].dict(), test_job_dict) 87 | 88 | # test list limit 89 | jobs = JobModel.list(limit=1) 90 | self.assertEqual(len(jobs), 1) 91 | 92 | # test list offset 93 | jobs = JobModel.list(offset=1) 94 | self.assertEqual(len(jobs), 2) 95 | 96 | # test list filter_by 97 | jobs = JobModel.list(filter_by={'id': 1}) 98 | self.assertEqual(len(jobs), 1) 99 | self.assertEqual(jobs[0].dict(), test_job_dict) 100 | 101 | def test_update(self): 102 | init_job_data_dict = JobModel.get_by_pk(pk=1).dict() 103 | to_update_data_dict = dict(title=u'后端吃饭工程师', 104 | work_year=1, 105 | city_id=1, 106 | company_id=1, 107 | department='飞天面条神教招聘', 108 | salary='20k-32k', 109 | education=2, 110 | description=u'日常工作:吃饭!') 111 | 112 | affect_rows = JobModel.update(filter_by={'id': 1}, values=to_update_data_dict) 113 | self.assertEqual(affect_rows, 1) 114 | 115 | # 更新后预期的结果 116 | init_job_data_dict.update(**to_update_data_dict) 117 | predictive_job_data_dict = init_job_data_dict 118 | init_updated_at = init_job_data_dict.pop('updated_at') 119 | 120 | new_job_data_dict = JobModel.get_by_pk(pk=1).dict() 121 | self.assertDictContainsSubset(predictive_job_data_dict, new_job_data_dict) 122 | self.assertGreater(new_job_data_dict.updated_at, init_updated_at) 123 | 124 | # 其他记录不受影响 125 | self.assertEqual(JobModel.get_by_pk(pk=2).title, u'前端开发工程师') 126 | 127 | # 批量更改 128 | affect_rows = JobModel.update(filter_by={'city_id': 4}, values={'title': '测试'}) 129 | self.assertEqual(affect_rows, 2) 130 | jobs = JobModel.list(filter_by={'city_id': 4}) 131 | self.assertTrue(all([job.title == u'测试' for job in jobs])) 132 | 133 | def test_update_by_pk(self): 134 | affect_rows = JobModel.update_by_pk(pk=1, values={'title': '你好啊啊'}) 135 | self.assertEqual(affect_rows, 1) 136 | self.assertEqual(JobModel.get_by_pk(pk=1).title, u'你好啊啊') 137 | 138 | def test_execute_sql_string(self): 139 | job_rows = JobModel.execute_sql_string( 140 | 'SELECT id, title FROM job WHERE id = :id', {'id': 1}) 141 | self.assertEqual(len(job_rows), 1) 142 | self.assertEqual(job_rows[0][0], 1) 143 | self.assertEqual(job_rows[0][1], u'高级前端开发工程师') 144 | 145 | job_rows = JobModel.execute_sql_string('SELECT id, title FROM job') 146 | self.assertEqual(len(job_rows), 3) 147 | self.assertEqual(job_rows[0][0], 1) 148 | self.assertEqual(job_rows[0][1], u'高级前端开发工程师') 149 | 150 | affect_rows = JobModel.execute_sql_string( 151 | "UPDATE job SET title = '测试' WHERE id = :id", {'id': 1}) 152 | self.assertEqual(affect_rows, 1) 153 | job = JobModel.get_by_pk(pk=1) 154 | self.assertEqual(job.title, u'测试') 155 | 156 | def test_batch_add(self): 157 | 
# mix an instance of a different model class into the batch 158 | init_jobs_count = JobModel.count() 159 | model_instances = [CityModel(name='你好'), 160 | JobModel(title='招聘资深前端工程师', city_id=1, company_id=2, lg_job_id=100056), 161 | JobModel(title='招聘资深中端工程师', city_id=1, company_id=2, lg_job_id=100055), ] 162 | 163 | with self.assertRaises(ValueError): 164 | JobModel.batch_add(model_instances) 165 | 166 | self.assertEqual(JobModel.count(), init_jobs_count) 167 | 168 | model_instances = [JobModel(title='招聘资深前端工程师', city_id=1, company_id=2, lg_job_id=100056), 169 | JobModel(title='招聘资深中端工程师', city_id=1, company_id=2, lg_job_id=100055), ] 170 | 171 | JobModel.batch_add(model_instances) 172 | 173 | self.assertEqual(JobModel.count(), init_jobs_count + 2) 174 |
-------------------------------------------------------------------------------- /tests/test_utils/test_cache.py: --------------------------------------------------------------------------------
1 | # coding=utf-8 2 | import time 3 | from unittest import TestCase 4 | 5 | from webspider.utils.cache import simple_cache, cache_clear, redis_instance 6 | 7 | test_number = 0 8 | 9 | 10 | @simple_cache() 11 | def incr_then_return_test_number(keyword=None): 12 | global test_number 13 | test_number += 1 14 | return test_number 15 | 16 | 17 | @simple_cache(ex=1) 18 | def incr_then_return_test_number_with_ex(keyword=None): 19 | global test_number 20 | test_number += 1 21 | return test_number 22 | 23 | 24 | class TestClass(object): 25 | def __init__(self, name): 26 | self.name = name 27 | 28 | 29 | class TestUtilCache(TestCase): 30 | 31 | def setUp(self): 32 | keys = redis_instance.keys('*incr_then_return_test_number*') 33 | if keys: 34 | redis_instance.delete(*keys) 35 | 36 | keys = redis_instance.keys('*return_what_you_input*') 37 | if keys: 38 | redis_instance.delete(*keys) 39 | 40 | def test_simple_cache(self): 41 | """test basic caching""" 42 | global test_number 43 | test_number = 0 44 | self.assertEqual(1, incr_then_return_test_number('test')) 45 | self.assertEqual(1, incr_then_return_test_number('test')) 46 | self.assertEqual(2, incr_then_return_test_number('test_1')) 47 | self.assertEqual(2, incr_then_return_test_number('test_1')) 48 | self.assertEqual(3, incr_then_return_test_number('test_2')) 49 | 50 | with self.assertRaises(ValueError): 51 | incr_then_return_test_number(keyword='test') 52 | 53 | def test_simple_cache_with_ex(self): 54 | """test a cache entry with an expiry time""" 55 | global test_number 56 | test_number = 0 57 | self.assertEqual(1, incr_then_return_test_number_with_ex('test')) 58 | self.assertEqual(1, incr_then_return_test_number_with_ex('test')) 59 | time.sleep(1.1) 60 | self.assertEqual(2, incr_then_return_test_number_with_ex('test')) 61 | 62 | def test_cache_clear(self): 63 | """test clearing cached results""" 64 | global test_number 65 | test_number = 0 66 | self.assertEqual(1, incr_then_return_test_number('test')) 67 | self.assertEqual(2, incr_then_return_test_number('test_1')) 68 | # clear all cached results of the function 69 | cache_clear(incr_then_return_test_number) 70 | self.assertEqual(3, incr_then_return_test_number('test')) 71 | self.assertEqual(4, incr_then_return_test_number('test_1')) 72 | 73 | # clear only the cached result for one argument 74 | cache_clear(incr_then_return_test_number, 'test_1') 75 | self.assertEqual(3, incr_then_return_test_number('test')) 76 | self.assertEqual(5, incr_then_return_test_number('test_1')) 77 | 78 | def test_cache_class_instance(self): 79 | """test caching a class instance""" 80 | 81 | @simple_cache() 82 | def return_what_you_input(whatever): 83 | return whatever 84 | 85 | instance = TestClass('测试类实例') 86 | # first call: cache miss, the instance itself is returned and stored
instance = return_what_you_input(instance) 88 | # get result from redis 89 | cache_instance = return_what_you_input(instance) 90 | self.assertTrue(instance is not cache_instance) 91 | self.assertTrue(isinstance(cache_instance, TestClass)) 92 | self.assertEqual(cache_instance.name, '测试类实例') 93 | 94 | def tearDown(self): 95 | keys = redis_instance.keys('*incr_then_return_test_number*') 96 | if keys: 97 | redis_instance.delete(*keys) 98 | 99 | keys = redis_instance.keys('*return_what_you_input*') 100 | if keys: 101 | redis_instance.delete(*keys) 102 |
-------------------------------------------------------------------------------- /tests/test_utils/test_classproperty.py: --------------------------------------------------------------------------------
1 | # coding=utf-8 2 | from unittest import TestCase 3 | 4 | from webspider.utils.classproperty import classproperty 5 | 6 | 7 | class TestClass(object): 8 | _name = '阿河' 9 | 10 | @classproperty 11 | def name(cls): 12 | return cls._name 13 | 14 | 15 | class TestUtilClassProperty(TestCase): 16 | def test_read_class_property(self): 17 | self.assertEqual(TestClass.name, '阿河') 18 |
-------------------------------------------------------------------------------- /tests/test_utils/test_common.py: --------------------------------------------------------------------------------
1 | # coding=utf-8 2 | from unittest import TestCase 3 | 4 | from webspider.utils.common import get_key_from_dict_by_value, get_field_statistics 5 | 6 | 7 | class TestUtilCommon(TestCase): 8 | def test_get_key_from_dict_by_value(self): 9 | dictionary = { 10 | '全国': 1, 11 | '北京': 2, 12 | '广州': 3, 13 | } 14 | key = get_key_from_dict_by_value(1, dictionary) 15 | self.assertEqual(key, '全国') 16 | 17 | # no key 18 | with self.assertRaises(ValueError): 19 | get_key_from_dict_by_value(4, dictionary) 20 | 21 | dictionary = { 22 | '全国': 1, 23 | '北京': 1, 24 | '广州': 3, 25 | } 26 | key = get_key_from_dict_by_value(3, dictionary) 27 | self.assertEqual(key, '广州') 28 | # multi key 29 | with self.assertRaises(AttributeError): 30 | get_key_from_dict_by_value(1, dictionary) 31 | 32 | def test_get_field_statistics(self): 33 | statistics = get_field_statistics([0, 0, 0, 1, 1], {'男': 0, '女': 1, '不明': 2}) 34 | self.assertDictEqual(statistics, {'男': 3, '女': 2}) 35 |
-------------------------------------------------------------------------------- /tests/test_utils/test_convert.py: --------------------------------------------------------------------------------
1 | # coding=utf-8 2 | from unittest import TestCase 3 | 4 | from webspider.constants import WORK_YEARS_REQUEST_DICT, JOB_NATURE_DICT, COMPANY_SIZE_DICT 5 | from webspider.utils.convert import convert_dict_field_to_constants, convert_field_to_constants 6 | 7 | 8 | class TestUtilConvert(TestCase): 9 | def test_convert_dict_field_to_constants(self): 10 | init_dict = { 11 | 'work_year': '应届毕业生', 12 | 'size': '没有人', 13 | 'nature': '全职', 14 | 'name': '沙师弟', 15 | 'id': 3, 16 | 'value': None 17 | } 18 | convert_dict_field_to_constants(init_dict) 19 | self.assertDictEqual(init_dict, { 20 | 'work_year': WORK_YEARS_REQUEST_DICT['应届毕业生'], 21 | 'size': COMPANY_SIZE_DICT['unknown'], 22 | 'nature': JOB_NATURE_DICT['全职'], 23 | 'name': '沙师弟', 24 | 'id': 3, 25 | 'value': None 26 | }) 27 | 28 | def test_convert_field_to_constants(self): 29 | constant_value = convert_field_to_constants(field_name='work_year', field_value='应届毕业生') 30 | self.assertEqual(constant_value, 
WORK_YEARS_REQUEST_DICT['应届毕业生']) 31 | 32 | constant_value = convert_field_to_constants(field_name='work_year', field_value='家里蹲') 33 | self.assertEqual(constant_value, WORK_YEARS_REQUEST_DICT['unknown']) 34 | 35 | with self.assertRaises(ValueError): 36 | convert_field_to_constants(field_name='dinner', field_value='牛肉饭') 37 | -------------------------------------------------------------------------------- /tests/test_utils/test_http_tools.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | from unittest import TestCase, mock 3 | 4 | from webspider.utils.http_tools import generate_http_request_headers, requests_get, requests_post 5 | 6 | 7 | class TestUtilHttpTools(TestCase): 8 | def test_generate_http_request_headers(self): 9 | header = generate_http_request_headers() 10 | self.assertTrue(isinstance(header, dict)) 11 | 12 | header = generate_http_request_headers(referer='https://www.zhihu.com') 13 | self.assertEqual(header['Referer'], 'https://www.zhihu.com') 14 | 15 | @mock.patch('requests.get') 16 | def test_request_get(self, mock_get): 17 | mock_get.return_value = '200' 18 | response = requests_get(url='https://baidu.com', need_sleep=False) 19 | self.assertEqual(response, '200') 20 | 21 | @mock.patch('requests.post') 22 | def test_request_post(self, mock_post): 23 | mock_post.return_value = '200' 24 | response = requests_post(url='https://baidu.com', need_sleep=False) 25 | self.assertEqual(response, '200') 26 | -------------------------------------------------------------------------------- /tests/test_utils/test_pagination.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | from unittest import TestCase 3 | 4 | from webspider.utils.pagination import Pagination 5 | 6 | 7 | class TestUtilPagination(TestCase): 8 | def test_pagination(self): 9 | pagination = Pagination(page=2, total=20, per_page=6) 10 | self.assertEqual(pagination.pages, 4) 11 | self.assertEqual(pagination.prev_num, 1) 12 | self.assertEqual(pagination.has_prev, True) 13 | self.assertEqual(pagination.next_num, 3) 14 | self.assertEqual(pagination.has_next, True) 15 | self.assertEqual([page for page in pagination.iter_pages], [1, 2, 3, 4]) 16 | 17 | def test_pagination_no_pages(self): 18 | pagination = Pagination(page=2, total=20, per_page=0) 19 | self.assertEqual(pagination.pages, 0) 20 | 21 | def test_pagination_no_pre(self): 22 | pagination = Pagination(page=1, total=20, per_page=6) 23 | self.assertEqual(pagination.has_prev, False) 24 | self.assertEqual(pagination.prev_num, None) 25 | self.assertEqual(pagination.has_next, True) 26 | self.assertEqual(pagination.next_num, 2) 27 | 28 | def test_pagination_no_next(self): 29 | pagination = Pagination(page=4, total=20, per_page=6) 30 | self.assertEqual(pagination.has_prev, True) 31 | self.assertEqual(pagination.prev_num, 3) 32 | self.assertEqual(pagination.has_next, False) 33 | self.assertEqual(pagination.next_num, None) 34 | -------------------------------------------------------------------------------- /tests/test_utils/test_text.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | from unittest import TestCase 3 | 4 | from webspider.utils.text import to_plaintext 5 | 6 | 7 | class TestUtilText(TestCase): 8 | def test_to_plaintext(self): 9 | init_text = '
abcd \n ' 10 | self.assertEqual(to_plaintext(content=init_text, strip=False), 'abcd ') 11 | 12 | init_text = '
abcd \n ' 13 | self.assertEqual(to_plaintext(content=init_text, strip=True), 'abcd') 14 | 15 | init_text = '
abcd \n ' 16 | self.assertEqual(to_plaintext(content=init_text, pattern=u'a|b', strip=False), 'cd \n ') 17 |
-------------------------------------------------------------------------------- /tests/test_utils/test_time_tools.py: --------------------------------------------------------------------------------
1 | # coding=utf-8 2 | from datetime import datetime 3 | from unittest import TestCase 4 | 5 | from webspider.utils.time_tools import (datetime_to_timestamp, timestamp_to_datetime, timestamp_to_datetime_str) 6 | 7 | 8 | class TestUtilTimeTools(TestCase): 9 | def test_datetime_to_timestamp(self): 10 | datetime_obj = datetime(year=2017, month=5, day=10) 11 | timestamp = datetime_to_timestamp(datetime_obj) 12 | self.assertEqual(int(datetime_obj.timestamp()), timestamp) 13 | 14 | def test_timestamp_to_datetime(self): 15 | timestamp = int(datetime(year=2017, month=5, day=10).timestamp()) 16 | datetime_obj = timestamp_to_datetime(timestamp=timestamp) 17 | self.assertEqual(datetime_obj.isoformat(), '2017-05-10T00:00:00') 18 | 19 | def test_timestamp_to_datetime_str(self): 20 | timestamp = int(datetime(year=2017, month=5, day=10).timestamp()) 21 | datetime_str = timestamp_to_datetime_str(ts=timestamp) 22 | self.assertEqual(datetime_str, '2017-05-10') 23 | 24 | timestamp = int(datetime(year=2018, month=2, day=1, hour=19, minute=46, second=57).timestamp()) 25 | datetime_str = timestamp_to_datetime_str(ts=timestamp, time_format='%Y/%m/%d %H:%M:%S') 26 | self.assertEqual(datetime_str, '2018/02/01 19:46:57') 27 |
-------------------------------------------------------------------------------- /tests/test_web/base.py: --------------------------------------------------------------------------------
1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | import logging 4 | from urllib.parse import urlencode 5 | 6 | from tornado.testing import AsyncHTTPTestCase 7 | from tornado.escape import json_encode, json_decode 8 | 9 | from webspider.utils.sql import get_session 10 | from webspider.web.app import make_web_app 11 | from tests.util import create_test_db, drop_test_db 12 | 13 | logger = logging.getLogger(__file__) 14 | 15 | 16 | class BaseHandlerTestCase(AsyncHTTPTestCase): 17 | session = get_session() 18 | 19 | def setUp(self): 20 | create_test_db(self.session) 21 | super(BaseHandlerTestCase, self).setUp() 22 | 23 | def tearDown(self): 24 | drop_test_db(self.session) 25 | super(BaseHandlerTestCase, self).tearDown() 26 | 27 | def get_app(self): 28 | return make_web_app() 29 | 30 | def request(self, method, url, headers=None, data=None, json=None, form=None, **kwargs): 31 | if not headers: 32 | headers = {} 33 | 34 | if json is not None: 35 | headers['Content-Type'] = 'application/json' 36 | data = json_encode(json) 37 | 38 | elif form is not None: 39 | headers['Content-Type'] = 'application/x-www-form-urlencoded' 40 | data = urlencode(form) 41 | 42 | response = self.fetch(url, method=method, headers=headers, body=data, allow_nonstandard_methods=True, 43 | **kwargs) 44 | 45 | if response.code // 100 != 2: 46 | logger.error(response.body) 47 | 48 | return response 49 | 50 | def get(self, url, **kwargs): 51 | return self.request(url=url, method="GET", **kwargs) 52 | 53 | def post(self, url, **kwargs): 54 | return self.request(url=url, method="POST", **kwargs) 55 | 56 | def put(self, url, **kwargs): 57 | return self.request(url=url, method="PUT", **kwargs) 58 | 59 | def fetch_json(self, path, **kwargs): 60 | response = self.request('GET', path, **kwargs) 61 | if response.code // 100 != 2: 62 | raise 
-------------------------------------------------------------------------------- /tests/test_web/test_formatter.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | from unittest import TestCase 3 | 4 | from webspider.exceptions import DowngradeException 5 | from webspider.web.formatter.base import Field, Downgrade, Formatter 6 | 7 | """ 8 | Prepare the test fixtures 9 | """ 10 | 11 | 12 | class SampleFormatter(Formatter): 13 | FIELDS = [ 14 | Field('name', converter=lambda name: 'Mr.' + name), 15 | Field('value', converter=lambda value: int(value), downgrade=Downgrade(0)), 16 | Field('count'), 17 | ] 18 | 19 | 20 | class TestModel(object): 21 | def __init__(self, name=None, value=None, count=None): 22 | self.name = name 23 | self.value = value 24 | self.count = count 25 | 26 | 27 | class TestModelB(object): 28 | pass 29 | 30 | 31 | formatter_mappings = { 32 | TestModel: SampleFormatter 33 | } 34 | 35 | """end""" 36 | 37 | 38 | class TestFormatter(TestCase): 39 | 40 | def test_register_formatter(self): 41 | Formatter.register_formatter(formatter_mappings) 42 | self.assertDictContainsSubset(formatter_mappings, Formatter._FORMATTER_MAPS) 43 | 44 | def test_get_formatter(self): 45 | Formatter.register_formatter(formatter_mappings) 46 | 47 | formatter = Formatter.get_formatter(TestModel) 48 | self.assertTrue(formatter is formatter_mappings[TestModel]) 49 | 50 | formatter = Formatter.get_formatter(TestModel()) 51 | self.assertTrue(formatter is formatter_mappings[TestModel]) 52 | 53 | formatter = Formatter.get_formatter(TestModelB) 54 | self.assertTrue(formatter is None) 55 | 56 | def test_downgrade(self): 57 | # the converter fails on '10a', so the field downgrades to 0 58 | Formatter.register_formatter(formatter_mappings) 59 | test_model = TestModel(name='He', value='10a', count=100) 60 | format_result = Formatter.format(test_model) 61 | self.assertDictEqual(format_result, { 62 | 'name': 'Mr.He', 63 | 'value': 0, 64 | 'count': 100 65 | }) 66 | 67 | def test_field(self): 68 | with self.assertRaises(DowngradeException): 69 | Field(name='hi', downgrade=0) 70 | 71 | def test_format(self): 72 | Formatter.register_formatter(formatter_mappings) 73 | 74 | test_model = TestModel(name='He', value='10', count=100) 75 | format_result = Formatter.format(test_model) 76 | self.assertDictEqual(format_result, { 77 | 'name': 'Mr.He', 78 | 'value': 10, 79 | 'count': 100 80 | }) 81 | 82 | # formatting a list of models 83 | test_models = [TestModel(name='He', value='10', count=100), 84 | TestModel(name='Wei', value='20', count=1)] 85 | format_result = Formatter.format(test_models) 86 | self.assertDictEqual(format_result[0], { 87 | 'name': 'Mr.He', 88 | 'value': 10, 89 | 'count': 100 90 | }) 91 | self.assertDictEqual(format_result[1], { 92 | 'name': 'Mr.Wei', 93 | 'value': 20, 94 | 'count': 1 95 | }) 96 | 97 | # nested formatting 98 | test_models = TestModel(name='He', value='10', count=TestModel(name='child', value='20', count=1)) 99 | format_result = Formatter.format(test_models) 100 | self.assertDictEqual(format_result, { 101 | 'name': 'Mr.He', 102 | 'value': 10, 103 | 'count': { 104 | 'name': 'Mr.child', 105 | 'value': 20, 106 | 'count': 1, 107 | } 108 | }) 109 |
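The source of webspider/web/formatter/base.py is not included in this excerpt. As a reading aid, here is a minimal sketch, reconstructed only from the behaviour the tests above assert; it is an assumption, not the project's actual implementation:

from webspider.exceptions import DowngradeException


class Downgrade(object):
    """Fallback value used when a field's converter raises."""

    def __init__(self, default):
        self.default = default


class Field(object):
    def __init__(self, name, converter=None, downgrade=None):
        # test_field expects a DowngradeException for a non-Downgrade argument
        if downgrade is not None and not isinstance(downgrade, Downgrade):
            raise DowngradeException(debug_message='downgrade must be a Downgrade instance')
        self.name = name
        self.converter = converter
        self.downgrade = downgrade


class Formatter(object):
    FIELDS = []
    _FORMATTER_MAPS = {}

    @classmethod
    def register_formatter(cls, mappings):
        cls._FORMATTER_MAPS.update(mappings)

    @classmethod
    def get_formatter(cls, model):
        # accepts either a model class or a model instance
        key = model if isinstance(model, type) else type(model)
        return cls._FORMATTER_MAPS.get(key)

    @classmethod
    def format(cls, obj):
        if isinstance(obj, list):
            return [cls.format(item) for item in obj]
        result = {}
        for field in cls.get_formatter(obj).FIELDS:
            value = getattr(obj, field.name)
            if cls.get_formatter(value) is not None:
                value = cls.format(value)  # nested registered model
            elif field.converter is not None:
                try:
                    value = field.converter(value)
                except Exception:
                    if field.downgrade is None:
                        raise
                    value = field.downgrade.default  # downgrade on converter failure
            result[field.name] = value
        return result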
-------------------------------------------------------------------------------- /tests/test_web/test_keyword_statistic.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | from datetime import datetime 3 | 4 | from tornado.escape import json_decode 5 | 6 | from tests.test_web.base import BaseHandlerTestCase 7 | from webspider.utils.time_tools import datetime_to_timestamp 8 | 9 | predictive_keyword_statistic_dict = { 10 | 'educations': {'不限': 1, '大专': 2, '本科': 4, '硕士': 5, '博士': 6, 'unknown': 7}, 11 | 'city_jobs_count': {'北京': 8, '深圳': 9, '广州': 10}, 12 | 'salary': {'10k以下': 11, '11k-20k': 12, '21k-35k': 13, '36k-60k': 14, '61k以上': 15}, 13 | 'financing_stage': {'未融资': 16, '天使轮': 17, 'A轮': 18, 'B轮': 19, 'C轮': 20, 14 | 'D轮及以上': 21, '上市公司': 22, '不需要融资': 23, 'unknown': 24}, 15 | 'work_years': {'不限': 25, '应届毕业生': 26, '1年以下': 27, '1-3年': 28, '3-5年': 29, 16 | '5-10年': 30, '10年以上': 31, 'unknown': 32}, 17 | 'per_day_jobs_count': [ 18 | { 19 | 'date': 20180128, 'all_city': 576, 'beijing': 198, 'guangzhou': 35, 'shenzhen': 93, 'shanghai': 80, 20 | 'hangzhou': 41, 'chengdu': 26, 21 | 'created_at': datetime_to_timestamp(datetime.strptime('2018-01-28 17:01:04', '%Y-%m-%d %H:%M:%S')), 22 | 'updated_at': datetime_to_timestamp(datetime.strptime('2018-01-28 17:01:04', '%Y-%m-%d %H:%M:%S')) 23 | }, 24 | { 25 | 'date': 20180129, 'all_city': 580, 'beijing': 200, 'guangzhou': 36, 'shenzhen': 100, 'shanghai': 82, 26 | 'hangzhou': 44, 'chengdu': 30, 27 | 'created_at': datetime_to_timestamp(datetime.strptime('2018-01-28 17:01:04', '%Y-%m-%d %H:%M:%S')), 28 | 'updated_at': datetime_to_timestamp(datetime.strptime('2018-01-28 17:01:04', '%Y-%m-%d %H:%M:%S')) 29 | }], 30 | 'created_at': datetime_to_timestamp(datetime.strptime('2018-02-01 19:01:44', '%Y-%m-%d %H:%M:%S')), 31 | 'updated_at': datetime_to_timestamp(datetime.strptime('2018-02-05 01:01:48', '%Y-%m-%d %H:%M:%S')), 32 | } 33 | 34 | 35 | class TestKeywordStatisticsApiHandler(BaseHandlerTestCase): 36 | 37 | def test_get(self): 38 | response = self.fetch_json('/api/statistics?keyword_name=python') 39 | self.assertDictEqual(predictive_keyword_statistic_dict, response) 40 | 41 | def test_get_when_error(self): 42 | response = self.get('/api/statistics') 43 | self.assertEqual(response.code, 404) 44 | predictive_response_content = { 45 | u"error": { 46 | u"message": u"请输入关键词", 47 | u"code": 4041, 48 | u"name": u"ResourceNotFoundWebException", 49 | u'data': '', 50 | u'debug_message': '', 51 | } 52 | } 53 | self.assertDictEqual(predictive_response_content, json_decode(response.body)) 54 | 55 | response = self.get('/api/statistics?keyword_name=种田') 56 | self.assertEqual(response.code, 404) 57 | predictive_response_content = { 58 | u"error": { 59 | u"message": u"找不到该关键词", 60 | u"code": 4041, 61 | u"name": u"ResourceNotFoundWebException", 62 | u'data': '', 63 | u'debug_message': '', 64 | } 65 | } 66 | self.assertDictEqual(predictive_response_content, json_decode(response.body)) 67 | 68 | response = self.get('/api/statistics?keyword_name=java') 69 | self.assertEqual(response.code, 404) 70 | predictive_response_content = { 71 | u"error": { 72 | u"message": u"暂无该关键词的统计结果", 73 | u"code": 4041, 74 | u"name": u"ResourceNotFoundWebException", 75 | u'data': '', 76 | u'debug_message': '', 77 | } 78 | } 79 | self.assertDictEqual(predictive_response_content, json_decode(response.body)) 80 | 81 | 82 | class TestKeywordStatisticsPageHandler(BaseHandlerTestCase): 83 | 84 | def test_get(self): 85 | response = self.get('/statistics?keyword_name=python') 86 | self.assertEqual(response.code, 200) 87 | 88 | def test_get_when_error(self): 89 | response = self.get('/api/statistics') 90 | self.assertEqual(response.code, 404) 91 | 92 | response = self.get('/api/statistics?keyword_name=种田') 93 |
self.assertEqual(response.code, 404) 94 | 95 | response = self.get('/api/statistics?keyword_name=java') 96 | self.assertEqual(response.code, 404) 97 |
-------------------------------------------------------------------------------- /tests/util.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | import os 3 | 4 | from sqlalchemy import text 5 | 6 | 7 | def execute_sql_file(file_paths, db_session, predictive_db_name=''): 8 | if predictive_db_name: 9 | assert get_current_database_name(db_session) == predictive_db_name 10 | for file_path in file_paths: 11 | with open(file_path, 'r') as sql_file: 12 | sql_command = '' 13 | 14 | for line in sql_file: 15 | if not line.startswith('--'): 16 | sql_command += line.strip('\n') 17 | 18 | if sql_command.endswith(';'): 19 | db_session.execute(text(sql_command)) 20 | db_session.flush() 21 | sql_command = '' 22 | 23 | 24 | def get_current_database_name(db_session): 25 | return db_session.execute('select database();').scalar() 26 | 27 | 28 | def create_test_db(session, db_name='test_spider'): 29 | """Load the test database.""" 30 | # drop any leftover test database 31 | drop_test_db(session) 32 | # create the test database 33 | session.execute("CREATE DATABASE {db_name} CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci;".format( 34 | db_name=db_name)) 35 | # switch to the test database 36 | session.execute("USE {db_name};".format(db_name=db_name)) 37 | 38 | path = os.path.dirname(__file__) 39 | # create the tables 40 | execute_sql_file( 41 | file_paths=[os.path.join(path, "schema.sql"), ], 42 | db_session=session, 43 | predictive_db_name=db_name 44 | ) 45 | fixture_path = os.path.join(path, 'fixture') 46 | # load the fixture data 47 | fixture_file_paths = [os.path.join(fixture_path, file) for file in os.listdir(fixture_path)] 48 | execute_sql_file( 49 | file_paths=fixture_file_paths, 50 | db_session=session, 51 | predictive_db_name=db_name 52 | ) 53 | assert get_current_database_name(session) == db_name 54 | 55 | 56 | def drop_test_db(session, db_name='test_spider'): 57 | # drop the test database if it exists 58 | session.execute("DROP DATABASE IF EXISTS {db_name};".format(db_name=db_name)) 59 |
-------------------------------------------------------------------------------- /webspider/__init__.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | __version__ = '0.0.2' 3 |
-------------------------------------------------------------------------------- /webspider/constants.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # flake8: noqa 3 | import os 4 | 5 | """ 6 | Job types 7 | """ 8 | 9 | 10 | class LGJobType(object): 11 | all = '全部' 12 | technology = '技术' 13 | product = '产品' 14 | design = '设计' 15 | operation = '运营' 16 | sell_and_market = '市场与销售' 17 | function = '职能' 18 | 19 | 20 | """ 21 | Company finance stages 22 | """ 23 | FINANCE_STAGE_DICT = { 24 | 'unknown': 0, 25 | '未融资': 1, 26 | '天使轮': 2, 27 | 'A轮': 3, 28 | 'B轮': 4, 29 | 'C轮': 5, 30 | 'D轮及以上': 6, 31 | '上市公司': 7, 32 | '不需要融资': 8, 33 | } 34 | 35 | """ 36 | Job natures 37 | """ 38 | JOB_NATURE_DICT = { 39 | 'unknown': 0, 40 | '全职': 1, 41 | '兼职': 2, 42 | '实习': 3, 43 | } 44 | 45 | """ 46 | Work-experience requirements 47 | """ 48 | WORK_YEARS_REQUEST_DICT = { 49 | 'unknown': 0, 50 | '不限': 1, 51 | '应届毕业生': 2, 52 | '1年以下': 3, 53 | '1-3年': 4, 54 | '3-5年': 5, 55 | '5-10年': 6, 56 | '10年以上': 7, 57 | } 58 | 59 | """ 60 | Education requirements 61 | """ 62 | EDUCATION_REQUEST_DICT = { 63 | 'unknown': 0, 64 | '不限': 1, 65 | '大专': 2, 66 | '本科': 3, 67 | '硕士': 4, 68 | '博士': 5, 69 | } 70 |
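These dicts map human-readable labels to the integer codes stored on the models; utils/common.py (not shown in this excerpt) exposes a get_field_statistics helper that turns stored codes back into labelled counts. A plausible sketch of such a helper, assumed rather than copied from the source:

from collections import Counter


def get_field_statistics(values, constants_dict):
    """values: iterable of stored integer codes; constants_dict: {label: code} as above."""
    code_to_label = {code: label for label, code in constants_dict.items()}
    statistics = Counter()
    for value in values:
        # codes without a label fall back to the 'unknown' bucket every dict defines
        statistics[code_to_label.get(value, 'unknown')] += 1
    return statistics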
71 | """ 72 | Company sizes 73 | """ 74 | COMPANY_SIZE_DICT = { 75 | 'unknown': 0, 76 | '少于15人': 1, 77 | '15-50人': 2, 78 | '50-150人': 3, 79 | '150-500人': 4, 80 | '500-2000人': 5, 81 | '2000人以上': 6, 82 | } 83 | 84 | """ 85 | Other constants 86 | """ 87 | 88 | DEBUG = (os.environ.get('ENV', 'dev') == 'dev') 89 | 90 | SECONDS_OF_DAY = 60 * 60 * 24 91 | 92 | REQUEST_TIMEOUT = 4 93 | 94 | # minimum crawler sleep time in seconds 95 | MIN_SLEEP_SECS = 3 96 | 97 | # maximum crawler sleep time in seconds 98 | MAX_SLEEP_SECS = 5 99 | 100 | """ 101 | Redis keys 102 | """ 103 | 104 | CRAWLED_COMPANY_JOBS_REDIS_KEY = 'crawled_company_jobs_{lg_company_id}' 105 | 106 | """ 107 | Field length limits 108 | """ 109 | COMPANY_INTRODUCE_MAX_LEN = 2048 110 | COMPANY_ADVANTAGE_MAX_LEN = 256 111 | JOB_DESCRIPTION_MAX_LEN = 2048 112 | JOB_ADVANTAGE_MAX_LEN = 256 113 | 114 | """ 115 | Retry settings 116 | """ 117 | # maximum number of attempts before retrying stops 118 | RETRY_TIMES = 3 119 | # overall deadline for the retried call, in milliseconds 120 | STOP_MAX_DELAY = 1000 * 30 121 | # fixed wait between two retries, in milliseconds 122 | WAIT_FIXED = 1000 * 2 123 | 124 | """ 125 | HTTP settings (the real values are redacted to reduce load on the lg site) 126 | """ 127 | HTTP_HEADER = {} 128 | 129 | USER_AGENT_LIST = ['for_test'] 130 | 131 | """ 132 | Target pages (the real values are redacted to reduce load on the lg site) 133 | """ 134 | 135 | JOB_JSON_URL = '' 136 | 137 | JOB_DETAIL_URL = '' 138 | 139 | COMPANY_DETAIL_URL = '' 140 | 141 | ALL_CITY_URL = '' 142 | 143 | COMPANIES_URL = '' 144 | 145 | COMPANY_JOBS_URL = '' 146 | 147 | # COMPANIES_URL sort field 148 | SORTED_BY_JOBS_COUNT = 1 149 | 150 | # production and personal dev environments load the real constant values 151 | if os.environ.get('ENV', '') in ('production', 'dev'): 152 | from webspider.security_constants import * 153 |
-------------------------------------------------------------------------------- /webspider/controllers/__init__.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 |
-------------------------------------------------------------------------------- /webspider/controllers/city_ctl.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | 3 | from sqlalchemy.exc import IntegrityError 4 | 5 | from webspider.models.city import CityModel 6 | 7 | 8 | def get_city_id_by_name(name): 9 | city = CityModel.get_one(filter_by={'name': name}) 10 | if not city: 11 | raise ValueError('Get None when city name is {}'.format(name)) 12 | return city.id 13 | 14 | 15 | def insert_city_if_not_exist(name): 16 | if CityModel.is_exist(filter_by={'name': name}): 17 | return 18 | try: 19 | city_id = CityModel.add(name=name) 20 | return city_id 21 | except IntegrityError: 22 | pass 23 | 24 | 25 | def get_city_name_dict(): 26 | """ 27 | :return: dict{city_name: city_id, ....} eg: {'北京': 2, '上海':3, ......} 28 | """ 29 | cities = CityModel.list() 30 | return {city.name: city.id for city in cities} 31 |
-------------------------------------------------------------------------------- /webspider/controllers/industry_ctl.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | from sqlalchemy.exc import IntegrityError 3 | 4 | from webspider.models.industry import IndustryModel 5 | 6 | 7 | def insert_industry_if_not_exist(name): 8 | if IndustryModel.is_exist(filter_by={'name': name}): 9 | return 10 | try: 11 | industry_id = IndustryModel.add(name=name) 12 | return industry_id 13 | except IntegrityError: 14 | pass 15 | 16 | 17 | def get_industry_id_by_name(name): 18 | industry = IndustryModel.get_one(filter_by={'name': name}) 19 | if not industry: 20 | raise ValueError('Get None when industry name is {}'.format(name)) 21 | return industry.id 22 |
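Usage sketch for the two controllers above (the values are illustrative): the is_exist pre-check handles the common case cheaply, and the swallowed IntegrityError keeps concurrent workers race-safe, assuming a unique index on name.

from webspider.controllers import city_ctl

city_ctl.insert_city_if_not_exist('北京')  # no-op if the row already exists
city_id = city_ctl.get_city_id_by_name('北京')

Without the IntegrityError guard, two workers could both pass the is_exist check and then race on the insert; with it, the loser of the race simply treats the row as already present.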
-------------------------------------------------------------------------------- /webspider/controllers/job_ctl.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import re 3 | 4 | 5 | def get_salary_section(string): 6 | """ 7 | e.g.: 8 | 15k-25k -> (15, 25) 9 | 15k以上 -> (15, 20) 10 | 15k以下 -> (10, 15) 11 | :param string: 15k-25k 12 | :return: 15,25 13 | """ 14 | pattern = r'K|k|以上|以下' 15 | replace_char = '' 16 | 17 | if string.find('-') != -1: 18 | string = re.sub(pattern=pattern, repl=replace_char, string=string) 19 | start, end = string.split('-') 20 | elif string.endswith('以下'): 21 | string = re.sub(pattern=pattern, repl=replace_char, string=string) 22 | start, end = int(string) - 5 if int(string) - 5 >= 0 else 1, string 23 | elif string.endswith('以上'): 24 | string = re.sub(pattern=pattern, repl=replace_char, string=string) 25 | start, end = string, int(string) + 5 26 | else: 27 | raise ValueError('unrecognized salary string: ' + string) 28 | 29 | return int(start), int(end) 30 |
-------------------------------------------------------------------------------- /webspider/controllers/job_keyword_ctl.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from sqlalchemy import func 3 | 4 | from webspider.models.job_keyword import JobKeywordModel 5 | 6 | 7 | def get_most_frequently_keyword_ids(limit=None, offset=None): 8 | """ 9 | Get the ids of the most frequently occurring keywords 10 | :param limit: 11 | :param offset: 12 | :return: list of keyword ids 13 | :rtype: List[int] 14 | """ 15 | result = JobKeywordModel.list(columns=JobKeywordModel.keyword_id, group_by=JobKeywordModel.keyword_id, 16 | order_by=func.count(JobKeywordModel.id).desc(), limit=limit, offset=offset) 17 | return [item[0] for item in result] 18 |
-------------------------------------------------------------------------------- /webspider/controllers/keyword_ctl.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | 3 | from sqlalchemy.exc import IntegrityError 4 | 5 | from webspider.models.keyword import KeywordModel 6 | 7 | 8 | def insert_keyword_if_not_exist(name): 9 | if KeywordModel.is_exist(filter_by={'name': name}): 10 | return 11 | try: 12 | keyword_id = KeywordModel.add(name=name) 13 | return keyword_id 14 | except IntegrityError: 15 | pass 16 | 17 | 18 | def get_keyword_name_by_id(keyword_id): 19 | keyword = KeywordModel.get_by_pk(keyword_id) 20 | if not keyword: 21 | raise ValueError('Get None when keyword id is {}'.format(keyword_id)) 22 | return keyword.name 23 | 24 | 25 | def get_keyword_id_by_name(name): 26 | keyword = KeywordModel.get_one(filter_by={'name': name}) 27 | if not keyword: 28 | raise ValueError('Get None when keyword name is {}'.format(name)) 29 | return keyword.id 30 |
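A worked example for get_salary_section from job_ctl.py above: raw salary strings parse into (start, end) ranges in units of 1k CNY, with open-ended strings widened by 5.

from webspider.controllers.job_ctl import get_salary_section

assert get_salary_section('15k-25k') == (15, 25)
assert get_salary_section('15k以上') == (15, 20)  # open-ended top: end = start + 5
assert get_salary_section('15k以下') == (10, 15)  # open-ended bottom: start = end - 5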
-------------------------------------------------------------------------------- /webspider/controllers/keyword_statistic_ctl.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from collections import Counter 3 | 4 | from webspider import utils 5 | from webspider import constants 6 | from webspider.models import CompanyModel 7 | from webspider.controllers import city_ctl, job_ctl 8 | 9 | 10 | def get_salary_statistic(jobs): 11 | """ 12 | Salary distribution statistics 13 | 14 | :param jobs: webspider.models.JobModel instances list 15 | :return: collections.Counter 16 | """ 17 | salary_statistic = Counter() 18 | for job in jobs: 19 | start_salary, end_salary = job_ctl.get_salary_section(job.salary) 20 | if start_salary <= 10: 21 | salary_statistic['10k及以下'] += 1 22 | if start_salary <= 20 and end_salary >= 11: 23 | salary_statistic['11k-20k'] += 1 24 | if start_salary <= 35 and end_salary >= 21: 25 | salary_statistic['21k-35k'] += 1 26 | if start_salary <= 60 and end_salary >= 36: 27 | salary_statistic['36k-60k'] += 1 28 | if end_salary >= 61: 29 | salary_statistic['61k以上'] += 1 30 | return salary_statistic 31 | 32 | 33 | def get_finance_stage_statistic(jobs): 34 | """ 35 | Finance-stage statistics for the companies offering the given jobs 36 | 37 | :param jobs: webspider.models.JobModel instances list 38 | :return: collections.Counter 39 | """ 40 | company_ids = [job.company_id for job in jobs] 41 | companies = CompanyModel.list(filter=CompanyModel.id.in_(company_ids)) 42 | 43 | finance_stage_statistic = utils.common.get_field_statistics(values=[company.finance_stage for company in companies], 44 | constants_dict=constants.FINANCE_STAGE_DICT) 45 | return finance_stage_statistic 46 | 47 | 48 | def get_educations_statistic(jobs): 49 | """ 50 | Education-requirement statistics 51 | 52 | :param jobs: webspider.models.JobModel instances list 53 | :return: collections.Counter 54 | """ 55 | return utils.common.get_field_statistics(values=[job.education for job in jobs], 56 | constants_dict=constants.EDUCATION_REQUEST_DICT) 57 | 58 | 59 | def get_work_years_statistic(jobs): 60 | """ 61 | Work-experience-requirement statistics 62 | 63 | :param jobs: webspider.models.JobModel instances list 64 | :return: collections.Counter 65 | """ 66 | return utils.common.get_field_statistics(values=[job.work_year for job in jobs], 67 | constants_dict=constants.WORK_YEARS_REQUEST_DICT) 68 | 69 | 70 | def get_city_jobs_count_statistic(jobs, limit=10): 71 | """ 72 | Per-city job-count statistics 73 | :param jobs: webspider.models.JobModel instances list 74 | :param limit: keep only the top `limit` cities by job count 75 | :return: collections.Counter 76 | """ 77 | city_name_dict = city_ctl.get_city_name_dict() 78 | city_job_count = utils.common.get_field_statistics(values=[job.city_id for job in jobs], 79 | constants_dict=city_name_dict) 80 | city_job_count = sorted(city_job_count.items(), key=lambda x: x[1], reverse=True) 81 | if limit: 82 | city_job_count = city_job_count[:limit] 83 | return Counter({item[0]: item[1] for item in city_job_count}) 84 |
-------------------------------------------------------------------------------- /webspider/crawlers/__init__.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | from webspider.crawlers.lagou_cites import get_cites_from_lg 3 | from webspider.crawlers.lagou_companies import (get_companies_pagination_from_lg, get_companies_from_lg, 4 | get_company_detail_from_lg, clean_lg_company_data, ) 5 | from webspider.crawlers.lagou_jobs import (get_jobs_pagination_from_lg, get_jobs_from_lg, 6 | get_job_detail_from_lg, clean_lg_job_data, ) 7 | from webspider.crawlers.lagou_jobs_count import get_jobs_count_from_lg 8 | 9 | __all__ = ['get_cites_from_lg', 'get_companies_pagination_from_lg', 'get_companies_from_lg', 10 | 'get_company_detail_from_lg', 'clean_lg_company_data', 'get_jobs_pagination_from_lg', 11 | 'get_jobs_from_lg', 'get_job_detail_from_lg', 'clean_lg_job_data', 'get_jobs_count_from_lg'] 12 |
-------------------------------------------------------------------------------- /webspider/crawlers/lagou_cites.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | import re 3 | import logging 4 | 5 | import requests 6 | from lxml import etree 7 | from tornado.util import ObjectDict 8
| 9 | from webspider import constants 10 | 11 | logger = logging.getLogger(__name__) 12 | 13 | 14 | def get_cites_from_lg(): 15 | """ 16 | 爬取城市数据 17 | 18 | 返回的 dict 组成: 19 | id: 20 | type: int 21 | meaning: 城市 id 22 | eg: 1 23 | name: 24 | type: str 25 | meaning: 城市名 26 | eg: 北京 27 | 28 | :return: 城市数据集合 29 | :rtype: List[tornado.util.ObjectDict] 30 | """ 31 | logger.info(u'begin crawl cities info......') 32 | 33 | response_html = etree.HTML(requests.get(constants.ALL_CITY_URL).text) 34 | cities_html_list = response_html.xpath("//ul[@class='city_list']/li/a") 35 | 36 | cities_dicts = [] 37 | for city_html in cities_html_list: 38 | city_name = city_html.xpath('./text()')[0] 39 | city_id = re.findall(pattern=r'/(\d+)-\d+-\d+', string=city_html.xpath('./@href')[0])[0] 40 | cities_dicts.append(ObjectDict(id=city_id, name=city_name)) 41 | 42 | logger.info(u'crawl cities info finished! cites quantity is {cities_count}'.format( 43 | cities_count=len(cities_dicts))) 44 | return cities_dicts 45 | -------------------------------------------------------------------------------- /webspider/crawlers/lagou_companies.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | import re 3 | import json 4 | import logging 5 | 6 | from lxml import etree 7 | from tornado.util import ObjectDict 8 | 9 | from webspider import utils 10 | from webspider import constants 11 | 12 | logger = logging.getLogger(__name__) 13 | 14 | 15 | def get_companies_pagination_from_lg(city_id=0, finance_stage_id=0, industry_id=0, page_no=1): 16 | """ 17 | 爬取公司分页数据 18 | 19 | :param city_id: 城市 id 20 | :param finance_stage_id: 融资阶段 id 21 | :param industry_id: 行业 id 22 | :param page_no: 页码 23 | :return: 公司分页数据 24 | :rtype: utils.pagination.Pagination 25 | """ 26 | url = constants.COMPANIES_URL.format(city_id=city_id, 27 | finance_stage_id=finance_stage_id, 28 | industry_id=industry_id) 29 | 30 | params = {'pn': page_no, 'sortField': constants.SORTED_BY_JOBS_COUNT} 31 | response_json = utils.http_tools.requests_get(url=url, params=params).json() 32 | pagination = utils.pagination.Pagination(per_page=int(response_json['pageSize']), 33 | total=int(response_json['totalCount'])) 34 | 35 | return pagination 36 | 37 | 38 | def get_companies_from_lg(city_id=0, finance_stage_id=0, industry_id=0, page_no=1): 39 | """ 40 | 爬取公司数据 41 | 42 | 返回的 dict 组成: 43 | lg_company_id: 44 | type: int 45 | meaning: 接口使用的公司 id 46 | eg: 1 47 | fullname: 48 | type: str 49 | meaning: 公司全称 50 | eg: 智者四海北京科技有限公司 51 | city_name: 52 | type: str 53 | meaning: 城市名 54 | eg: 北京 55 | shortname: 56 | type: str 57 | meaning: 公司简称 58 | eg: 知乎 59 | fullname: 60 | type: str 61 | meaning: 公司全称 62 | eg: 智者四海北京科技有限公司 63 | finance_stage: 64 | type: str 65 | meaning: 融资阶段 66 | eg: D轮 67 | features: 68 | type: str 69 | meaning: 公司slogan, 一句话简介 70 | eg: 发现更大的世界 71 | process_rate: 72 | type: int 73 | meaning: 简历处理率 74 | eg: 94 75 | industries: 76 | type: str 77 | meaning: 所处行业 78 | eg: '互联网,社交' or '互联网' 79 | advantage: 80 | type: List[str] 81 | meaning: 公司优势 82 | eg: ['双休', '五险一金', ......] 
83 | address: 84 | type: str 85 | meaning: 公司地址 86 | eg: 北京市海淀区学院路768创意园 87 | size: 88 | type: str 89 | meaning: 公司规模 90 | eg: 2000人以上 91 | introduce: 92 | type: List[str] 93 | meaning: 公司介绍 94 | eg: ['我们的愿景:', 'blablabla', '我们处于一个知识 balala...'] 95 | 96 | :param city_id: 城市 id 97 | :param finance_stage_id: 融资阶段 id 98 | :param industry_id: 行业 id 99 | :param page_no: 页码 100 | :return: 公司数据集合 101 | :rtype: List[tornado.util.ObjectDict] 102 | """ 103 | url = constants.COMPANIES_URL.format(city_id=city_id, 104 | finance_stage_id=finance_stage_id, 105 | industry_id=industry_id) 106 | params = {'pn': page_no, 'sortField': constants.SORTED_BY_JOBS_COUNT} 107 | companies = utils.http_tools.requests_get(url=url, params=params).json()['result'] 108 | 109 | companies_dicts = [] 110 | for company in companies: 111 | lg_company_id = int(company.get('companyId')) 112 | 113 | company_detail = get_company_detail_from_lg(lg_company_id=lg_company_id) 114 | companies_dicts.append(ObjectDict( 115 | lg_company_id=lg_company_id, 116 | city_name=company.get('city'), 117 | shortname=company.get('companyShortName'), 118 | fullname=company.get('companyFullName'), 119 | finance_stage=company.get('financeStage'), 120 | features=company.get('companyFeatures'), 121 | process_rate=company.get('processRate'), 122 | industries=company.get('industryField'), 123 | # company detail 124 | advantage=company_detail.get('advantage'), 125 | address=company_detail.get('address'), 126 | size=company_detail.get('size'), 127 | introduce=company_detail.get('introduce') 128 | )) 129 | return companies_dicts 130 | 131 | 132 | def get_company_detail_from_lg(lg_company_id): 133 | """ 134 | 爬取公司详情页的数据 135 | 136 | 返回的 dict 组成: 137 | advantage: 138 | type: List[str] 139 | meaning: 公司优势 140 | eg: ['双休', '五险一金', ......] 
141 | address: 142 | type: str 143 | meaning: 公司地址 144 | eg: 北京市海淀区学院路768创意园 145 | size: 146 | type: str 147 | meaning: 公司规模 148 | eg: 2000人以上 149 | introduce: 150 | type: List[str] 151 | meaning: 公司介绍 152 | eg: ['我们的愿景:', 'blablabla', '我们处于一个知识 balala...'] 153 | 154 | :param lg_company_id: 接口使用的公司 id 155 | :return: 公司详情页数据 156 | :rtype: tornado.util.ObjectDict 157 | """ 158 | response = utils.http_tools.requests_get( 159 | url=constants.COMPANY_DETAIL_URL.format(lg_company_id=lg_company_id)) 160 | company_detail_html = etree.HTML(response.text) 161 | 162 | advantage = company_detail_html.xpath('//div[@id="tags_container"]//li/text()') 163 | sizes = company_detail_html.xpath('//div[@id="basic_container"]//li[3]/span/text()') 164 | address = company_detail_html.xpath('//p[@class="mlist_li_desc"]/text()') 165 | introduces = company_detail_html.xpath('//span[@class="company_content"]//text()') 166 | 167 | if not sizes: 168 | logger.error( 169 | 'can not get size by lg_company_id = {}, html code is \n{}'.format(lg_company_id, response.text)) 170 | 171 | return ObjectDict( 172 | advantage=advantage, 173 | address=address[0] if address else '', 174 | size=sizes[0] if sizes else '', 175 | introduce=introduces, 176 | ) 177 | 178 | 179 | def clean_lg_company_data(company_dict): 180 | """ 181 | 清洗爬取到的公司信息 182 | 183 | :param company_dict: tornado.util.ObjectDict 184 | """ 185 | if 'size' in company_dict: 186 | company_dict.size = company_dict.size.strip() 187 | if 'finance_stage' in company_dict: 188 | company_dict.finance_stage = company_dict.finance_stage.strip() 189 | if 'features' in company_dict: 190 | company_dict.features = utils.text.to_plaintext(company_dict.features) 191 | if 'address' in company_dict: 192 | company_dict.address = utils.text.to_plaintext(company_dict.address) 193 | if 'introduce' in company_dict: 194 | company_dict.introduce = ''.join(company_dict.introduce) if company_dict.introduce else '' 195 | company_dict.introduce = company_dict.introduce[:constants.COMPANY_INTRODUCE_MAX_LEN] 196 | if 'advantage' in company_dict: 197 | company_dict.advantage = list(map(utils.text.to_plaintext, company_dict.advantage)) 198 | company_dict.advantage = json.dumps(company_dict.advantage)[ 199 | :constants.COMPANY_ADVANTAGE_MAX_LEN] 200 | if 'industries' in company_dict: 201 | company_dict.industries = set(re.split(r",|,|、|\s", company_dict.industries)) 202 | -------------------------------------------------------------------------------- /webspider/crawlers/lagou_jobs.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | import logging 3 | 4 | from lxml import etree 5 | from tornado.util import ObjectDict 6 | 7 | from webspider import utils 8 | from webspider import constants 9 | 10 | logger = logging.getLogger(__name__) 11 | 12 | 13 | def get_jobs_pagination_from_lg(lg_company_id, job_type, page_no=1, is_school_job=False): 14 | """ 15 | 爬取职位分页数据 16 | 17 | :param lg_company_id: 接口使用的公司 id 18 | :param job_type: 职位类型 19 | :param page_no: 页码 20 | :param is_school_job: 是否爬取校招职位 21 | :return: 22 | """ 23 | params = { 24 | 'companyId': lg_company_id, 25 | 'positionFirstType': job_type, 26 | 'schoolJob': is_school_job, 27 | 'pageNo': page_no, 28 | 'pageSize': 10, 29 | } 30 | response_json = utils.http_tools.requests_get( 31 | url=constants.COMPANY_JOBS_URL, params=params).json() 32 | pagination = utils.pagination.Pagination(per_page=int(response_json['content']['data']['page']['pageSize']), 33 | 
total=int(response_json['content']['data']['page']['totalCount'])) 34 | 35 | return pagination 36 | 37 | 38 | def get_jobs_from_lg(lg_company_id, job_type, page_no=1, is_school_job=False): 39 | """ 40 | 爬取职位数据 41 | 42 | 返回的 dict 组成: 43 | lg_job_id: 44 | type: int 45 | meaning: 接口使用的职位 id 46 | eg: 1 47 | city_name: 48 | type: str 49 | meaning: 城市名 50 | eg: 北京 51 | title: 52 | type: str 53 | meaning: 职位标题 54 | eg: 招聘后端工程师 55 | salary: 56 | type: str 57 | meaning: 薪酬范围 58 | eg: '10k~20k' 59 | education: 60 | type: str 61 | meaning: 教育背景要求 62 | eg: 本科或以上 63 | nature: 64 | type: str 65 | meaning: 职位性质 66 | eg: 全职 67 | work_year: 68 | type: str 69 | meaning: 工作年限要求 70 | eg: 1~3年 71 | advantage: 72 | type: str 73 | meaning: 职位优势 74 | eg: 大平台,五险一金 75 | department: 76 | type: str 77 | meaning: 招聘部门 78 | eg: 商业部 79 | keywords: 80 | type: List[str] 81 | meaning: 职位关键词 82 | eg: ['后端', 'Web', 'Python'] 83 | description: 84 | type: List[str] 85 | meaning: 职位介绍 86 | eg: ['职位要求:', 'blablabla', '.......'] 87 | 88 | :param lg_company_id: 接口使用的公司 id 89 | :param job_type: 职位类型 90 | :param page_no: 页码 91 | :param is_school_job: 是否爬取校招职位 92 | :param skip_exist: 是否跳过数据库已经存在的职位数据 93 | :return: 职位数据集合 94 | :rtype: List[tornado.util.ObjectDict] 95 | """ 96 | params = { 97 | 'companyId': lg_company_id, 98 | 'positionFirstType': job_type, 99 | 'schoolJob': is_school_job, 100 | 'pageNo': page_no, 101 | 'pageSize': 10, 102 | } 103 | response_json = utils.http_tools.requests_get( 104 | url=constants.COMPANY_JOBS_URL, params=params).json() 105 | jobs = response_json['content']['data']['page']['result'] 106 | 107 | jobs_dicts = [] 108 | for job in jobs: 109 | lg_job_id = job['positionId'] 110 | job_detail = get_job_detail_from_lg(lg_job_id=lg_job_id) 111 | jobs_dicts.append(ObjectDict( 112 | lg_job_id=lg_job_id, 113 | city_name=job.get('city'), 114 | title=job.get('positionName'), 115 | salary=job.get('salary'), 116 | education=job.get('education'), 117 | nature=job.get('jobNature'), 118 | work_year=job.get('workYear'), 119 | advantage=job.get('positionAdvantage', ''), 120 | # job detail 121 | department=job_detail.get('department'), 122 | keywords=job_detail.get('keywords'), 123 | description=job_detail.get('description'), 124 | )) 125 | return jobs_dicts 126 | 127 | 128 | def get_job_detail_from_lg(lg_job_id): 129 | """ 130 | 爬取职位详情页的数据 131 | 132 | 返回的 dict 组成: 133 | department: 134 | type: str 135 | meaning: 招聘部门 136 | eg: 商业部 137 | keywords: 138 | type: List[str] 139 | meaning: 职位关键词 140 | eg: ['后端', 'Web', 'Python'] 141 | description: 142 | type: List[str] 143 | meaning: 职位介绍 144 | eg: ['职位要求:', 'blablabla', '.......'] 145 | 146 | :param lg_job_id: 接口使用的职位 id 147 | :return: 职位详情页数据 148 | :rtype: tornado.util.ObjectDict 149 | """ 150 | response = utils.http_tools.requests_get( 151 | url=constants.JOB_DETAIL_URL.format(lg_job_id=lg_job_id)) 152 | job_detail_html = etree.HTML(response.text) 153 | 154 | department = job_detail_html.xpath('//div[@class="job-name"]/div[@class="company"]/text()') 155 | description = job_detail_html.xpath('//dd[@class="job_bt"]/div//text()') 156 | keywords = job_detail_html.xpath('//dd[@class="job_request"]//li[@class="labels"]/text()') 157 | 158 | if not department: 159 | logger.error('can not get department by lg_job_id = {}, html is \n {}'.format( 160 | lg_job_id, response.text)) 161 | 162 | return ObjectDict( 163 | department=department[0] if department else '', 164 | description=description, 165 | keywords=keywords, 166 | ) 167 | 168 | 169 | def clean_lg_job_data(job_dict): 170 | """ 
171 | Clean a crawled job dict in place 172 | 173 | :param job_dict: tornado.util.ObjectDict 174 | """ 175 | if 'keywords' in job_dict: 176 | job_dict.keywords = set(map(lambda keyword: keyword.strip().lower(), job_dict.keywords)) 177 | if 'description' in job_dict: 178 | job_dict.description = ''.join(job_dict.description) if job_dict.description else '' 179 | job_dict.description = job_dict.description[:constants.JOB_DESCRIPTION_MAX_LEN] 180 | if 'advantage' in job_dict: 181 | job_dict.advantage = job_dict.advantage[:constants.JOB_ADVANTAGE_MAX_LEN] 182 |
-------------------------------------------------------------------------------- /webspider/crawlers/lagou_jobs_count.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | from webspider.constants import JOB_JSON_URL 3 | from webspider.utils.http_tools import requests_post, generate_http_request_headers 4 | 5 | 6 | def get_jobs_count_from_lg(city_name, keyword_name): 7 | """ 8 | Crawl the number of jobs 9 | 10 | :param city_name: city name 11 | :param keyword_name: keyword name 12 | :return: number of jobs for the keyword in the given city, e.g. python jobs in Beijing 13 | :rtype: int 14 | """ 15 | query_string = {'needAddtionalResult': False} 16 | if city_name != '全国': 17 | query_string['city'] = city_name 18 | form_data = { 19 | 'first': False, 20 | 'pn': 1, 21 | 'kd': keyword_name 22 | } 23 | headers = generate_http_request_headers( 24 | referer='https://www.lg.com/jobs/list_java?labelWords=&fromSearch=true') 25 | response_json = requests_post(url=JOB_JSON_URL, params=query_string, 26 | data=form_data, headers=headers).json() 27 | return int(response_json['content']['positionResult']['totalCount']) 28 |
-------------------------------------------------------------------------------- /webspider/exceptions.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | __all__ = ['BaseException', 'ResourceNotFoundWebException', 'DowngradeException'] 4 | 5 | 6 | class BaseException(Exception): 7 | ERROR_CODE = None 8 | STATUS_CODE = 200 9 | 10 | def __init__(self, message, data=None, debug_message=None): 11 | if self.ERROR_CODE is None: 12 | raise NotImplementedError() 13 | self._message = message 14 | self._data = dict(data) if data else None 15 | self._debug_message = debug_message 16 | 17 | @property 18 | def code(self): 19 | return self.ERROR_CODE 20 | 21 | @property 22 | def message(self): 23 | return self._message 24 | 25 | @property 26 | def data(self): 27 | return self._data 28 | 29 | @property 30 | def debug_message(self): 31 | return self._debug_message 32 | 33 | def __str__(self): 34 | return "Exception: code={code}, message={message}, data={data}, debug_message={debug_message}".format( 35 | code=self.code, message=self.message, data=self.data, debug_message=self.debug_message) 36 | 37 | def __repr__(self): 38 | return self.__str__() 39 | 40 | 41 | class ResourceNotFoundWebException(BaseException): 42 | """ 43 | Corresponding to HTTP code 404 44 | """ 45 | ERROR_CODE = 4041 46 | STATUS_CODE = 404 47 | 48 | def __init__(self, message=u'资源不存在', data=None, debug_message=None): 49 | super(ResourceNotFoundWebException, self).__init__(message, data, debug_message) 50 | 51 | 52 | class DowngradeException(BaseException): 53 | ERROR_CODE = 101 54 | 55 | def __init__(self, message=u'降级异常', data=None, debug_message=None): 56 | super(DowngradeException, self).__init__(message, data, debug_message) 57 |
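The tornado handlers that turn these exceptions into HTTP responses are not part of this excerpt. Here is a sketch of the serialization step, written only to match the error JSON asserted in tests/test_web/test_keyword_statistic.py (an assumption, not the repository's handler code):

def exception_to_error_dict(exc):
    """exc: an instance of one of the BaseException subclasses defined above."""
    return {
        'error': {
            'message': exc.message,
            'code': exc.code,  # e.g. 4041 for ResourceNotFoundWebException
            'name': exc.__class__.__name__,
            'data': exc.data or '',
            'debug_message': exc.debug_message or '',
        }
    }

A handler would presumably send this dict alongside set_status(exc.STATUS_CODE), which is why ResourceNotFoundWebException carries STATUS_CODE = 404 in addition to its ERROR_CODE.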
-------------------------------------------------------------------------------- /webspider/models/__init__.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | from webspider.models.city import CityModel 3 | from webspider.models.job import JobModel 4 | from webspider.models.jobs_count import JobsCountModel 5 | from webspider.models.company import CompanyModel 6 | from webspider.models.company_industry import CompanyIndustryModel 7 | from webspider.models.industry import IndustryModel 8 | from webspider.models.job_keyword import JobKeywordModel 9 | from webspider.models.keyword import KeywordModel 10 | from webspider.models.keyword_statistic import KeywordStatisticModel 11 | 12 | __all__ = ['CityModel', 'JobModel', 'JobsCountModel', 'CompanyModel', 'CompanyIndustryModel', 'IndustryModel', 13 | 'JobKeywordModel', 'KeywordModel', 'KeywordStatisticModel'] 14 |
-------------------------------------------------------------------------------- /webspider/models/base.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | import logging 4 | 5 | from sqlalchemy import MetaData, inspect, func, text 6 | from sqlalchemy.ext.declarative import declarative_base 7 | from tornado.util import ObjectDict 8 | 9 | from webspider.utils import sql 10 | from webspider.utils.classproperty import classproperty 11 | 12 | __all__ = ['BaseModel'] 13 | 14 | logger = logging.getLogger(__name__) 15 | 16 | _Base = declarative_base() 17 | 18 | 19 | class BaseModel(_Base): 20 | __abstract__ = True 21 | __table_args__ = { 22 | 'mysql_engine': 'InnoDB', 23 | 'mysql_charset': 'utf8mb4', 24 | 'extend_existing': True, 25 | } 26 | 27 | metadata = MetaData(bind=sql.db_engine, reflect=True) 28 | 29 | @classproperty 30 | def session(cls): 31 | return sql.get_session() 32 | 33 | @classproperty 34 | def pk_name(cls): 35 | """Name of the primary-key column""" 36 | return inspect(cls).primary_key[0].name 37 | 38 | @classproperty 39 | def pk(cls): 40 | """The table's primary-key column""" 41 | return getattr(cls, cls.pk_name) 42 | 43 | def dict(self): 44 | """sqlalchemy object -> dict""" 45 | columns = self.__table__.columns.keys() 46 | return ObjectDict((column, getattr(self, column)) for column in columns) 47 | 48 | @classmethod 49 | def count(cls, filter=None, filter_by=None): 50 | """ 51 | Count the matching records in the table 52 | :param filter: apply the given filtering criterion to a copy of this Query, 53 | using SQL expressions. 54 | :param filter_by: apply the given filtering criterion to a copy of this Query, 55 | using keyword expressions as a dict. 56 | :return: 57 | """ 58 | query = cls.session.query(func.count(cls.pk)) 59 | 60 | if filter is not None: 61 | query = query.filter(filter) 62 | if filter_by is not None: 63 | query = query.filter_by(**filter_by) 64 | 65 | return query.scalar() 66 | 67 | @classmethod 68 | def add(cls, **values): 69 | """Insert a record and return its primary key""" 70 | obj = cls(**values) 71 | cls.session.add(obj) 72 | cls.session.flush() 73 | return getattr(obj, obj.pk_name) 74 | 75 | @classmethod 76 | def get_by_pk(cls, pk): 77 | """Fetch a record by its primary-key value""" 78 | query = cls.session.query(cls).filter(cls.pk == pk) 79 | return query.scalar() 80 | 81 | @classmethod 82 | def get_one(cls, filter=None, filter_by=None): 83 | """ 84 | Fetch a single record 85 | :param filter: apply the given filtering criterion to a copy of this Query, 86 | using SQL expressions. 87 | :param filter_by: apply the given filtering criterion to a copy of this Query, 88 | using keyword expressions as a dict.
89 | :return: 90 | """ 91 | query = cls.session.query(cls) 92 | 93 | if filter is not None: 94 | query = query.filter(filter) 95 | if filter_by is not None: 96 | query = query.filter_by(**filter_by) 97 | 98 | return query.first() 99 | 100 | @classmethod 101 | def list(cls, columns=None, filter=None, filter_by=None, order_by=None, group_by=None, offset=None, limit=None): 102 | """ 103 | Fetch records in bulk 104 | :param columns: the columns you want to query, SQL expression, column, or mapped entity expected 105 | :param filter: apply the given filtering criterion to a copy of this Query, 106 | using SQL expressions. 107 | :param filter_by: apply the given filtering criterion to a copy of this Query, 108 | using keyword expressions as a dict. 109 | :param order_by: apply one or more ORDER BY criterion to the query and return 110 | the newly resulting ``Query`` 111 | :param group_by: apply one or more GROUP BY criterion to the query and return 112 | the newly resulting :class:`.Query` 113 | :param offset: Apply an ``OFFSET`` to the query and return the newly resulting 114 | ``Query``. 115 | :param limit: Apply a ``LIMIT`` to the query and return the newly resulting 116 | ``Query``. 117 | :return: 118 | """ 119 | query = cls.session.query(cls) 120 | if columns: 121 | query = cls.session.query(columns) 122 | if filter is not None: 123 | query = query.filter(filter) 124 | if filter_by is not None: 125 | query = query.filter_by(**filter_by) 126 | if group_by is not None: 127 | query = query.group_by(group_by) 128 | if order_by is not None: 129 | query = query.order_by(order_by) 130 | if offset is not None: 131 | query = query.offset(offset) 132 | if limit is not None: 133 | query = query.limit(limit) 134 | 135 | result = query.all() 136 | 137 | return result 138 | 139 | @classmethod 140 | def is_exist(cls, filter=None, filter_by=None): 141 | """ 142 | Check whether a matching record exists 143 | :param filter: apply the given filtering criterion to a copy of this Query, 144 | using SQL expressions. 145 | :param filter_by: apply the given filtering criterion to a copy of this Query, 146 | using keyword expressions as a dict. 147 | :return: boolean 148 | """ 149 | 150 | return cls.count(filter=filter, filter_by=filter_by) != 0 151 | 152 | @classmethod 153 | def update(cls, filter=None, filter_by=None, values=None): 154 | """Update matching records 155 | :param filter: apply the given filtering criterion to a copy of this Query, 156 | using SQL expressions. 157 | :param filter_by: apply the given filtering criterion to a copy of this Query, 158 | using keyword expressions as a dict.
159 | :param values: values to update 160 | :return: type: int, affected rows 161 | """ 162 | query = cls.session.query(cls) 163 | 164 | if filter is not None: 165 | query = query.filter(filter) 166 | 167 | if filter_by is not None: 168 | query = query.filter_by(**filter_by) 169 | 170 | affect_rows = query.update(values) 171 | return affect_rows 172 | 173 | @classmethod 174 | def update_by_pk(cls, pk, values): 175 | """Update a record by its primary key 176 | 177 | :param pk: primary-key value 178 | :param values: dict of values to update, in key=value form 179 | :return: number of affected rows 180 | """ 181 | return cls.update(filter=(cls.pk == pk), values=values) 182 | 183 | @classmethod 184 | def execute_sql_string(cls, sql_string, parameters_dict=None): 185 | """ 186 | Execute a raw SQL string 187 | eg: 188 | sql_string = 'select * from temp where id = :numbers' and parameters_dict = {'numbers': 1} 189 | >> select * from temp where id = 1 190 | :param sql_string: the sql string you want to execute 191 | :param parameters_dict: parameters 192 | :return: if query returns_rows return rows(List(tuple)) else return affect_rows(int) 193 | """ 194 | query = cls.session.execute(text(sql_string), parameters_dict) 195 | if query.returns_rows: 196 | return query.fetchall() 197 | else: 198 | return query.rowcount 199 | 200 | @classmethod 201 | def batch_add(cls, instances): 202 | """Insert records in bulk""" 203 | if not all([isinstance(instance, cls) for instance in instances]): 204 | raise ValueError('all instances must be {table_name} model instance'.format(table_name=cls.__tablename__)) 205 | cls.session.bulk_save_objects(instances)
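Usage sketch for the BaseModel helpers above, using CityModel (defined next); the values are illustrative:

from webspider.models.city import CityModel

city_id = CityModel.add(name='北京')  # flushes and returns the new primary key
city = CityModel.get_by_pk(city_id)
exists = CityModel.is_exist(filter_by={'name': '北京'})
CityModel.update_by_pk(pk=city_id, values={'name': '北京市'})
first_page = CityModel.list(order_by=CityModel.id.asc(), limit=10, offset=0)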
-------------------------------------------------------------------------------- /webspider/models/city.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from datetime import datetime 3 | 4 | from sqlalchemy import Column 5 | from sqlalchemy.dialects.mysql import INTEGER, VARCHAR, TIMESTAMP 6 | 7 | from webspider.models.base import BaseModel 8 | 9 | 10 | class CityModel(BaseModel): 11 | __tablename__ = 'city' 12 | 13 | id = Column(INTEGER, primary_key=True, nullable=False, autoincrement=True) 14 | name = Column(VARCHAR(64), nullable=False, doc=u'城市名') 15 | created_at = Column(TIMESTAMP, nullable=False, default=datetime.now, doc=u'创建时间') 16 | updated_at = Column(TIMESTAMP, nullable=False, default=datetime.now, onupdate=datetime.now, doc=u'最后更新时间') 17 |
-------------------------------------------------------------------------------- /webspider/models/company.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from datetime import datetime 3 | 4 | from sqlalchemy import Column 5 | from sqlalchemy.dialects.mysql import INTEGER, VARCHAR, TIMESTAMP, TINYINT 6 | 7 | from webspider import constants 8 | from webspider.models.base import BaseModel 9 | 10 | 11 | class CompanyModel(BaseModel): 12 | __tablename__ = 'company' 13 | 14 | id = Column(INTEGER, nullable=False, primary_key=True, autoincrement=True) 15 | lg_company_id = Column(INTEGER, nullable=False, doc=u'接口使用的公司 id') 16 | city_id = Column(INTEGER, nullable=False, doc=u'所在城市 id') 17 | shortname = Column(VARCHAR(64), nullable=False, doc=u'公司名称') 18 | fullname = Column(VARCHAR(128), nullable=False, doc=u'公司全称') 19 | finance_stage = Column(TINYINT, nullable=False, doc=u'融资阶段') 20 | size = Column(TINYINT, nullable=False, doc=u'公司规模') 21 | address = Column(VARCHAR(128), nullable=False, doc=u'公司地址') 22 | features = Column(VARCHAR(128), nullable=False, doc=u'公司特点') 23 | process_rate = Column(TINYINT, nullable=False, doc=u'简历处理率') 24 | introduce = Column(VARCHAR(constants.COMPANY_INTRODUCE_MAX_LEN), nullable=False, doc=u'公司简介') 25 | advantage = Column(VARCHAR(constants.COMPANY_ADVANTAGE_MAX_LEN), nullable=False, doc=u'公司优势') 26 | created_at = Column(TIMESTAMP, nullable=False, default=datetime.now, doc=u'创建时间') 27 | updated_at = Column(TIMESTAMP, nullable=False, default=datetime.now, 28 | onupdate=datetime.now, doc=u'最后更新时间') 29 |
-------------------------------------------------------------------------------- /webspider/models/company_industry.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from datetime import datetime 3 | 4 | from sqlalchemy import Column 5 | from sqlalchemy.dialects.mysql import INTEGER, TIMESTAMP 6 | 7 | from webspider.models.base import BaseModel 8 | 9 | 10 | class CompanyIndustryModel(BaseModel): 11 | __tablename__ = 'company_industry' 12 | 13 | id = Column(INTEGER, nullable=False, primary_key=True, autoincrement=True) 14 | company_id = Column(INTEGER, nullable=False, doc=u'公司 id') 15 | industry_id = Column(INTEGER, nullable=False, doc=u'行业 id') 16 | created_at = Column(TIMESTAMP, nullable=False, default=datetime.now, doc=u'创建时间') 17 | updated_at = Column(TIMESTAMP, nullable=False, default=datetime.now, onupdate=datetime.now, doc=u'最后更新时间') 18 |
-------------------------------------------------------------------------------- /webspider/models/industry.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from datetime import datetime 3 | 4 | from sqlalchemy import Column 5 | from sqlalchemy.dialects.mysql import INTEGER, VARCHAR, TIMESTAMP 6 | 7 | from webspider.models.base import BaseModel 8 | 9 | 10 | class IndustryModel(BaseModel): 11 | __tablename__ = 'industry' 12 | 13 | id = Column(INTEGER, nullable=False, primary_key=True, autoincrement=True) 14 | name = Column(VARCHAR(64), nullable=False, doc=u'行业名称') 15 | created_at = Column(TIMESTAMP, nullable=False, default=datetime.now, doc=u'创建时间') 16 | updated_at = Column(TIMESTAMP, nullable=False, default=datetime.now, onupdate=datetime.now, doc=u'最后更新时间') 17 |
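A sketch of resolving the company-to-industry many-to-many through the CompanyIndustryModel link table above, built only from the BaseModel helpers (illustrative, not repository code):

from webspider.models import CompanyIndustryModel, IndustryModel


def get_industry_names_for_company(company_id):
    links = CompanyIndustryModel.list(filter_by={'company_id': company_id})
    industry_ids = [link.industry_id for link in links]
    if not industry_ids:
        return []
    industries = IndustryModel.list(filter=IndustryModel.id.in_(industry_ids))
    return [industry.name for industry in industries]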
-------------------------------------------------------------------------------- /webspider/models/job.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from datetime import datetime 3 | 4 | from sqlalchemy import Column 5 | from sqlalchemy.dialects.mysql import INTEGER, VARCHAR, TINYINT, TIMESTAMP 6 | 7 | from webspider import constants 8 | from webspider.models.base import BaseModel 9 | 10 | 11 | class JobModel(BaseModel): 12 | __tablename__ = 'job' 13 | 14 | id = Column(INTEGER, nullable=False, primary_key=True, autoincrement=True) 15 | lg_job_id = Column(INTEGER, nullable=False, doc=u'接口使用的 job id') 16 | city_id = Column(INTEGER, nullable=False, doc=u'城市 id') 17 | company_id = Column(INTEGER, nullable=False, doc=u'公司 id') 18 | title = Column(VARCHAR(64), nullable=False, default='', doc=u'职位标题') 19 | work_year = Column(TINYINT, nullable=False, doc=u'工作年限要求') 20 | department = Column(VARCHAR(64), nullable=False, doc=u'招聘部门') 21 | salary = Column(VARCHAR(32), nullable=False, doc=u'薪水') 22 | education = Column(TINYINT, nullable=False, doc=u'教育背景要求') 23 | nature = Column(TINYINT, nullable=False, doc=u'工作性质') 24 | description = Column(VARCHAR(constants.JOB_DESCRIPTION_MAX_LEN), nullable=False, doc=u'额外描述') 25 | advantage = Column(VARCHAR(constants.JOB_ADVANTAGE_MAX_LEN), nullable=False, doc=u'职位优势') 26 | created_at = Column(TIMESTAMP, nullable=False, default=datetime.now, doc=u'职位创建时间') 27 | updated_at = Column(TIMESTAMP, nullable=False, default=datetime.now, 28 | onupdate=datetime.now, doc=u'最后更新时间') 29 |
-------------------------------------------------------------------------------- /webspider/models/job_keyword.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from datetime import datetime 3 | 4 | from sqlalchemy import Column 5 | from sqlalchemy.dialects.mysql import INTEGER, TIMESTAMP 6 | 7 | from webspider.models.base import BaseModel 8 | 9 | 10 | class JobKeywordModel(BaseModel): 11 | __tablename__ = 'job_keyword' 12 | 13 | id = Column(INTEGER, nullable=False, primary_key=True, autoincrement=True) 14 | job_id = Column(INTEGER, nullable=False, doc=u'职位 id') 15 | keyword_id = Column(INTEGER, nullable=False, doc=u'关键词 id') 16 | created_at = Column(TIMESTAMP, nullable=False, default=datetime.now, doc=u'创建时间') 17 | updated_at = Column(TIMESTAMP, nullable=False, default=datetime.now, onupdate=datetime.now, doc=u'最后更新时间') 18 |
-------------------------------------------------------------------------------- /webspider/models/jobs_count.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from datetime import datetime 3 | 4 | from sqlalchemy import Column 5 | from sqlalchemy.dialects.mysql import INTEGER, TIMESTAMP 6 | 7 | from webspider.models.base import BaseModel 8 | 9 | 10 | class JobsCountModel(BaseModel): 11 | __tablename__ = 'jobs_count' 12 | 13 | id = Column(INTEGER, nullable=False, primary_key=True, autoincrement=True) 14 | date = Column(INTEGER, nullable=False, doc=u'日期') 15 | keyword_id = Column(INTEGER, nullable=False, doc=u'关键词 id') 16 | all_city = Column(INTEGER, nullable=False, default=0, doc=u'全国岗位数量') 17 | beijing = Column(INTEGER, nullable=False, default=0, doc=u'北京岗位数量') 18 | guangzhou = Column(INTEGER, nullable=False, default=0, doc=u'广州岗位数量') 19 | shenzhen = Column(INTEGER, nullable=False, default=0, doc=u'深圳岗位数量') 20 | shanghai = Column(INTEGER, nullable=False, default=0, doc=u'上海岗位数量') 21 | hangzhou = Column(INTEGER, nullable=False, default=0, doc=u'杭州岗位数量') 22 | chengdu = Column(INTEGER, nullable=False, default=0, doc=u'成都岗位数量') 23 | created_at = Column(TIMESTAMP, nullable=False, default=datetime.now, doc=u'创建时间') 24 | updated_at = Column(TIMESTAMP, nullable=False, default=datetime.now, onupdate=datetime.now, doc=u'最后更新时间') 25 |
-------------------------------------------------------------------------------- /webspider/models/keyword.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from datetime import datetime 3 | 4 | from sqlalchemy import Column 5 | from sqlalchemy.dialects.mysql import INTEGER, VARCHAR, TIMESTAMP 6 | 7 | from webspider.models.base import BaseModel 8 | 9 | 10 | class KeywordModel(BaseModel): 11 | __tablename__ = 'keyword' 12 | 13 | id = Column(INTEGER, nullable=False, primary_key=True, autoincrement=True) 14 | name = Column(VARCHAR(64), nullable=False, doc=u'关键词名称') 15 | created_at = Column(TIMESTAMP, nullable=False, default=datetime.now, doc=u'创建时间') 16 | updated_at = Column(TIMESTAMP, nullable=False, default=datetime.now, onupdate=datetime.now, doc=u'最后更新时间') 17 |
--------------------------------------------------------------------------------
/webspider/models/keyword_statistic.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from datetime import datetime 3 | 4 | from sqlalchemy import Column 5 | from sqlalchemy.dialects.mysql import INTEGER, VARCHAR, TIMESTAMP 6 | 7 | from webspider.models.base import BaseModel 8 | from webspider.models.jobs_count import JobsCountModel 9 | 10 | 11 | class KeywordStatisticModel(BaseModel): 12 | __tablename__ = 'keyword_statistic' 13 | 14 | id = Column(INTEGER, nullable=False, primary_key=True, autoincrement=True) 15 | keyword_id = Column(INTEGER, nullable=False, doc=u'关键词 id') 16 | educations = Column(VARCHAR(2048), nullable=False, doc=u'教育背景要求统计') 17 | city_jobs_count = Column(VARCHAR(2048), nullable=False, doc=u'城市职位数量统计') 18 | salary = Column(VARCHAR(2048), nullable=False, doc=u'薪水分布统计') 19 | financing_stage = Column(VARCHAR(2048), nullable=False, doc=u'招聘公司的融资统计') 20 | work_years = Column(VARCHAR(2048), nullable=False, doc=u'工作年限要求统计') 21 | created_at = Column(TIMESTAMP, nullable=False, default=datetime.now, doc=u'创建时间') 22 | updated_at = Column(TIMESTAMP, nullable=False, default=datetime.now, onupdate=datetime.now, doc=u'最后更新时间') 23 | 24 | @property 25 | def per_day_jobs_count(self): 26 | return JobsCountModel.list(filter_by={'keyword_id': self.keyword_id}, order_by=JobsCountModel.date.asc()) 27 |
-------------------------------------------------------------------------------- /webspider/quickly_cmd.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # flake8: noqa 3 | import os 4 | import logging 5 | 6 | from tornado.options import options, define 7 | 8 | from webspider import constants 9 | 10 | logger = logging.getLogger(__name__) 11 | 12 | 13 | def run_web_app_by_gunicorn(): 14 | define(name='port', default=8000, type=int, help='run on the given port') 15 | logger.info( 16 | '\n================ spider web server(require gunicorn and gevent) has started ================ ') 17 | logger.info('\n server start at port -> {}, debug mode = {} '.format(options.port, 18 | constants.DEBUG)) 19 | os.system( 20 | "env/bin/gunicorn 'webspider.web_app:make_wsgi_app()' -b 0.0.0.0:{port} -w 1 -k gevent".format( 21 | port=options.port 22 | ) 23 | ) 24 | 25 | 26 | def run_celery_default_worker(): 27 | os.system( 28 | u'env/bin/celery worker -A webspider.tasks.celery_app -Q default -n default_worker --loglevel=debug') 29 | 30 | 31 | def run_celery_lg_data_worker(): 32 | os.system( 33 | u'env/bin/celery worker -A webspider.tasks.celery_app -Q lg_data -n lg_data_worker --loglevel=debug') 34 | 35 | 36 | def run_celery_lg_jobs_data_worker(): 37 | os.system( 38 | u'env/bin/celery worker -A webspider.tasks.celery_app -Q lg_jobs_data -n lg_jobs_data_worker --loglevel=debug') 39 | 40 | 41 | def run_celery_lg_jobs_count_worker(): 42 | os.system( 43 | u'env/bin/celery worker -A webspider.tasks.celery_app -Q lg_jobs_count -n lg_jobs_count_worker --loglevel=debug ') 44 | 45 | 46 | def run_celery_beat(): 47 | os.system(u'env/bin/celery -A webspider.tasks.celery_app beat --loglevel=debug') 48 | 49 | 50 | def run_celery_flower(): 51 | os.system(u'env/bin/celery flower --broker=redis://localhost:6379/0 --broker_api=redis://localhost:6379/0') 52 |
-------------------------------------------------------------------------------- /webspider/setting.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | import os 3 | 4 | BASE_DIR =
os.path.dirname(os.path.abspath(__file__)) 5 | 6 | # smtp 7 | SMTP_HOST = os.environ.get('SMTP_HOST') 8 | SMTP_PORT = os.environ.get('SMTP_PORT') 9 | 10 | # email 11 | MAIL_USER_NAME = os.environ.get('MAIL_USER_NAME') 12 | MAIL_USER_PASSWORD = os.environ.get('MAIL_USER_PASSWORD') 13 | FROM_EMAIL_ADDRESS = os.environ.get('FROM_EMAIL_ADDRESS') 14 | TO_EMAIL_ADDRESS = os.environ.get('TO_EMAIL_ADDRESS') 15 | 16 | # MYSQL 17 | MYSQL_USERNAME = os.environ.get('MYSQL_USERNAME', 'root') 18 | MYSQL_PASSWORD = os.environ.get('MYSQL_PASSWORD', '') 19 | DB_HOST = os.environ.get('DB_HOST', 'localhost') 20 | DB_PORT = os.environ.get('DB_PORT', '3306') 21 | DB_NAME = os.environ.get('DB_NAME', 'spider') 22 | DB_CONNECT_STRING_FORMAT = 'mysql+mysqldb://{username}:{password}@{db_host}:{db_port}/{db_name}?charset=utf8mb4' 23 | 24 | # REDIS 25 | REDIS_HOST = os.environ.get('REDIS_HOST', 'localhost') 26 | REDIS_PORT = os.environ.get('REDIS_PORT', '6379') 27 | 28 | # MySQL configuration 29 | MYSQL_CONF = { 30 | 'connect_string': DB_CONNECT_STRING_FORMAT.format( 31 | username=MYSQL_USERNAME, 32 | password=MYSQL_PASSWORD, 33 | db_host=DB_HOST, 34 | db_port=DB_PORT, 35 | db_name=DB_NAME 36 | ), 37 | 'host': DB_HOST, 38 | 'port': DB_PORT, 39 | 'username': MYSQL_USERNAME, 40 | 'password': MYSQL_PASSWORD, 41 | } 42 | 43 | SMTP_CONF = { 44 | 'host': SMTP_HOST, 45 | 'port': SMTP_PORT, 46 | 'from_email': FROM_EMAIL_ADDRESS, 47 | 'to_email': TO_EMAIL_ADDRESS, 48 | } 49 | 50 | MAIL_CONF = { 51 | 'username': MAIL_USER_NAME, 52 | 'password': MAIL_USER_PASSWORD, 53 | } 54 | 55 | REDIS_CONF = { 56 | 'host': REDIS_HOST, 57 | 'port': REDIS_PORT 58 | } 59 |
-------------------------------------------------------------------------------- /webspider/tasks/__init__.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 |
-------------------------------------------------------------------------------- /webspider/tasks/actor/__init__.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 |
-------------------------------------------------------------------------------- /webspider/tasks/actor/keyword_statistic.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | import logging 3 | import json 4 | 5 | from webspider.tasks.celery_app import celery_app 6 | from webspider.controllers import keyword_statistic_ctl 7 | from webspider.models import (KeywordModel, JobModel, JobKeywordModel, KeywordStatisticModel) 8 | 9 | logger = logging.getLogger(__name__) 10 | 11 | 12 | @celery_app.task() 13 | def update_keywords_statistic_task(): 14 | """Fan out statistic-update tasks, one per keyword""" 15 | keywords = KeywordModel.list() 16 | for keyword in keywords: 17 | update_single_keyword_statistic_task.delay(keyword.id) 18 | 19 | 20 | @celery_app.task() 21 | def update_single_keyword_statistic_task(keyword_id): 22 | """Update the statistics of a single keyword""" 23 | 24 | job_keywords = JobKeywordModel.list(filter_by={'keyword_id': keyword_id}) 25 | jobs = JobModel.list(filter=(JobModel.id.in_([job_keyword.job_id for job_keyword in job_keywords]))) 26 | if not jobs: 27 | return 28 | 29 | educations_statistic = keyword_statistic_ctl.get_educations_statistic(jobs=jobs) 30 | finance_stage_statistic = keyword_statistic_ctl.get_finance_stage_statistic(jobs=jobs) 31 | city_jobs_count_statistic = keyword_statistic_ctl.get_city_jobs_count_statistic(jobs=jobs) 32 | salary_statistic = keyword_statistic_ctl.get_salary_statistic(jobs=jobs) 33 | work_years_statistic =
--------------------------------------------------------------------------------
/webspider/tasks/__init__.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | 
--------------------------------------------------------------------------------
/webspider/tasks/actor/__init__.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | 
--------------------------------------------------------------------------------
/webspider/tasks/actor/keyword_statistic.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | import logging
3 | import json
4 | 
5 | from webspider.tasks.celery_app import celery_app
6 | from webspider.controllers import keyword_statistic_ctl
7 | from webspider.models import (KeywordModel, JobModel, JobKeywordModel, KeywordStatisticModel)
8 | 
9 | logger = logging.getLogger(__name__)
10 | 
11 | 
12 | @celery_app.task()
13 | def update_keywords_statistic_task():
14 |     """Update the statistics of every keyword."""
15 |     keywords = KeywordModel.list()
16 |     for keyword in keywords:
17 |         update_single_keyword_statistic_task.delay(keyword.id)
18 | 
19 | 
20 | @celery_app.task()
21 | def update_single_keyword_statistic_task(keyword_id):
22 |     """Update the statistics of a single keyword."""
23 | 
24 |     job_keywords = JobKeywordModel.list(filter_by={'keyword_id': keyword_id})
25 |     jobs = JobModel.list(filter=(JobModel.id.in_([job_keyword.job_id for job_keyword in job_keywords])))
26 |     if not jobs:
27 |         return
28 | 
29 |     educations_statistic = keyword_statistic_ctl.get_educations_statistic(jobs=jobs)
30 |     finance_stage_statistic = keyword_statistic_ctl.get_finance_stage_statistic(jobs=jobs)
31 |     city_jobs_count_statistic = keyword_statistic_ctl.get_city_jobs_count_statistic(jobs=jobs)
32 |     salary_statistic = keyword_statistic_ctl.get_salary_statistic(jobs=jobs)
33 |     work_years_statistic = keyword_statistic_ctl.get_work_years_statistic(jobs=jobs)
34 | 
35 |     statistic_values = dict(
36 |         keyword_id=keyword_id,
37 |         educations=json.dumps(educations_statistic),
38 |         city_jobs_count=json.dumps(city_jobs_count_statistic),
39 |         salary=json.dumps(salary_statistic),
40 |         financing_stage=json.dumps(finance_stage_statistic),
41 |         work_years=json.dumps(work_years_statistic)
42 |     )
43 | 
44 |     if KeywordStatisticModel.is_exist(filter_by={'keyword_id': keyword_id}):
45 |         KeywordStatisticModel.update(filter_by={'keyword_id': keyword_id}, values=statistic_values)
46 |     else:
47 |         KeywordStatisticModel.add(**statistic_values)
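48 | 
49 | # Illustrative flow (not part of the original file): one beat tick fans out,
50 | # i.e. update_keywords_statistic_task.delay() enqueues one
51 | # update_single_keyword_statistic_task per keyword row on the default queue.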
--------------------------------------------------------------------------------
/webspider/tasks/actor/lagou_data.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | import logging
3 | 
4 | from webspider import utils
5 | from webspider import crawlers
6 | from webspider import constants
7 | from webspider.utils.cache import redis_instance
8 | from webspider.tasks.celery_app import celery_app
9 | from webspider.controllers import industry_ctl, keyword_ctl, city_ctl
10 | from webspider.models import (CityModel, CompanyModel,
11 |                               CompanyIndustryModel, JobModel, JobKeywordModel)
12 | 
13 | logger = logging.getLogger(__name__)
14 | 
15 | 
16 | @celery_app.task()
17 | def crawl_lg_data_task():
18 |     """Entry task: crawl lagou data."""
19 | 
20 |     # clear the crawl records of the previous round
21 |     keys = redis_instance.keys('crawled_company_jobs*')
22 |     if keys:
23 |         redis_instance.delete(*keys)
24 | 
25 |     crawl_lg_city_data_task.delay()
26 |     # only these cities are crawled for now -- all: 0, Beijing: 2, Shanghai: 3,
27 |     # Hangzhou: 6, Shenzhen: 215, Guangzhou: 213, Chengdu: 252
28 |     lg_city_ids = [2, 3, 6, 215, 213, 252]
29 |     lg_finance_stage_ids = [1, 2, 3, 4, 5, 6, 7, 8]
30 |     lg_industry_ids = [24, 25, 33, 27, 29, 45, 31, 28,
31 |                        47, 34, 35, 43, 32, 41, 26, 48, 38, 49, 10594]
32 |     # crawl company data
33 |     for industry_id in lg_industry_ids:
34 |         for city_id in lg_city_ids:
35 |             for finance_stage_id in lg_finance_stage_ids:
36 |                 crawl_lg_company_data_task.delay(city_id=city_id, finance_stage_id=finance_stage_id,
37 |                                                  industry_id=industry_id)
38 | 
39 | 
40 | @celery_app.task()
41 | def crawl_lg_city_data_task():
42 |     """Crawl lagou city data."""
43 |     city_dicts = crawlers.get_cites_from_lg()
44 |     for city_dict in city_dicts:
45 |         if CityModel.is_exist(filter_by={'id': city_dict.id}):
46 |             CityModel.update_by_pk(pk=city_dict.id, values=city_dict)
47 |         else:
48 |             CityModel.add(**city_dict)
49 | 
50 | 
51 | @celery_app.task()
52 | def crawl_lg_company_data_task(city_id, finance_stage_id, industry_id):
53 |     """Crawl lagou company data."""
54 |     companies_pagination = crawlers.get_companies_pagination_from_lg(city_id=city_id,
55 |                                                                      finance_stage_id=finance_stage_id,
56 |                                                                      industry_id=industry_id)
57 |     for page_no in companies_pagination.iter_pages:
58 |         company_dicts = crawlers.get_companies_from_lg(city_id=city_id,
59 |                                                        finance_stage_id=finance_stage_id,
60 |                                                        industry_id=industry_id,
61 |                                                        page_no=page_no)
62 |         if not company_dicts:
63 |             break
64 |         for company_dict in company_dicts:
65 |             crawlers.clean_lg_company_data(company_dict)
66 |             utils.convert.convert_dict_field_to_constants(company_dict)
67 | 
68 |             industries = company_dict.pop('industries')
69 |             city_name = company_dict.pop('city_name')
70 | 
71 |             city_ctl.insert_city_if_not_exist(city_name)
72 |             company_dict['city_id'] = city_ctl.get_city_id_by_name(city_name)
73 | 
74 |             company = CompanyModel.get_one(
75 |                 filter_by={'lg_company_id': company_dict.lg_company_id})
76 |             if company:
77 |                 CompanyModel.update_by_pk(pk=company.id, values=company_dict)
78 |                 company_id = company.id
79 |             else:
80 |                 company_id = CompanyModel.add(**company_dict)
81 | 
82 |             for industry in industries:
83 |                 industry_ctl.insert_industry_if_not_exist(name=industry)
84 |                 industry_id = industry_ctl.get_industry_id_by_name(name=industry)
85 |                 CompanyIndustryModel.add(industry_id=industry_id, company_id=company_id)
86 | 
87 |             crawl_lg_job_data_task.delay(company_dict.lg_company_id)
88 | 
89 | 
90 | @celery_app.task()
91 | def crawl_lg_job_data_task(lg_company_id):
92 |     """Crawl lagou job data."""
93 |     # skip companies whose jobs were already crawled in this round
94 |     if not redis_instance.setnx(constants.CRAWLED_COMPANY_JOBS_REDIS_KEY.format(lg_company_id=lg_company_id), 1):
95 |         return
96 |     jobs_pagination = crawlers.get_jobs_pagination_from_lg(lg_company_id=lg_company_id,
97 |                                                            job_type=constants.LGJobType.technology)
98 |     for page_no in jobs_pagination.iter_pages:
99 |         job_dicts = crawlers.get_jobs_from_lg(lg_company_id=lg_company_id,
100 |                                               job_type=constants.LGJobType.technology,
101 |                                               page_no=page_no)
102 |         if not job_dicts:
103 |             break
104 |         for job_dict in job_dicts:
105 |             crawlers.clean_lg_job_data(job_dict)
106 |             utils.convert.convert_dict_field_to_constants(job_dict)
107 | 
108 |             keywords = job_dict.pop('keywords')
109 |             city_name = job_dict.pop('city_name')
110 | 
111 |             city_ctl.insert_city_if_not_exist(city_name)
112 |             job_dict['city_id'] = city_ctl.get_city_id_by_name(city_name)
113 |             company = CompanyModel.get_one(filter_by={'lg_company_id': lg_company_id})
114 |             job_dict['company_id'] = company.id
115 | 
116 |             job = JobModel.get_one(filter_by={'lg_job_id': job_dict.lg_job_id})
117 |             if job:
118 |                 JobModel.update_by_pk(pk=job.id, values=job_dict)
119 |                 job_id = job.id
120 |             else:
121 |                 job_id = JobModel.add(**job_dict)
122 | 
123 |             for keyword in keywords:
124 |                 keyword_ctl.insert_keyword_if_not_exist(name=keyword)
125 |                 keyword_id = keyword_ctl.get_keyword_id_by_name(name=keyword)
126 |                 JobKeywordModel.add(keyword_id=keyword_id, job_id=job_id)
127 | 
--------------------------------------------------------------------------------
/webspider/tasks/actor/lagou_jobs_count.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | import logging
3 | from datetime import datetime
4 | 
5 | from webspider import crawlers
6 | from webspider.tasks.celery_app import celery_app
7 | from webspider.controllers import keyword_ctl, job_keyword_ctl
8 | from webspider.models import JobsCountModel
9 | 
10 | logger = logging.getLogger(__name__)
11 | 
12 | 
13 | @celery_app.task()
14 | def crawl_lg_jobs_count_task():
15 |     keyword_ids = job_keyword_ctl.get_most_frequently_keyword_ids(limit=1000)
16 |     for keyword_id in keyword_ids:
17 |         crawl_lg_keyword_jobs_count_task.delay(keyword_id)
18 | 
19 | 
20 | @celery_app.task()
21 | def crawl_lg_keyword_jobs_count_task(keyword_id):
22 |     # the Chinese city names are the values the crawled site expects
23 |     cities_name_map = {
24 |         'all_city': u'全国',
25 |         'beijing': u'北京',
26 |         'shanghai': u'上海',
27 |         'guangzhou': u'广州',
28 |         'shenzhen': u'深圳',
29 |         'hangzhou': u'杭州',
30 |         'chengdu': u'成都',
31 |     }
32 |     keyword_name = keyword_ctl.get_keyword_name_by_id(keyword_id)
33 |     jobs_count_dict = dict(keyword_id=keyword_id)
34 |     for city_name_key, city_name in cities_name_map.items():
35 |         jobs_count_dict[city_name_key] = crawlers.get_jobs_count_from_lg(city_name=city_name,
36 |                                                                          keyword_name=keyword_name)
37 |     jobs_count_dict['date'] = int(datetime.today().strftime('%Y%m%d'))
38 | 
39 |     JobsCountModel.add(**jobs_count_dict)
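40 | 
41 | # Illustrative (not part of the original file; counts are made up): a stored
42 | # row might look like
43 | #   {'keyword_id': 1, 'all_city': 8421, 'beijing': 3120, 'shanghai': 1874,
44 | #    'guangzhou': 512, 'shenzhen': 903, 'hangzhou': 655, 'chengdu': 201,
45 | #    'date': 20161016}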
--------------------------------------------------------------------------------
/webspider/tasks/celery_app.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | 
3 | from celery import Celery
4 | 
5 | celery_app = Celery('tasks')
6 | celery_app.config_from_object('webspider.tasks.celery_config')
7 | 
--------------------------------------------------------------------------------
/webspider/tasks/celery_config.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | from kombu import Queue
3 | from kombu import Exchange
4 | 
5 | from celery.schedules import crontab
6 | 
7 | BROKER_URL = 'redis://127.0.0.1:6379'  # message broker
8 | 
9 | CELERY_RESULT_BACKEND = 'redis://127.0.0.1:6379/0'  # result backend
10 | 
11 | CELERY_CREATE_MISSING_QUEUES = True  # create a queue on the broker as soon as a task references it
12 | 
13 | CELERY_TIMEZONE = 'Asia/Shanghai'  # timezone (defaults to UTC)
14 | 
15 | CELERYD_CONCURRENCY = 2  # number of concurrent worker processes
16 | 
17 | CELERY_ENABLE_UTC = False
18 | 
19 | CELERYD_FORCE_EXECV = True  # fork fresh worker processes to avoid deadlocks
20 | 
21 | CELERY_TASK_SERIALIZER = 'json'  # task (de)serialization format
22 | 
23 | CELERY_RESULT_SERIALIZER = 'json'  # results are rarely read, so favour readable JSON
24 | 
25 | CELERY_IGNORE_RESULT = True  # ignore task results
26 | 
27 | # CELERY_TASK_RESULT_EXPIRES = 60 * 60 * 1  # task result expiry
28 | 
29 | CELERY_IMPORTS = (  # task modules to import at worker start-up
30 |     'webspider.tasks.actor.lagou_data',
31 |     'webspider.tasks.actor.lagou_jobs_count',
32 |     'webspider.tasks.actor.keyword_statistic',
33 | )
34 | 
35 | CELERY_TASK_PUBLISH_RETRY = False  # no publish retries
36 | 
37 | CELERYBEAT_SCHEDULE = {
38 |     'crawl_lg_jobs_count_task': {
39 |         'task': 'webspider.tasks.actor.lagou_jobs_count.crawl_lg_jobs_count_task',
40 |         'schedule': crontab(hour='01', minute='01', day_of_week='2, 5'),
41 |     },
42 |     'crawl_lg_data_task': {
43 |         'task': 'webspider.tasks.actor.lagou_data.crawl_lg_data_task',
44 |         'schedule': crontab(hour='01', minute='01', day_of_month='1'),
45 |     },
46 |     'update_keyword_statistic': {
47 |         'task': 'webspider.tasks.actor.keyword_statistic.update_keywords_statistic_task',
48 |         'schedule': crontab(hour='01', minute='01', day_of_week='1, 4'),
49 |     },
50 | }
51 | 
52 | default_exchange = Exchange('default', type='direct')
53 | lg_exchange = Exchange('lg', type='direct')
54 | 
55 | CELERY_QUEUES = (
56 |     Queue(name='default', exchange=default_exchange, routing_key='default'),
57 |     Queue(name='lg_data', exchange=lg_exchange, routing_key='for_lg_data'),
58 |     Queue(name='lg_jobs_data', exchange=lg_exchange, routing_key='for_lg_jobs_data'),
59 |     Queue(name='lg_jobs_count', exchange=lg_exchange, routing_key='for_lg_jobs_count'),
60 | )
61 | 
62 | CELERY_ROUTES = {
63 |     'webspider.tasks.actor.lagou_data.crawl_lg_job_data_task': {'exchange': 'lg',
64 |                                                                 'routing_key': 'for_lg_jobs_data'},
65 |     'webspider.tasks.actor.lagou_jobs_count.*': {'exchange': 'lg', 'routing_key': 'for_lg_jobs_count'},
66 |     'webspider.tasks.actor.lagou_data.*': {'exchange': 'lg', 'routing_key': 'for_lg_data'},
67 |     'webspider.tasks.actor.keyword_statistic.*': {'exchange': 'default', 'routing_key': 'default'}
68 | }
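69 | 
70 | # Illustrative routing (not part of the original file): with the config above,
71 | #   crawl_lg_job_data_task.delay(...)        -> exchange 'lg', queue 'lg_jobs_data'
72 | #   crawl_lg_jobs_count_task.delay()         -> exchange 'lg', queue 'lg_jobs_count'
73 | #   update_keywords_statistic_task.delay()   -> exchange 'default', queue 'default'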
--------------------------------------------------------------------------------
/webspider/utils/__init__.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | from webspider.utils import cache
3 | from webspider.utils import classproperty
4 | from webspider.utils import common
5 | from webspider.utils import convert
6 | from webspider.utils import http_tools
7 | from webspider.utils import log
8 | from webspider.utils import pagination
9 | from webspider.utils import sql
10 | from webspider.utils import text
11 | from webspider.utils import time_tools
12 | 
13 | __all__ = ['cache', 'classproperty', 'common', 'convert', 'http_tools', 'log', 'pagination', 'sql', 'text',
14 |            'time_tools']
15 | 
--------------------------------------------------------------------------------
/webspider/utils/cache.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | import logging
3 | import pickle
4 | from functools import wraps
5 | 
6 | import redis
7 | 
8 | from webspider import setting
9 | 
10 | redis_pool = redis.ConnectionPool(host=setting.REDIS_CONF['host'],
11 |                                   port=setting.REDIS_CONF['port'])
12 | redis_instance = redis.Redis(connection_pool=redis_pool)
13 | 
14 | 
15 | def simple_cache(ex=None):
16 |     """Cache function results in redis; keyword arguments are not supported."""
17 | 
18 |     def decorator(func):
19 |         @wraps(func)
20 |         def wrapper(*args, **kwargs):
21 |             if kwargs:
22 |                 raise ValueError(
23 |                     "args key generator does not accept kwargs arguments")
24 |             redis_key = func.__name__ + '(' + ','.join(map(str, args)) + ')'
25 |             result = redis_instance.get(redis_key)
26 |             if result:
27 |                 logging.debug('cache: got result from redis, key - {}'.format(redis_key))
28 |                 result = pickle.loads(result)
29 |             else:
30 |                 logging.debug('cache: got result by calling func, key - {}'.format(redis_key))
31 |                 result = func(*args)
32 |                 redis_instance.set(name=redis_key, value=pickle.dumps(result), ex=ex)
33 |             return result
34 | 
35 |         return wrapper
36 | 
37 |     return decorator
38 | 
39 | 
40 | def cache_clear(func, *args):
41 |     """Invalidate cached results."""
42 |     redis_key = func.__name__
43 |     if args:
44 |         redis_key += ('(' + ','.join(map(str, args)) + ')')
45 |     logging.info('remove cache redis-key: {}'.format(redis_key))
46 |     keys = redis_instance.keys('*' + redis_key + '*')
47 |     if keys:
48 |         remove_count = redis_instance.delete(*keys)
49 |         logging.info('cache clear count {}'.format(remove_count))
50 |         return remove_count
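51 | 
52 | # Usage sketch (illustrative, not part of the original module): cache a pure
53 | # function for 60 seconds; only positional arguments are supported.
54 | #
55 | #   @simple_cache(ex=60)
56 | #   def add(a, b):
57 | #       return a + b
58 | #
59 | #   add(1, 2)              # runs the function, stores pickle(result) in redis
60 | #   add(1, 2)              # served from the redis key "add(1,2)"
61 | #   cache_clear(add, 1, 2) # invalidates that key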
--------------------------------------------------------------------------------
/webspider/utils/classproperty.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 | 
3 | 
4 | class ClassPropertyDescriptor(object):
5 |     """Descriptor implementing a read-only class-level property."""
6 |     def __init__(self, fget, fset=None):
7 |         self.fget = fget
8 |         self.fset = fset
9 | 
10 |     def __get__(self, obj, obj_type=None):
11 |         if obj_type is None:
12 |             obj_type = type(obj)
13 |         return self.fget.__get__(obj, obj_type)()
14 | 
15 |     def __set__(self, obj, value):
16 |         raise AttributeError("can't set attribute")
17 | 
18 | 
19 | def classproperty(func):
20 |     if not isinstance(func, (classmethod, staticmethod)):
21 |         func = classmethod(func)
22 | 
23 |     return ClassPropertyDescriptor(func)
24 | 
--------------------------------------------------------------------------------
/webspider/utils/common.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | import logging
3 | from collections import Counter
4 | 
5 | logger = logging.getLogger(__name__)
6 | 
7 | 
8 | def get_key_from_dict_by_value(value, dictionary):
9 |     keys = [_key for (_key, _value) in dictionary.items() if _value == value]
10 |     if not keys:
11 |         raise ValueError(u'can not get key from dict by value {}'.format(value))
12 |     if len(keys) > 1:
13 |         raise AttributeError(u'got multiple keys from dict by value {}'.format(value))
14 |     return keys[0]
15 | 
16 | 
17 | def get_field_statistics(values, constants_dict):
18 |     """
19 |     Count how often each constant occurs in a batch of field values.
20 |     eg:
21 |         >>> get_field_statistics([0, 0, 0, 1, 1], {'male': 0, 'female': 1})
22 |         Counter({'male': 3, 'female': 2})
23 | 
24 |     :param values: list[int], field values list
25 |     :param constants_dict: Dict
26 |     :return: collections.Counter
27 |     """
28 |     statistics_counter = Counter()
29 |     for value in values:
30 |         field_name = get_key_from_dict_by_value(value=value, dictionary=constants_dict)
31 |         statistics_counter[field_name] += 1
32 |     return statistics_counter
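33 | 
34 | # Quick check (illustrative, not part of the original file): the reverse lookup
35 | # is strict -- unknown and duplicate values raise, e.g.
36 | #   get_key_from_dict_by_value(1, {'a': 1, 'b': 1})  -> AttributeError
37 | #   get_key_from_dict_by_value(2, {'a': 1})          -> ValueError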
--------------------------------------------------------------------------------
/webspider/utils/convert.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | import logging
3 | 
4 | from webspider import constants
5 | 
6 | logger = logging.getLogger(__name__)
7 | 
8 | CONSTANTS_MAP = {
9 |     'finance_stage': constants.FINANCE_STAGE_DICT,
10 |     'nature': constants.JOB_NATURE_DICT,
11 |     'work_year': constants.WORK_YEARS_REQUEST_DICT,
12 |     'education': constants.EDUCATION_REQUEST_DICT,
13 |     'size': constants.COMPANY_SIZE_DICT,
14 | }
15 | 
16 | 
17 | def convert_dict_field_to_constants(to_converted_dict, constants_map=CONSTANTS_MAP):
18 |     """
19 |     Convert the fields of a dict to the corresponding constants, in place.
20 |     :param to_converted_dict: the dict to convert
21 |     :param constants_map: field name -> constants mapping
22 |     """
23 |     for field_name, field_value in to_converted_dict.items():
24 |         if field_name in constants_map:
25 |             to_converted_dict[field_name] = convert_field_to_constants(field_name, field_value, constants_map)
26 | 
27 | 
28 | def convert_field_to_constants(field_name, field_value, constants_map=CONSTANTS_MAP):
29 |     """
30 |     Convert a field value to its constant; returns -1 when no conversion exists.
31 | 
32 |     eg:
33 |         convert_field_to_constants(field_name='size', field_value='2000人以上', constants_map={'size': {'2000人以上': 1}})
34 |         return: 1
35 |     :param field_name: field name
36 |     :param field_value: field value
37 |     :param constants_map: field name -> constants mapping
38 |     :rtype: int
39 |     """
40 |     if field_name not in constants_map:
41 |         raise ValueError(u'can not find the field in constants_map, field name is {}'.format(field_name))
42 | 
43 |     field_constant_map = constants_map[field_name]
44 | 
45 |     if field_value in field_constant_map:
46 |         return field_constant_map[field_value]
47 |     else:
48 |         logger.error('unconvertible {field_name}, value is {field_value}'.format(field_name=field_name,
49 |                                                                                  field_value=field_value))
50 |         return field_constant_map['unknown'] if 'unknown' in field_constant_map else -1
51 | 
--------------------------------------------------------------------------------
/webspider/utils/http_tools.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | import time
3 | import random
4 | 
5 | import requests
6 | from retrying import retry
7 | 
8 | from webspider import constants
9 | 
10 | 
11 | def generate_http_request_headers(referer=None):
12 |     """Build HTTP request headers with a random User-Agent."""
13 |     header = constants.HTTP_HEADER
14 |     header['User-Agent'] = random.choice(constants.USER_AGENT_LIST)
15 |     if referer:
16 |         header['Referer'] = referer
17 |     return header
18 | 
19 | 
20 | @retry(stop_max_attempt_number=constants.RETRY_TIMES, stop_max_delay=constants.STOP_MAX_DELAY,
21 |        wait_fixed=constants.WAIT_FIXED)
22 | def requests_get(url, params=None, headers=None, allow_redirects=False, timeout=constants.REQUEST_TIMEOUT,
23 |                  need_sleep=True, **kwargs):
24 |     if need_sleep:
25 |         time.sleep(random.randint(constants.MIN_SLEEP_SECS, constants.MAX_SLEEP_SECS))
26 |     if not headers:
27 |         headers = generate_http_request_headers()
28 |     return requests.get(url=url, params=params, headers=headers, allow_redirects=allow_redirects,
29 |                         timeout=timeout, **kwargs)
30 | 
31 | 
32 | @retry(stop_max_attempt_number=constants.RETRY_TIMES, stop_max_delay=constants.STOP_MAX_DELAY,
33 |        wait_fixed=constants.WAIT_FIXED)
34 | def requests_post(url, data=None, params=None, headers=None, allow_redirects=False, timeout=constants.REQUEST_TIMEOUT,
35 |                   need_sleep=True, **kwargs):
36 |     if need_sleep:
37 |         time.sleep(random.randint(constants.MIN_SLEEP_SECS, constants.MAX_SLEEP_SECS))
38 |     if not headers:
39 |         headers = generate_http_request_headers()
40 |     return requests.post(url=url, data=data, params=params, headers=headers, allow_redirects=allow_redirects,
41 |                          timeout=timeout, **kwargs)
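42 | 
43 | # Usage sketch (illustrative, not part of the original module; the URL is just
44 | # an example): a throttled, retried GET with a spoofed Referer --
45 | #   resp = requests_get('https://www.lagou.com/jobs/list_python',
46 | #                       headers=generate_http_request_headers(referer='https://www.lagou.com/'))
47 | #   resp.status_code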
--------------------------------------------------------------------------------
/webspider/utils/log.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 | import os
3 | import logging.config
4 | 
5 | from webspider import setting
6 | 
7 | LOG_FILE_PATH = os.path.join(setting.BASE_DIR, 'log', 'spider_log.txt')
8 | 
9 | LOGGING_CONFIG = {
10 |     'version': 1,
11 |     'disable_existing_loggers': True,
12 | 
13 |     'formatters': {
14 |         'default': {
15 |             'format': '%(asctime)s- %(module)s:%(lineno)d [%(levelname)1.1s] %(name)s: %(message)s',
16 |             'datefmt': '%Y/%m/%d %H:%M:%S'
17 |         },
18 |     },
19 | 
20 |     'handlers': {
21 |         'console': {
22 |             'level': 'DEBUG',
23 |             'formatter': 'default',
24 |             'class': 'logging.StreamHandler'
25 |         },
26 |         'smtp': {
27 |             'level': 'ERROR',
28 |             'class': 'logging.handlers.SMTPHandler',
29 |             'formatter': 'default',
30 |             'mailhost': (setting.SMTP_CONF['host'], setting.SMTP_CONF['port']),
31 |             'fromaddr': setting.SMTP_CONF['from_email'],
32 |             'toaddrs': [setting.SMTP_CONF['to_email'], ],
33 |             'subject': 'spider system error',
34 |             'credentials': (setting.MAIL_CONF['username'], setting.MAIL_CONF['password'])
35 |         },
36 |         'file': {
37 |             'level': 'ERROR',
38 |             'formatter': 'default',
39 |             'class': 'logging.handlers.RotatingFileHandler',
40 |             'filename': LOG_FILE_PATH,
41 |             'encoding': 'utf8'
42 |         },
43 |     },
44 | 
45 |     'loggers': {
46 |         '': {
47 |             'handlers': ['console', 'file'],
48 |             'level': 'DEBUG',
49 |             'propagate': False,
50 |         },
51 |         'webspider': {
52 |             'handlers': ['console', 'file'],
53 |             'level': 'DEBUG',
54 |             'propagate': False,
55 |         },
56 |         'tornado': {
57 |             'handlers': ['console', 'file'],
58 |             'level': 'DEBUG',
59 |             'propagate': False,
60 |         },
61 |         'tornado.access': {
62 |             'handlers': ['console', 'file'],
63 |             'level': 'INFO',
64 |             'propagate': False,
65 |         },
66 |         'tornado.application': {
67 |             'handlers': ['console', 'file'],
68 |             'level': 'INFO',
69 |             'propagate': False,
70 |         },
71 |         'tornado.general': {
72 |             'handlers': ['console', 'file'],
73 |             'propagate': False,
74 |             'level': 'INFO',
75 |         },
76 |         'sqlalchemy.engine': {
77 |             'handlers': ['console', 'file'],
78 |             'level': 'INFO',
79 |             'propagate': False,
80 |         },
81 |         'gunicorn': {
82 |             'handlers': ['console', 'file'],
83 |             'level': 'INFO',
84 |             'propagate': False,
85 |         },
86 |         'celery': {
87 |             'handlers': ['console', 'file'],
88 |             'level': 'DEBUG',
89 |             'propagate': False,
90 |         },
91 |     },
92 | }
93 | 
94 | 
95 | def config_logging():
96 |     """Apply LOGGING_CONFIG to the logging module."""
97 |     logging.config.dictConfig(LOGGING_CONFIG)
98 | 
--------------------------------------------------------------------------------
/webspider/utils/pagination.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | from math import ceil
3 | 
4 | 
5 | class Pagination(object):
6 |     """Simple pagination helper."""
7 |     def __init__(self, page=1, per_page=10, total=0):
8 |         self.page = page
9 |         self.per_page = per_page
10 |         self.total = total
11 | 
12 |     @property
13 |     def pages(self):
14 |         if self.per_page == 0:
15 |             pages = 0
16 |         else:
17 |             pages = int(ceil(self.total / float(self.per_page)))
18 |         return pages
19 | 
20 |     @property
21 |     def prev_num(self):
22 |         if not self.has_prev:
23 |             return None
24 |         return self.page - 1
25 | 
26 |     @property
27 |     def has_prev(self):
28 |         return self.page > 1
29 | 
30 |     @property
31 |     def has_next(self):
32 |         return self.page < self.pages
33 | 
34 |     @property
35 |     def next_num(self):
36 |         if not self.has_next:
37 |             return None
38 |         return self.page + 1
39 | 
40 |     @property
41 |     def iter_pages(self):
42 |         for num in range(1, self.pages + 1):
43 |             yield num
44 | 
--------------------------------------------------------------------------------
/webspider/utils/sql.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 | import logging
3 | 
4 | from sqlalchemy import create_engine
5 | from sqlalchemy.orm import sessionmaker, scoped_session
6 | 
7 | from webspider import setting
8 | from webspider import constants
9 | 
10 | __all__ = ['get_session', 'remove_sessions', 'db_engine']
11 | 
12 | logger = logging.getLogger(__name__)
13 | 
14 | db_engine = create_engine(
15 |     setting.MYSQL_CONF['connect_string'],
16 |     echo=constants.DEBUG, max_overflow=48,
17 |     pool_timeout=0, pool_recycle=3600,
18 |     logging_name='sql')
19 | 
20 | _session = scoped_session(sessionmaker(bind=db_engine, autocommit=True, autoflush=True))
21 | 
22 | 
23 | def get_session():
24 |     return _session
25 | 
26 | 
27 | def remove_sessions():
28 |     _session.remove()
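29 | 
30 | # Usage sketch (illustrative, not part of the original module):
31 | #   session = get_session()
32 | #   jobs = session.query(JobModel).limit(10).all()  # JobModel from webspider.models
33 | #   remove_sessions()  # the web layer calls this from its on_finish() hook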
--------------------------------------------------------------------------------
/webspider/utils/text.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | import re
3 | 
4 | 
5 | def to_plaintext(content, pattern=r'<br/>|\n', strip=True):
6 |     """
7 |     Strip text according to the given regex pattern.
8 |     :param content: the text to filter
9 |     :param pattern: regex matching the content to remove
10 |     :param strip: whether to strip leading/trailing whitespace
11 |     :return:
12 |     """
13 |     plaintext = re.sub(pattern=pattern, repl='', string=content)
14 |     if strip:
15 |         plaintext = plaintext.strip()
16 |     return plaintext
17 | 
--------------------------------------------------------------------------------
/webspider/utils/time_tools.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 | import time
3 | import datetime
4 | 
5 | 
6 | def datetime_to_timestamp(datetime_obj):
7 |     return int(time.mktime(datetime_obj.timetuple()))
8 | 
9 | 
10 | def timestamp_to_datetime(timestamp):
11 |     return datetime.datetime.fromtimestamp(timestamp)
12 | 
13 | 
14 | def timestamp_to_datetime_str(ts, time_format=None):
15 |     """
16 |     Convert a timestamp to a date string (1476547200 -> '2016-10-16').
17 |     :param ts: timestamp
18 |     :param time_format: date format string
19 |     :return: date string
20 |     """
21 |     if time_format is None or time_format == '':
22 |         time_format = '%Y-%m-%d'
23 |     ts = time.localtime(float(ts))
24 |     return time.strftime(time_format, ts)
25 | 
--------------------------------------------------------------------------------
/webspider/web/__init__.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | 
--------------------------------------------------------------------------------
/webspider/web/app.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | # flake8: noqa
4 | 
5 | import os
6 | import logging.config
7 | 
8 | import tornado
9 | import tornado.web
10 | import tornado.ioloop
11 | import tornado.httpserver
12 | from tornado.options import options, define, parse_command_line
13 | from tornado.wsgi import WSGIAdapter
14 | 
15 | from webspider import constants
16 | from webspider.web.urls import url_handlers
17 | from webspider.utils.log import config_logging
18 | 
19 | config_logging()
20 | logger = logging.getLogger(__name__)
21 | 
22 | 
23 | def make_wsgi_app():
24 |     web_app = make_web_app()
25 |     return WSGIAdapter(web_app)
26 | 
27 | 
28 | def make_web_app():
29 |     settings = {
30 |         'debug': constants.DEBUG,
31 |         'template_path': os.path.join(
32 |             os.path.dirname(__file__), "templates"
33 |         ),
34 |         'static_path': os.path.join(
35 |             os.path.dirname(__file__), 'static'
36 |         )
37 |     }
38 | 
39 |     app = tornado.web.Application(url_handlers, **settings)
40 |     return app
41 | 
42 | 
43 | def main():
44 |     define(name='port', default=8000, type=int, help='run on the given port')
45 |     parse_command_line()
46 |     logger.info('====== web server starting at http://0.0.0.0:{} ======'.format(options.port))
47 |     if constants.DEBUG:
48 |         logger.info('debug mode is enabled!!!')
49 | 
50 |     app = make_web_app()
51 |     http_server = tornado.httpserver.HTTPServer(app)
52 |     http_server.listen(options.port)
53 |     http_server.start()
54 | 
55 |     tornado.ioloop.IOLoop.instance().start()
56 | 
57 | 
58 | if __name__ == '__main__':
59 |     main()
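60 | 
61 | # Run sketch (illustrative, not part of the original file):
62 | #   python -m webspider.web.app --port=8888
63 | # or through gunicorn (what quickly_cmd.run_web_app_by_gunicorn shells out to):
64 | #   gunicorn 'webspider.web.app:make_wsgi_app()' -b 0.0.0.0:8000 -w 1 -k gevent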
--------------------------------------------------------------------------------
/webspider/web/formatter/__init__.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | from webspider.models import KeywordStatisticModel, JobsCountModel
3 | from webspider.web.formatter.jobs_count import JobsCountFormatter
4 | from webspider.web.formatter.keyword_statistic import KeywordStatisticFormatter
5 | 
6 | from webspider.web.formatter.base import Formatter
7 | 
8 | formatter_mappings = {
9 |     JobsCountModel: JobsCountFormatter,
10 |     KeywordStatisticModel: KeywordStatisticFormatter,
11 | }
12 | 
13 | Formatter.register_formatter(formatter_mappings)
14 | 
--------------------------------------------------------------------------------
/webspider/web/formatter/base.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | from tornado.util import ObjectDict
3 | 
4 | from webspider.exceptions import DowngradeException
5 | 
6 | 
7 | class Downgrade(object):
8 |     """Fallback value used when rendering a field fails."""
9 |     def __init__(self, value):
10 |         self.value = value
11 | 
12 | 
13 | class Field(object):
14 |     """A formatter field."""
15 |     def __init__(self, name, converter=None, downgrade=None):
16 |         self.name = name
17 |         self.converter = converter
18 |         if downgrade is not None and not isinstance(downgrade, Downgrade):
19 |             raise DowngradeException(u'downgrade must be a Downgrade instance')
20 |         self.downgrade = downgrade
21 | 
22 | 
23 | class Formatter(object):
24 |     """Renders data automatically according to the registered formatter mappings."""
25 |     _FORMATTER_MAPS = {}
26 |     FIELDS = {}
27 | 
28 |     @classmethod
29 |     def register_formatter(cls, mapping):
30 |         cls._FORMATTER_MAPS.update(mapping)
31 | 
32 |     @classmethod
33 |     def format(cls, data):
34 |         if isinstance(data, list):
35 |             return [cls.format(item) for item in data]
36 |         else:
37 |             formatter = cls.get_formatter(data)
38 |             if not formatter:
39 |                 raise ValueError(u'Can not find the formatter by model {}'.format(type(data)))
40 | 
41 |             format_result = ObjectDict()
42 |             for field in formatter.FIELDS:
43 |                 if not isinstance(field, Field):
44 |                     raise ValueError('formatter field must be a Field instance')
45 |                 try:
46 |                     value = getattr(data, field.name)
47 |                     # the value itself may be formattable again
48 |                     if isinstance(value, list) or cls.get_formatter(value):
49 |                         value = cls.format(value)
50 |                     if field.converter:
51 |                         value = field.converter(value)
52 |                 except Exception:
53 |                     # the Field declared a downgrade fallback
54 |                     if field.downgrade:
55 |                         value = field.downgrade.value
56 |                     else:
57 |                         raise
58 |                 format_result[field.name] = value
59 | 
60 |             return format_result
61 | 
62 |     @classmethod
63 |     def get_formatter(cls, data):
64 |         if data in cls._FORMATTER_MAPS:
65 |             return cls._FORMATTER_MAPS[data]
66 |         for model, formatter in cls._FORMATTER_MAPS.items():
67 |             if type(data) is model:
68 |                 return formatter
69 | 
--------------------------------------------------------------------------------
/webspider/web/formatter/jobs_count.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | from webspider import utils
3 | from webspider.web.formatter.base import Field, Formatter
4 | 
5 | 
6 | class JobsCountFormatter(Formatter):
7 |     FIELDS = [
8 |         Field('date'),
9 |         Field('all_city'),
10 |         Field('beijing'),
11 |         Field('guangzhou'),
12 |         Field('shenzhen'),
13 |         Field('shanghai'),
14 |         Field('hangzhou'),
15 |         Field('chengdu'),
16 |         Field('created_at', converter=utils.time_tools.datetime_to_timestamp),
17 |         Field('updated_at', converter=utils.time_tools.datetime_to_timestamp),
18 |     ]
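19 | 
20 | # Usage sketch (illustrative, not part of the original file; values made up):
21 | # once registered in formatter/__init__.py, a JobsCountModel row renders to an
22 | # ObjectDict --
23 | #   Formatter.format(jobs_count_row)
24 | #   -> {'date': 20161016, 'all_city': 8421, ..., 'created_at': 1476547200, ...}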
--------------------------------------------------------------------------------
/webspider/web/formatter/keyword_statistic.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import json
3 | 
4 | from webspider import utils
5 | from webspider.web.formatter.base import Field, Downgrade, Formatter
6 | 
7 | 
8 | class KeywordStatisticFormatter(Formatter):
9 |     FIELDS = [
10 |         Field('educations', converter=json.loads, downgrade=Downgrade({})),
11 |         Field('city_jobs_count', converter=json.loads, downgrade=Downgrade({})),
12 |         Field('salary', converter=json.loads, downgrade=Downgrade({})),
13 |         Field('financing_stage', converter=json.loads, downgrade=Downgrade({})),
14 |         Field('work_years', converter=json.loads, downgrade=Downgrade({})),
15 |         Field('per_day_jobs_count'),
16 |         Field('created_at', converter=utils.time_tools.datetime_to_timestamp),
17 |         Field('updated_at', converter=utils.time_tools.datetime_to_timestamp),
18 |     ]
19 | 
--------------------------------------------------------------------------------
/webspider/web/handlers/__init__.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | from webspider.web.handlers.keyword_statistics import KeywordStatisticsApiHandler, KeywordStatisticsPageHandler
3 | 
4 | __all__ = [
5 |     'KeywordStatisticsApiHandler',
6 |     'KeywordStatisticsPageHandler'
7 | ]
8 | 
--------------------------------------------------------------------------------
/webspider/web/handlers/base.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 | from tornado.escape import json_encode
3 | from tornado.web import RequestHandler
4 | 
5 | from webspider import constants
6 | from webspider.exceptions import BaseException, ResourceNotFoundWebException
7 | from webspider.web.formatter import Formatter
8 | from webspider.utils.sql import remove_sessions
9 | 
10 | 
11 | class BaseApiHandler(RequestHandler):
12 |     def write_error(self, status_code, **kwargs):
13 |         exception = kwargs['exc_info'][1]
14 | 
15 |         # TODO: drop this branch once the backend becomes a pure API
16 |         # in production, non-API page requests get a rendered error page
17 |         if not constants.DEBUG and isinstance(self, BasePageHandler):
18 |             self._handler_production_page_error(exception)
19 |             return
20 | 
21 |         if isinstance(exception, BaseException):
22 |             self.render_exception(exception)
23 |         else:
24 |             RequestHandler.write_error(self, status_code=status_code, **kwargs)
25 | 
26 |     def auto_render(self, data):
27 |         formatted_dict = Formatter.format(data)
28 |         self.render_json(formatted_dict)
29 | 
30 |     def _handler_production_page_error(self, exception):
31 |         """Render error pages for page requests in production."""
32 |         if isinstance(exception, ResourceNotFoundWebException):
33 |             self.render('404.html')
34 |         else:
35 |             self.render('500.html')
36 | 
37 |     def render_exception(self, exception):
38 |         self.set_status(
39 |             status_code=exception.STATUS_CODE,
40 |             reason=exception.message
41 |         )
42 |         error_dict = {
43 |             'error': {
44 |                 'code': exception.code,
45 |                 'name': exception.__class__.__name__,
46 |                 'message': exception.message,
47 |                 'data': exception.data if exception.data else '',
48 |                 'debug_message': exception.debug_message if exception.debug_message else ''
49 |             }
50 |         }
51 |         self.render_json(error_dict)
52 | 
53 |     def render_json(self, data):
54 |         self.set_header('Content-Type', 'application/json')
55 |         self.finish(json_encode(data))
56 | 
57 |     def on_finish(self):
58 |         remove_sessions()
59 | 
60 | 
61 | # TODO page to api
62 | class BasePageHandler(BaseApiHandler):
63 |     """Handler for pages that mix backend-rendered templates with API logic."""
64 |     pass
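65 | 
66 | # Illustrative error payload (not part of the original file; values made up):
67 | # render_exception serializes a web exception roughly as
68 | #   {"error": {"code": 404, "name": "ResourceNotFoundWebException",
69 | #              "message": "keyword not found", "data": "", "debug_message": ""}}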
--------------------------------------------------------------------------------
/webspider/web/handlers/keyword_statistics.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 | import json
3 | 
4 | from webspider.web.handlers.base import BasePageHandler, BaseApiHandler
5 | from webspider.exceptions import ResourceNotFoundWebException
6 | from webspider.models import KeywordModel, KeywordStatisticModel
7 | 
8 | 
9 | class KeywordStatisticsApiHandler(BaseApiHandler):
10 |     def get(self):
11 |         keyword_name = self.get_argument('keyword_name', '')
12 |         if not keyword_name:
13 |             raise ResourceNotFoundWebException(u'please provide a keyword')
14 | 
15 |         keyword = KeywordModel.get_one(filter_by={'name': keyword_name})
16 |         if not keyword:
17 |             raise ResourceNotFoundWebException(u'keyword not found')
18 | 
19 |         keyword_statistic = KeywordStatisticModel.get_one(filter_by={'keyword_id': keyword.id})
20 |         if not keyword_statistic:
21 |             raise ResourceNotFoundWebException(u'no statistics for this keyword yet')
22 | 
23 |         self.auto_render(keyword_statistic)
24 | 
25 | 
26 | class KeywordStatisticsPageHandler(BasePageHandler):
27 |     def get(self):
28 |         keyword_name = self.get_argument('keyword_name', '')
29 |         if not keyword_name:
30 |             raise ResourceNotFoundWebException(u'please provide a keyword')
31 | 
32 |         keyword = KeywordModel.get_one(filter_by={'name': keyword_name})
33 |         if not keyword:
34 |             raise ResourceNotFoundWebException(u'keyword not found')
35 | 
36 |         keyword_statistic = KeywordStatisticModel.get_one(filter_by={'keyword_id': keyword.id})
37 |         if not keyword_statistic:
38 |             raise ResourceNotFoundWebException(u'no statistics for this keyword yet')
39 | 
40 |         self.render(
41 |             "statistics.html",
42 |             keyword_name=keyword_name,
43 |             educations_statistic=json.loads(keyword_statistic.educations),
44 |             city_jobs_count_statistic=json.loads(keyword_statistic.city_jobs_count),
45 |             salary_statistic=json.loads(keyword_statistic.salary),
46 |             finance_stage_statistic=json.loads(keyword_statistic.financing_stage),
47 |             work_years_statistic=json.loads(keyword_statistic.work_years),
48 |             per_day_jobs_count_statistic=keyword_statistic.per_day_jobs_count
49 |         )
50 | 
--------------------------------------------------------------------------------
/webspider/web/static/__init__.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | 
--------------------------------------------------------------------------------
/webspider/web/static/bootstrap/css/bootstrap-theme.min.css:
--------------------------------------------------------------------------------
1 | /*!
2 | * Bootstrap v3.3.7 (http://getbootstrap.com)
3 | * Copyright 2011-2016 Twitter, Inc.
4 | * Licensed under MIT (https://github.com/twbs/bootstrap/blob/master/LICENSE) 5 | */.btn-danger,.btn-default,.btn-info,.btn-primary,.btn-success,.btn-warning{text-shadow:0 -1px 0 rgba(0,0,0,.2);-webkit-box-shadow:inset 0 1px 0 rgba(255,255,255,.15),0 1px 1px rgba(0,0,0,.075);box-shadow:inset 0 1px 0 rgba(255,255,255,.15),0 1px 1px rgba(0,0,0,.075)}.btn-danger.active,.btn-danger:active,.btn-default.active,.btn-default:active,.btn-info.active,.btn-info:active,.btn-primary.active,.btn-primary:active,.btn-success.active,.btn-success:active,.btn-warning.active,.btn-warning:active{-webkit-box-shadow:inset 0 3px 5px rgba(0,0,0,.125);box-shadow:inset 0 3px 5px rgba(0,0,0,.125)}.btn-danger.disabled,.btn-danger[disabled],.btn-default.disabled,.btn-default[disabled],.btn-info.disabled,.btn-info[disabled],.btn-primary.disabled,.btn-primary[disabled],.btn-success.disabled,.btn-success[disabled],.btn-warning.disabled,.btn-warning[disabled],fieldset[disabled] .btn-danger,fieldset[disabled] .btn-default,fieldset[disabled] .btn-info,fieldset[disabled] .btn-primary,fieldset[disabled] .btn-success,fieldset[disabled] .btn-warning{-webkit-box-shadow:none;box-shadow:none}.btn-danger .badge,.btn-default .badge,.btn-info .badge,.btn-primary .badge,.btn-success .badge,.btn-warning .badge{text-shadow:none}.btn.active,.btn:active{background-image:none}.btn-default{text-shadow:0 1px 0 #fff;background-image:-webkit-linear-gradient(top,#fff 0,#e0e0e0 100%);background-image:-o-linear-gradient(top,#fff 0,#e0e0e0 100%);background-image:-webkit-gradient(linear,left top,left bottom,from(#fff),to(#e0e0e0));background-image:linear-gradient(to bottom,#fff 0,#e0e0e0 100%);filter:progid:DXImageTransform.Microsoft.gradient(startColorstr='#ffffffff', endColorstr='#ffe0e0e0', GradientType=0);filter:progid:DXImageTransform.Microsoft.gradient(enabled=false);background-repeat:repeat-x;border-color:#dbdbdb;border-color:#ccc}.btn-default:focus,.btn-default:hover{background-color:#e0e0e0;background-position:0 -15px}.btn-default.active,.btn-default:active{background-color:#e0e0e0;border-color:#dbdbdb}.btn-default.disabled,.btn-default.disabled.active,.btn-default.disabled.focus,.btn-default.disabled:active,.btn-default.disabled:focus,.btn-default.disabled:hover,.btn-default[disabled],.btn-default[disabled].active,.btn-default[disabled].focus,.btn-default[disabled]:active,.btn-default[disabled]:focus,.btn-default[disabled]:hover,fieldset[disabled] .btn-default,fieldset[disabled] .btn-default.active,fieldset[disabled] .btn-default.focus,fieldset[disabled] .btn-default:active,fieldset[disabled] .btn-default:focus,fieldset[disabled] .btn-default:hover{background-color:#e0e0e0;background-image:none}.btn-primary{background-image:-webkit-linear-gradient(top,#337ab7 0,#265a88 100%);background-image:-o-linear-gradient(top,#337ab7 0,#265a88 100%);background-image:-webkit-gradient(linear,left top,left bottom,from(#337ab7),to(#265a88));background-image:linear-gradient(to bottom,#337ab7 0,#265a88 100%);filter:progid:DXImageTransform.Microsoft.gradient(startColorstr='#ff337ab7', endColorstr='#ff265a88', GradientType=0);filter:progid:DXImageTransform.Microsoft.gradient(enabled=false);background-repeat:repeat-x;border-color:#245580}.btn-primary:focus,.btn-primary:hover{background-color:#265a88;background-position:0 
-15px}.btn-primary.active,.btn-primary:active{background-color:#265a88;border-color:#245580}.btn-primary.disabled,.btn-primary.disabled.active,.btn-primary.disabled.focus,.btn-primary.disabled:active,.btn-primary.disabled:focus,.btn-primary.disabled:hover,.btn-primary[disabled],.btn-primary[disabled].active,.btn-primary[disabled].focus,.btn-primary[disabled]:active,.btn-primary[disabled]:focus,.btn-primary[disabled]:hover,fieldset[disabled] .btn-primary,fieldset[disabled] .btn-primary.active,fieldset[disabled] .btn-primary.focus,fieldset[disabled] .btn-primary:active,fieldset[disabled] .btn-primary:focus,fieldset[disabled] .btn-primary:hover{background-color:#265a88;background-image:none}.btn-success{background-image:-webkit-linear-gradient(top,#5cb85c 0,#419641 100%);background-image:-o-linear-gradient(top,#5cb85c 0,#419641 100%);background-image:-webkit-gradient(linear,left top,left bottom,from(#5cb85c),to(#419641));background-image:linear-gradient(to bottom,#5cb85c 0,#419641 100%);filter:progid:DXImageTransform.Microsoft.gradient(startColorstr='#ff5cb85c', endColorstr='#ff419641', GradientType=0);filter:progid:DXImageTransform.Microsoft.gradient(enabled=false);background-repeat:repeat-x;border-color:#3e8f3e}.btn-success:focus,.btn-success:hover{background-color:#419641;background-position:0 -15px}.btn-success.active,.btn-success:active{background-color:#419641;border-color:#3e8f3e}.btn-success.disabled,.btn-success.disabled.active,.btn-success.disabled.focus,.btn-success.disabled:active,.btn-success.disabled:focus,.btn-success.disabled:hover,.btn-success[disabled],.btn-success[disabled].active,.btn-success[disabled].focus,.btn-success[disabled]:active,.btn-success[disabled]:focus,.btn-success[disabled]:hover,fieldset[disabled] .btn-success,fieldset[disabled] .btn-success.active,fieldset[disabled] .btn-success.focus,fieldset[disabled] .btn-success:active,fieldset[disabled] .btn-success:focus,fieldset[disabled] .btn-success:hover{background-color:#419641;background-image:none}.btn-info{background-image:-webkit-linear-gradient(top,#5bc0de 0,#2aabd2 100%);background-image:-o-linear-gradient(top,#5bc0de 0,#2aabd2 100%);background-image:-webkit-gradient(linear,left top,left bottom,from(#5bc0de),to(#2aabd2));background-image:linear-gradient(to bottom,#5bc0de 0,#2aabd2 100%);filter:progid:DXImageTransform.Microsoft.gradient(startColorstr='#ff5bc0de', endColorstr='#ff2aabd2', GradientType=0);filter:progid:DXImageTransform.Microsoft.gradient(enabled=false);background-repeat:repeat-x;border-color:#28a4c9}.btn-info:focus,.btn-info:hover{background-color:#2aabd2;background-position:0 -15px}.btn-info.active,.btn-info:active{background-color:#2aabd2;border-color:#28a4c9}.btn-info.disabled,.btn-info.disabled.active,.btn-info.disabled.focus,.btn-info.disabled:active,.btn-info.disabled:focus,.btn-info.disabled:hover,.btn-info[disabled],.btn-info[disabled].active,.btn-info[disabled].focus,.btn-info[disabled]:active,.btn-info[disabled]:focus,.btn-info[disabled]:hover,fieldset[disabled] .btn-info,fieldset[disabled] .btn-info.active,fieldset[disabled] .btn-info.focus,fieldset[disabled] .btn-info:active,fieldset[disabled] .btn-info:focus,fieldset[disabled] .btn-info:hover{background-color:#2aabd2;background-image:none}.btn-warning{background-image:-webkit-linear-gradient(top,#f0ad4e 0,#eb9316 100%);background-image:-o-linear-gradient(top,#f0ad4e 0,#eb9316 100%);background-image:-webkit-gradient(linear,left top,left bottom,from(#f0ad4e),to(#eb9316));background-image:linear-gradient(to bottom,#f0ad4e 0,#eb9316 
100%);filter:progid:DXImageTransform.Microsoft.gradient(startColorstr='#fff0ad4e', endColorstr='#ffeb9316', GradientType=0);filter:progid:DXImageTransform.Microsoft.gradient(enabled=false);background-repeat:repeat-x;border-color:#e38d13}.btn-warning:focus,.btn-warning:hover{background-color:#eb9316;background-position:0 -15px}.btn-warning.active,.btn-warning:active{background-color:#eb9316;border-color:#e38d13}.btn-warning.disabled,.btn-warning.disabled.active,.btn-warning.disabled.focus,.btn-warning.disabled:active,.btn-warning.disabled:focus,.btn-warning.disabled:hover,.btn-warning[disabled],.btn-warning[disabled].active,.btn-warning[disabled].focus,.btn-warning[disabled]:active,.btn-warning[disabled]:focus,.btn-warning[disabled]:hover,fieldset[disabled] .btn-warning,fieldset[disabled] .btn-warning.active,fieldset[disabled] .btn-warning.focus,fieldset[disabled] .btn-warning:active,fieldset[disabled] .btn-warning:focus,fieldset[disabled] .btn-warning:hover{background-color:#eb9316;background-image:none}.btn-danger{background-image:-webkit-linear-gradient(top,#d9534f 0,#c12e2a 100%);background-image:-o-linear-gradient(top,#d9534f 0,#c12e2a 100%);background-image:-webkit-gradient(linear,left top,left bottom,from(#d9534f),to(#c12e2a));background-image:linear-gradient(to bottom,#d9534f 0,#c12e2a 100%);filter:progid:DXImageTransform.Microsoft.gradient(startColorstr='#ffd9534f', endColorstr='#ffc12e2a', GradientType=0);filter:progid:DXImageTransform.Microsoft.gradient(enabled=false);background-repeat:repeat-x;border-color:#b92c28}.btn-danger:focus,.btn-danger:hover{background-color:#c12e2a;background-position:0 -15px}.btn-danger.active,.btn-danger:active{background-color:#c12e2a;border-color:#b92c28}.btn-danger.disabled,.btn-danger.disabled.active,.btn-danger.disabled.focus,.btn-danger.disabled:active,.btn-danger.disabled:focus,.btn-danger.disabled:hover,.btn-danger[disabled],.btn-danger[disabled].active,.btn-danger[disabled].focus,.btn-danger[disabled]:active,.btn-danger[disabled]:focus,.btn-danger[disabled]:hover,fieldset[disabled] .btn-danger,fieldset[disabled] .btn-danger.active,fieldset[disabled] .btn-danger.focus,fieldset[disabled] .btn-danger:active,fieldset[disabled] .btn-danger:focus,fieldset[disabled] .btn-danger:hover{background-color:#c12e2a;background-image:none}.img-thumbnail,.thumbnail{-webkit-box-shadow:0 1px 2px rgba(0,0,0,.075);box-shadow:0 1px 2px rgba(0,0,0,.075)}.dropdown-menu>li>a:focus,.dropdown-menu>li>a:hover{background-color:#e8e8e8;background-image:-webkit-linear-gradient(top,#f5f5f5 0,#e8e8e8 100%);background-image:-o-linear-gradient(top,#f5f5f5 0,#e8e8e8 100%);background-image:-webkit-gradient(linear,left top,left bottom,from(#f5f5f5),to(#e8e8e8));background-image:linear-gradient(to bottom,#f5f5f5 0,#e8e8e8 100%);filter:progid:DXImageTransform.Microsoft.gradient(startColorstr='#fff5f5f5', endColorstr='#ffe8e8e8', GradientType=0);background-repeat:repeat-x}.dropdown-menu>.active>a,.dropdown-menu>.active>a:focus,.dropdown-menu>.active>a:hover{background-color:#2e6da4;background-image:-webkit-linear-gradient(top,#337ab7 0,#2e6da4 100%);background-image:-o-linear-gradient(top,#337ab7 0,#2e6da4 100%);background-image:-webkit-gradient(linear,left top,left bottom,from(#337ab7),to(#2e6da4));background-image:linear-gradient(to bottom,#337ab7 0,#2e6da4 100%);filter:progid:DXImageTransform.Microsoft.gradient(startColorstr='#ff337ab7', endColorstr='#ff2e6da4', GradientType=0);background-repeat:repeat-x}.navbar-default{background-image:-webkit-linear-gradient(top,#fff 0,#f8f8f8 
100%);background-image:-o-linear-gradient(top,#fff 0,#f8f8f8 100%);background-image:-webkit-gradient(linear,left top,left bottom,from(#fff),to(#f8f8f8));background-image:linear-gradient(to bottom,#fff 0,#f8f8f8 100%);filter:progid:DXImageTransform.Microsoft.gradient(startColorstr='#ffffffff', endColorstr='#fff8f8f8', GradientType=0);filter:progid:DXImageTransform.Microsoft.gradient(enabled=false);background-repeat:repeat-x;border-radius:4px;-webkit-box-shadow:inset 0 1px 0 rgba(255,255,255,.15),0 1px 5px rgba(0,0,0,.075);box-shadow:inset 0 1px 0 rgba(255,255,255,.15),0 1px 5px rgba(0,0,0,.075)}.navbar-default .navbar-nav>.active>a,.navbar-default .navbar-nav>.open>a{background-image:-webkit-linear-gradient(top,#dbdbdb 0,#e2e2e2 100%);background-image:-o-linear-gradient(top,#dbdbdb 0,#e2e2e2 100%);background-image:-webkit-gradient(linear,left top,left bottom,from(#dbdbdb),to(#e2e2e2));background-image:linear-gradient(to bottom,#dbdbdb 0,#e2e2e2 100%);filter:progid:DXImageTransform.Microsoft.gradient(startColorstr='#ffdbdbdb', endColorstr='#ffe2e2e2', GradientType=0);background-repeat:repeat-x;-webkit-box-shadow:inset 0 3px 9px rgba(0,0,0,.075);box-shadow:inset 0 3px 9px rgba(0,0,0,.075)}.navbar-brand,.navbar-nav>li>a{text-shadow:0 1px 0 rgba(255,255,255,.25)}.navbar-inverse{background-image:-webkit-linear-gradient(top,#3c3c3c 0,#222 100%);background-image:-o-linear-gradient(top,#3c3c3c 0,#222 100%);background-image:-webkit-gradient(linear,left top,left bottom,from(#3c3c3c),to(#222));background-image:linear-gradient(to bottom,#3c3c3c 0,#222 100%);filter:progid:DXImageTransform.Microsoft.gradient(startColorstr='#ff3c3c3c', endColorstr='#ff222222', GradientType=0);filter:progid:DXImageTransform.Microsoft.gradient(enabled=false);background-repeat:repeat-x;border-radius:4px}.navbar-inverse .navbar-nav>.active>a,.navbar-inverse .navbar-nav>.open>a{background-image:-webkit-linear-gradient(top,#080808 0,#0f0f0f 100%);background-image:-o-linear-gradient(top,#080808 0,#0f0f0f 100%);background-image:-webkit-gradient(linear,left top,left bottom,from(#080808),to(#0f0f0f));background-image:linear-gradient(to bottom,#080808 0,#0f0f0f 100%);filter:progid:DXImageTransform.Microsoft.gradient(startColorstr='#ff080808', endColorstr='#ff0f0f0f', GradientType=0);background-repeat:repeat-x;-webkit-box-shadow:inset 0 3px 9px rgba(0,0,0,.25);box-shadow:inset 0 3px 9px rgba(0,0,0,.25)}.navbar-inverse .navbar-brand,.navbar-inverse .navbar-nav>li>a{text-shadow:0 -1px 0 rgba(0,0,0,.25)}.navbar-fixed-bottom,.navbar-fixed-top,.navbar-static-top{border-radius:0}@media (max-width:767px){.navbar .navbar-nav .open .dropdown-menu>.active>a,.navbar .navbar-nav .open .dropdown-menu>.active>a:focus,.navbar .navbar-nav .open .dropdown-menu>.active>a:hover{color:#fff;background-image:-webkit-linear-gradient(top,#337ab7 0,#2e6da4 100%);background-image:-o-linear-gradient(top,#337ab7 0,#2e6da4 100%);background-image:-webkit-gradient(linear,left top,left bottom,from(#337ab7),to(#2e6da4));background-image:linear-gradient(to bottom,#337ab7 0,#2e6da4 100%);filter:progid:DXImageTransform.Microsoft.gradient(startColorstr='#ff337ab7', endColorstr='#ff2e6da4', GradientType=0);background-repeat:repeat-x}}.alert{text-shadow:0 1px 0 rgba(255,255,255,.2);-webkit-box-shadow:inset 0 1px 0 rgba(255,255,255,.25),0 1px 2px rgba(0,0,0,.05);box-shadow:inset 0 1px 0 rgba(255,255,255,.25),0 1px 2px rgba(0,0,0,.05)}.alert-success{background-image:-webkit-linear-gradient(top,#dff0d8 0,#c8e5bc 100%);background-image:-o-linear-gradient(top,#dff0d8 0,#c8e5bc 
100%);background-image:-webkit-gradient(linear,left top,left bottom,from(#dff0d8),to(#c8e5bc));background-image:linear-gradient(to bottom,#dff0d8 0,#c8e5bc 100%);filter:progid:DXImageTransform.Microsoft.gradient(startColorstr='#ffdff0d8', endColorstr='#ffc8e5bc', GradientType=0);background-repeat:repeat-x;border-color:#b2dba1}.alert-info{background-image:-webkit-linear-gradient(top,#d9edf7 0,#b9def0 100%);background-image:-o-linear-gradient(top,#d9edf7 0,#b9def0 100%);background-image:-webkit-gradient(linear,left top,left bottom,from(#d9edf7),to(#b9def0));background-image:linear-gradient(to bottom,#d9edf7 0,#b9def0 100%);filter:progid:DXImageTransform.Microsoft.gradient(startColorstr='#ffd9edf7', endColorstr='#ffb9def0', GradientType=0);background-repeat:repeat-x;border-color:#9acfea}.alert-warning{background-image:-webkit-linear-gradient(top,#fcf8e3 0,#f8efc0 100%);background-image:-o-linear-gradient(top,#fcf8e3 0,#f8efc0 100%);background-image:-webkit-gradient(linear,left top,left bottom,from(#fcf8e3),to(#f8efc0));background-image:linear-gradient(to bottom,#fcf8e3 0,#f8efc0 100%);filter:progid:DXImageTransform.Microsoft.gradient(startColorstr='#fffcf8e3', endColorstr='#fff8efc0', GradientType=0);background-repeat:repeat-x;border-color:#f5e79e}.alert-danger{background-image:-webkit-linear-gradient(top,#f2dede 0,#e7c3c3 100%);background-image:-o-linear-gradient(top,#f2dede 0,#e7c3c3 100%);background-image:-webkit-gradient(linear,left top,left bottom,from(#f2dede),to(#e7c3c3));background-image:linear-gradient(to bottom,#f2dede 0,#e7c3c3 100%);filter:progid:DXImageTransform.Microsoft.gradient(startColorstr='#fff2dede', endColorstr='#ffe7c3c3', GradientType=0);background-repeat:repeat-x;border-color:#dca7a7}.progress{background-image:-webkit-linear-gradient(top,#ebebeb 0,#f5f5f5 100%);background-image:-o-linear-gradient(top,#ebebeb 0,#f5f5f5 100%);background-image:-webkit-gradient(linear,left top,left bottom,from(#ebebeb),to(#f5f5f5));background-image:linear-gradient(to bottom,#ebebeb 0,#f5f5f5 100%);filter:progid:DXImageTransform.Microsoft.gradient(startColorstr='#ffebebeb', endColorstr='#fff5f5f5', GradientType=0);background-repeat:repeat-x}.progress-bar{background-image:-webkit-linear-gradient(top,#337ab7 0,#286090 100%);background-image:-o-linear-gradient(top,#337ab7 0,#286090 100%);background-image:-webkit-gradient(linear,left top,left bottom,from(#337ab7),to(#286090));background-image:linear-gradient(to bottom,#337ab7 0,#286090 100%);filter:progid:DXImageTransform.Microsoft.gradient(startColorstr='#ff337ab7', endColorstr='#ff286090', GradientType=0);background-repeat:repeat-x}.progress-bar-success{background-image:-webkit-linear-gradient(top,#5cb85c 0,#449d44 100%);background-image:-o-linear-gradient(top,#5cb85c 0,#449d44 100%);background-image:-webkit-gradient(linear,left top,left bottom,from(#5cb85c),to(#449d44));background-image:linear-gradient(to bottom,#5cb85c 0,#449d44 100%);filter:progid:DXImageTransform.Microsoft.gradient(startColorstr='#ff5cb85c', endColorstr='#ff449d44', GradientType=0);background-repeat:repeat-x}.progress-bar-info{background-image:-webkit-linear-gradient(top,#5bc0de 0,#31b0d5 100%);background-image:-o-linear-gradient(top,#5bc0de 0,#31b0d5 100%);background-image:-webkit-gradient(linear,left top,left bottom,from(#5bc0de),to(#31b0d5));background-image:linear-gradient(to bottom,#5bc0de 0,#31b0d5 100%);filter:progid:DXImageTransform.Microsoft.gradient(startColorstr='#ff5bc0de', endColorstr='#ff31b0d5', 
GradientType=0);background-repeat:repeat-x}.progress-bar-warning{background-image:-webkit-linear-gradient(top,#f0ad4e 0,#ec971f 100%);background-image:-o-linear-gradient(top,#f0ad4e 0,#ec971f 100%);background-image:-webkit-gradient(linear,left top,left bottom,from(#f0ad4e),to(#ec971f));background-image:linear-gradient(to bottom,#f0ad4e 0,#ec971f 100%);filter:progid:DXImageTransform.Microsoft.gradient(startColorstr='#fff0ad4e', endColorstr='#ffec971f', GradientType=0);background-repeat:repeat-x}.progress-bar-danger{background-image:-webkit-linear-gradient(top,#d9534f 0,#c9302c 100%);background-image:-o-linear-gradient(top,#d9534f 0,#c9302c 100%);background-image:-webkit-gradient(linear,left top,left bottom,from(#d9534f),to(#c9302c));background-image:linear-gradient(to bottom,#d9534f 0,#c9302c 100%);filter:progid:DXImageTransform.Microsoft.gradient(startColorstr='#ffd9534f', endColorstr='#ffc9302c', GradientType=0);background-repeat:repeat-x}.progress-bar-striped{background-image:-webkit-linear-gradient(45deg,rgba(255,255,255,.15) 25%,transparent 25%,transparent 50%,rgba(255,255,255,.15) 50%,rgba(255,255,255,.15) 75%,transparent 75%,transparent);background-image:-o-linear-gradient(45deg,rgba(255,255,255,.15) 25%,transparent 25%,transparent 50%,rgba(255,255,255,.15) 50%,rgba(255,255,255,.15) 75%,transparent 75%,transparent);background-image:linear-gradient(45deg,rgba(255,255,255,.15) 25%,transparent 25%,transparent 50%,rgba(255,255,255,.15) 50%,rgba(255,255,255,.15) 75%,transparent 75%,transparent)}.list-group{border-radius:4px;-webkit-box-shadow:0 1px 2px rgba(0,0,0,.075);box-shadow:0 1px 2px rgba(0,0,0,.075)}.list-group-item.active,.list-group-item.active:focus,.list-group-item.active:hover{text-shadow:0 -1px 0 #286090;background-image:-webkit-linear-gradient(top,#337ab7 0,#2b669a 100%);background-image:-o-linear-gradient(top,#337ab7 0,#2b669a 100%);background-image:-webkit-gradient(linear,left top,left bottom,from(#337ab7),to(#2b669a));background-image:linear-gradient(to bottom,#337ab7 0,#2b669a 100%);filter:progid:DXImageTransform.Microsoft.gradient(startColorstr='#ff337ab7', endColorstr='#ff2b669a', GradientType=0);background-repeat:repeat-x;border-color:#2b669a}.list-group-item.active .badge,.list-group-item.active:focus .badge,.list-group-item.active:hover .badge{text-shadow:none}.panel{-webkit-box-shadow:0 1px 2px rgba(0,0,0,.05);box-shadow:0 1px 2px rgba(0,0,0,.05)}.panel-default>.panel-heading{background-image:-webkit-linear-gradient(top,#f5f5f5 0,#e8e8e8 100%);background-image:-o-linear-gradient(top,#f5f5f5 0,#e8e8e8 100%);background-image:-webkit-gradient(linear,left top,left bottom,from(#f5f5f5),to(#e8e8e8));background-image:linear-gradient(to bottom,#f5f5f5 0,#e8e8e8 100%);filter:progid:DXImageTransform.Microsoft.gradient(startColorstr='#fff5f5f5', endColorstr='#ffe8e8e8', GradientType=0);background-repeat:repeat-x}.panel-primary>.panel-heading{background-image:-webkit-linear-gradient(top,#337ab7 0,#2e6da4 100%);background-image:-o-linear-gradient(top,#337ab7 0,#2e6da4 100%);background-image:-webkit-gradient(linear,left top,left bottom,from(#337ab7),to(#2e6da4));background-image:linear-gradient(to bottom,#337ab7 0,#2e6da4 100%);filter:progid:DXImageTransform.Microsoft.gradient(startColorstr='#ff337ab7', endColorstr='#ff2e6da4', GradientType=0);background-repeat:repeat-x}.panel-success>.panel-heading{background-image:-webkit-linear-gradient(top,#dff0d8 0,#d0e9c6 100%);background-image:-o-linear-gradient(top,#dff0d8 0,#d0e9c6 100%);background-image:-webkit-gradient(linear,left 
top,left bottom,from(#dff0d8),to(#d0e9c6));background-image:linear-gradient(to bottom,#dff0d8 0,#d0e9c6 100%);filter:progid:DXImageTransform.Microsoft.gradient(startColorstr='#ffdff0d8', endColorstr='#ffd0e9c6', GradientType=0);background-repeat:repeat-x}.panel-info>.panel-heading{background-image:-webkit-linear-gradient(top,#d9edf7 0,#c4e3f3 100%);background-image:-o-linear-gradient(top,#d9edf7 0,#c4e3f3 100%);background-image:-webkit-gradient(linear,left top,left bottom,from(#d9edf7),to(#c4e3f3));background-image:linear-gradient(to bottom,#d9edf7 0,#c4e3f3 100%);filter:progid:DXImageTransform.Microsoft.gradient(startColorstr='#ffd9edf7', endColorstr='#ffc4e3f3', GradientType=0);background-repeat:repeat-x}.panel-warning>.panel-heading{background-image:-webkit-linear-gradient(top,#fcf8e3 0,#faf2cc 100%);background-image:-o-linear-gradient(top,#fcf8e3 0,#faf2cc 100%);background-image:-webkit-gradient(linear,left top,left bottom,from(#fcf8e3),to(#faf2cc));background-image:linear-gradient(to bottom,#fcf8e3 0,#faf2cc 100%);filter:progid:DXImageTransform.Microsoft.gradient(startColorstr='#fffcf8e3', endColorstr='#fffaf2cc', GradientType=0);background-repeat:repeat-x}.panel-danger>.panel-heading{background-image:-webkit-linear-gradient(top,#f2dede 0,#ebcccc 100%);background-image:-o-linear-gradient(top,#f2dede 0,#ebcccc 100%);background-image:-webkit-gradient(linear,left top,left bottom,from(#f2dede),to(#ebcccc));background-image:linear-gradient(to bottom,#f2dede 0,#ebcccc 100%);filter:progid:DXImageTransform.Microsoft.gradient(startColorstr='#fff2dede', endColorstr='#ffebcccc', GradientType=0);background-repeat:repeat-x}.well{background-image:-webkit-linear-gradient(top,#e8e8e8 0,#f5f5f5 100%);background-image:-o-linear-gradient(top,#e8e8e8 0,#f5f5f5 100%);background-image:-webkit-gradient(linear,left top,left bottom,from(#e8e8e8),to(#f5f5f5));background-image:linear-gradient(to bottom,#e8e8e8 0,#f5f5f5 100%);filter:progid:DXImageTransform.Microsoft.gradient(startColorstr='#ffe8e8e8', endColorstr='#fff5f5f5', GradientType=0);background-repeat:repeat-x;border-color:#dcdcdc;-webkit-box-shadow:inset 0 1px 3px rgba(0,0,0,.05),0 1px 0 rgba(255,255,255,.1);box-shadow:inset 0 1px 3px rgba(0,0,0,.05),0 1px 0 rgba(255,255,255,.1)} 6 | /*# sourceMappingURL=bootstrap-theme.min.css.map */ -------------------------------------------------------------------------------- /webspider/web/static/bootstrap/fonts/glyphicons-halflings-regular.eot: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JustForFunnnn/webspider/e41e94fb4f269ecab61cc19c76331c7667151cfb/webspider/web/static/bootstrap/fonts/glyphicons-halflings-regular.eot -------------------------------------------------------------------------------- /webspider/web/static/bootstrap/fonts/glyphicons-halflings-regular.ttf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JustForFunnnn/webspider/e41e94fb4f269ecab61cc19c76331c7667151cfb/webspider/web/static/bootstrap/fonts/glyphicons-halflings-regular.ttf -------------------------------------------------------------------------------- /webspider/web/static/bootstrap/fonts/glyphicons-halflings-regular.woff: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JustForFunnnn/webspider/e41e94fb4f269ecab61cc19c76331c7667151cfb/webspider/web/static/bootstrap/fonts/glyphicons-halflings-regular.woff 
-------------------------------------------------------------------------------- /webspider/web/static/bootstrap/fonts/glyphicons-halflings-regular.woff2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JustForFunnnn/webspider/e41e94fb4f269ecab61cc19c76331c7667151cfb/webspider/web/static/bootstrap/fonts/glyphicons-halflings-regular.woff2 -------------------------------------------------------------------------------- /webspider/web/static/bootstrap/js/npm.js: -------------------------------------------------------------------------------- 1 | // This file is autogenerated via the `commonjs` Grunt task. You can require() this file in a CommonJS environment. 2 | require('../../js/transition.js') 3 | require('../../js/alert.js') 4 | require('../../js/button.js') 5 | require('../../js/carousel.js') 6 | require('../../js/collapse.js') 7 | require('../../js/dropdown.js') 8 | require('../../js/modal.js') 9 | require('../../js/tooltip.js') 10 | require('../../js/popover.js') 11 | require('../../js/scrollspy.js') 12 | require('../../js/tab.js') 13 | require('../../js/affix.js') -------------------------------------------------------------------------------- /webspider/web/static/css/mystyle.css: -------------------------------------------------------------------------------- 1 | body { 2 | font-family: "Hiragino Sans GB", "Microsoft Yahei", SimSun, Arial, "Helvetica Neue", Helvetica; 3 | color: #333; 4 | word-wrap: break-word; 5 | -webkit-font-smoothing: antialiased; 6 | font-size: 14px; 7 | } 8 | 9 | footer { 10 | font-size: 14px; 11 | border-radius: 5px; 12 | margin: 0 auto; 13 | width: 100%; 14 | text-align: center; 15 | padding: 10px 0; 16 | } 17 | 18 | .main-body { 19 | min-height: 780px; 20 | } 21 | 22 | .chart-div { 23 | width: 550px; 24 | height: 400px; 25 | padding-top: 30px; 26 | margin: 0 auto; 27 | } 28 | 29 | .large-chart-div { 30 | width: 1000px; 31 | height: 700px; 32 | padding-top: 30px; 33 | margin: 0 auto; 34 | } -------------------------------------------------------------------------------- /webspider/web/static/img/favicon.ico: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JustForFunnnn/webspider/e41e94fb4f269ecab61cc19c76331c7667151cfb/webspider/web/static/img/favicon.ico -------------------------------------------------------------------------------- /webspider/web/templates/404.html: -------------------------------------------------------------------------------- 1 | {% extends "base.html" %} 2 | {% block body %} 3 |
4 | You have arrived at a wasteland without knowledge _(:з」∠)_
5 | {% end %} -------------------------------------------------------------------------------- /webspider/web/templates/500.html: -------------------------------------------------------------------------------- 1 | {% extends "base.html" %} 2 | {% block body %} 3 |
4 | The server asked a question _(:з」∠)_
5 | {% end %} -------------------------------------------------------------------------------- /webspider/web/templates/base.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | Web Spider|JustForFunnn 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | {% block header %}{% end %} 18 | 19 | 20 | 36 | 37 |
38 | {% block body %} 39 | {% end %} 40 |
41 | 42 |
43 |
44 |
45 |

46 | Designed by    47 | JustForFunnn  48 | 49 |

50 |
51 |
52 | 53 | 54 | 55 | -------------------------------------------------------------------------------- /webspider/web/templates/city-jobs-count-chart-module.html: -------------------------------------------------------------------------------- 1 |
2 | 3 | -------------------------------------------------------------------------------- /webspider/web/templates/education-chart-module.html: -------------------------------------------------------------------------------- 1 |
2 | 3 | -------------------------------------------------------------------------------- /webspider/web/templates/finance-stage-chart-module.html: -------------------------------------------------------------------------------- 1 |
2 | 3 | -------------------------------------------------------------------------------- /webspider/web/templates/pagination-module.html: -------------------------------------------------------------------------------- 1 |
2 |
    3 |
4 | 5 | « 6 | 7 |
8 | {% for p in pagination.iter_pages() %} 9 | {% if p %} 10 |
11 | {{ p }} 12 |
13 | {% else %} 14 |
15 | {% end %} 16 | {% end %} 17 |
18 | 19 | » 20 | 21 |
22 |
23 |
24 | 25 | -------------------------------------------------------------------------------- /webspider/web/templates/per-day-jobs-count-chart-module.html: -------------------------------------------------------------------------------- 1 |
2 | 3 | -------------------------------------------------------------------------------- /webspider/web/templates/salary-chart-module.html: -------------------------------------------------------------------------------- 1 |
2 | 3 | -------------------------------------------------------------------------------- /webspider/web/templates/statistics.html: -------------------------------------------------------------------------------- 1 | {% extends "base.html" %} 2 | {% block body %} 3 |
4 | About {{ keyword_name }}:
6 |
7 | {% include 'work-year-chart-module.html' %} 8 | {% include 'salary-chart-module.html' %} 9 | {% include 'city-jobs-count-chart-module.html' %} 10 | {% include 'education-chart-module.html' %} 11 | {% include 'per-day-jobs-count-chart-module.html' %} 12 | {% include 'finance-stage-chart-module.html' %} 13 | 14 |
15 | {% end %} 16 | 17 | {% block header %} 18 | 19 | {% end %} -------------------------------------------------------------------------------- /webspider/web/templates/work-year-chart-module.html: -------------------------------------------------------------------------------- 1 |
2 | 3 | -------------------------------------------------------------------------------- /webspider/web/urls.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from tornado.web import URLSpec, RedirectHandler 3 | 4 | from webspider.web.handlers import KeywordStatisticsApiHandler, KeywordStatisticsPageHandler 5 | 6 | url_handlers = [ 7 | URLSpec(r"/", RedirectHandler, {'url': '/statistics?keyword_name=python'}), 8 | URLSpec(r"/api/statistics", KeywordStatisticsApiHandler), 9 | URLSpec(r"/statistics", KeywordStatisticsPageHandler), 10 | ] 11 | --------------------------------------------------------------------------------
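
For context, here is a minimal sketch of how these url_handlers would typically be mounted in a Tornado application. The project's real entry point is webspider/web/app.py (not reproduced above); the port, template_path, and static_path values below are illustrative assumptions based on the repository layout, not the project's actual configuration.

# -*- coding: utf-8 -*-
# Illustrative sketch only -- not the project's actual app.py.
import os

import tornado.ioloop
import tornado.web

from webspider.web.urls import url_handlers

BASE_DIR = os.path.dirname(os.path.abspath(__file__))


def make_app():
    # Register the URLSpec routes defined in webspider/web/urls.py.
    # template_path/static_path are assumptions matching the tree above.
    return tornado.web.Application(
        url_handlers,
        template_path=os.path.join(BASE_DIR, 'templates'),
        static_path=os.path.join(BASE_DIR, 'static'),
    )


if __name__ == '__main__':
    make_app().listen(8888)  # example port, chosen arbitrarily
    tornado.ioloop.IOLoop.current().start()

With this wiring, a GET to / is redirected by RedirectHandler to /statistics?keyword_name=python, while /statistics and /api/statistics are served by KeywordStatisticsPageHandler and KeywordStatisticsApiHandler respectively.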