├── .gitattributes
├── .gitignore
├── .landscape.yml
├── .travis.yml
├── LICENSE
├── Makefile
├── README.md
├── job-chart.jpg
├── requirements.txt
├── setup.cfg
├── setup.py
├── tests
│   ├── __init__.py
│   ├── base.py
│   ├── fixture
│   │   ├── city.sql
│   │   ├── company.sql
│   │   ├── industry.sql
│   │   ├── job.sql
│   │   ├── job_keyword.sql
│   │   ├── jobs_count.sql
│   │   ├── keyword.sql
│   │   └── keyword_statistic.sql
│   ├── schema.sql
│   ├── test_controllers
│   │   ├── __init__.py
│   │   ├── test_city_ctl.py
│   │   ├── test_industry_ctl.py
│   │   ├── test_job_ctl.py
│   │   ├── test_job_keyword_ctl.py
│   │   ├── test_keyword_ctl.py
│   │   └── test_keyword_statistic_ctl.py
│   ├── test_models
│   │   └── test_job.py
│   ├── test_utils
│   │   ├── test_cache.py
│   │   ├── test_classproperty.py
│   │   ├── test_common.py
│   │   ├── test_convert.py
│   │   ├── test_http_tools.py
│   │   ├── test_pagination.py
│   │   ├── test_text.py
│   │   └── test_time_tools.py
│   ├── test_web
│   │   ├── base.py
│   │   ├── test_formatter.py
│   │   └── test_keyword_statistic.py
│   └── util.py
└── webspider
    ├── __init__.py
    ├── constants.py
    ├── controllers
    │   ├── __init__.py
    │   ├── city_ctl.py
    │   ├── industry_ctl.py
    │   ├── job_ctl.py
    │   ├── job_keyword_ctl.py
    │   ├── keyword_ctl.py
    │   └── keyword_statistic_ctl.py
    ├── crawlers
    │   ├── __init__.py
    │   ├── lagou_cites.py
    │   ├── lagou_companies.py
    │   ├── lagou_jobs.py
    │   └── lagou_jobs_count.py
    ├── exceptions.py
    ├── models
    │   ├── __init__.py
    │   ├── base.py
    │   ├── city.py
    │   ├── company.py
    │   ├── company_industry.py
    │   ├── industry.py
    │   ├── job.py
    │   ├── job_keyword.py
    │   ├── jobs_count.py
    │   ├── keyword.py
    │   └── keyword_statistic.py
    ├── quickly_cmd.py
    ├── setting.py
    ├── tasks
    │   ├── __init__.py
    │   ├── actor
    │   │   ├── __init__.py
    │   │   ├── keyword_statistic.py
    │   │   ├── lagou_data.py
    │   │   └── lagou_jobs_count.py
    │   ├── celery_app.py
    │   └── celery_config.py
    ├── utils
    │   ├── __init__.py
    │   ├── cache.py
    │   ├── classproperty.py
    │   ├── common.py
    │   ├── convert.py
    │   ├── http_tools.py
    │   ├── log.py
    │   ├── pagination.py
    │   ├── sql.py
    │   ├── text.py
    │   └── time_tools.py
    └── web
        ├── __init__.py
        ├── app.py
        ├── formatter
        │   ├── __init__.py
        │   ├── base.py
        │   ├── jobs_count.py
        │   └── keyword_statistic.py
        ├── handlers
        │   ├── __init__.py
        │   ├── base.py
        │   └── keyword_statistics.py
        ├── static
        │   ├── __init__.py
        │   ├── bootstrap
        │   │   ├── css
        │   │   │   ├── bootstrap-theme.css
        │   │   │   ├── bootstrap-theme.css.map
        │   │   │   ├── bootstrap-theme.min.css
        │   │   │   ├── bootstrap-theme.min.css.map
        │   │   │   ├── bootstrap.css
        │   │   │   ├── bootstrap.css.map
        │   │   │   ├── bootstrap.min.css
        │   │   │   └── bootstrap.min.css.map
        │   │   ├── fonts
        │   │   │   ├── glyphicons-halflings-regular.eot
        │   │   │   ├── glyphicons-halflings-regular.svg
        │   │   │   ├── glyphicons-halflings-regular.ttf
        │   │   │   ├── glyphicons-halflings-regular.woff
        │   │   │   └── glyphicons-halflings-regular.woff2
        │   │   └── js
        │   │       ├── bootstrap.js
        │   │       ├── bootstrap.min.js
        │   │       └── npm.js
        │   ├── css
        │   │   └── mystyle.css
        │   ├── img
        │   │   └── favicon.ico
        │   └── js
        │       ├── echarts.js
        │       ├── echarts.min.js
        │       └── jquery.min.js
        ├── templates
        │   ├── 404.html
        │   ├── 500.html
        │   ├── base.html
        │   ├── city-jobs-count-chart-module.html
        │   ├── education-chart-module.html
        │   ├── finance-stage-chart-module.html
        │   ├── pagination-module.html
        │   ├── per-day-jobs-count-chart-module.html
        │   ├── salary-chart-module.html
        │   ├── statistics.html
        │   └── work-year-chart-module.html
        └── urls.py
/.gitattributes:
--------------------------------------------------------------------------------
1 | *.js linguist-language=python
2 | *.css linguist-language=python
3 | *.html linguist-language=python
4 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | env/
12 | build/
13 | develop-eggs/
14 | dist/
15 | downloads/
16 | eggs/
17 | .eggs/
18 | lib/
19 | lib64/
20 | parts/
21 | sdist/
22 | var/
23 | *.egg-info/
24 | .installed.cfg
25 | *.egg
26 |
27 | # PyInstaller
28 | # Usually these files are written by a python script from a template
29 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
30 | *.manifest
31 | *.spec
32 |
33 | # Installer logs
34 | pip-log.txt
35 | pip-delete-this-directory.txt
36 |
37 | # Unit test / coverage reports
38 | htmlcov/
39 | .tox/
40 | .coverage
41 | .coverage.*
42 | .cache
43 | nosetests.xml
44 | coverage.xml
45 | *,cover
46 | .hypothesis/
47 |
48 | # Translations
49 | *.mo
50 | *.pot
51 |
52 | # Django stuff:
53 | *.log
54 | local_settings.py
55 |
56 | # Flask stuff:
57 | instance/
58 | .webassets-cache
59 |
60 | # Scrapy stuff:
61 | .scrapy
62 |
63 | # Sphinx documentation
64 | docs/_build/
65 |
66 | # PyBuilder
67 | target/
68 |
69 | # IPython Notebook
70 | .ipynb_checkpoints
71 |
72 | # pyenv
73 | .python-version
74 |
75 | # celery beat schedule file
76 | celerybeat-schedule
77 |
78 | # dotenv
79 | .env
80 |
81 | # virtualenv
82 | venv/
83 | ENV/
84 |
85 | # Spyder project settings
86 | .spyderproject
87 |
88 | # Rope project settings
89 | .ropeproject
90 | .idea/
91 | bin/
92 | spider_log.txt
93 | dump.rdb
94 | .DS_Store
95 | cover/
96 | celerybeat.pid
97 | oj.py
98 | /webspider/log
99 | /webspider/security_constants.py
100 | celerybeat-schedule
101 | cove
102 | nohup.out
103 |
--------------------------------------------------------------------------------
/.landscape.yml:
--------------------------------------------------------------------------------
1 | autodetect: yes
2 | test-warnings: true
3 | doc-warnings: true
4 | strictness: veryhigh
5 | max-line-length: 120
6 | python-targets: 3
7 |
8 | uses:
9 | - celery
10 |
11 | ignore-paths:
12 | - .git
13 | - coverage
14 | - env
15 | - test
16 | - webspider/web/templates
17 | - webspider/web/static
18 |
19 | pep8:
20 | run: true
21 | disable:
22 | - W291
23 | - E501
24 |
25 | pyflakes:
26 | run: true
27 |
28 | inherits: [flake8]
29 |
30 | requirements:
31 | - requirements.txt
32 |
33 |
--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
1 | language: python
2 |
3 | sudo: false
4 |
5 | python:
6 | - "3.6"
7 |
8 | services:
9 | - mysql
10 | - redis-server
11 |
12 | before_install:
13 | - mysql -e 'CREATE DATABASE spider CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci;'
14 |
15 | install:
16 | - make
17 |
18 | script:
19 | - make test
20 |
21 | after_success:
22 | - env/bin/codecov
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2017
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
1 | PYTHON:=$(shell which python3)
2 |
3 | all: python
4 |
5 | .PHONY: clean python test flake8
6 |
7 | python: setup.py requirements.txt
8 | pip install virtualenv
9 | echo "\n Creating python virtual environment......\n"
10 | virtualenv -p $(PYTHON) env
11 | echo "\n Use python virtual environment to install required packages......\n"
12 | env/bin/pip install -e .
13 | mkdir -p webspider/log
14 | touch webspider/log/spider_log.txt
15 |
16 | test: flake8
17 | env/bin/nosetests -vd
18 |
19 | flake8:
20 | env/bin/flake8
21 |
22 | clean:
23 | -rm -rf env cover *eggs *.egg-info *.egg webspider/log
24 | @find . -type f -name "*.py[co]" -delete
25 | @find . -type d -name "__pycache__" -delete
26 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # webspider
2 |
3 | [](https://travis-ci.org/JustForFunnnn/webspider)
4 | [](https://codecov.io/gh/JustForFunnnn/webspider)
5 | [](https://landscape.io/github/JustForFunnnn/webspider/master)
6 | [](https://github.com/JustForFunnnn/webspider/blob/master/LICENSE)
7 | [](https://github.com/JustForFunnnn/webspider)
8 |
9 | | Item     | Value                                      |
10 | | -------- | ------------------------------------------ |
11 | | Version | 1.0.1 |
12 | | Website  | http://119.23.223.90:8000                  |
13 | | Source | https://github.com/JustForFunnnn/webspider |
14 | | Keywords | `Python3`, `Tornado`, `Celery`, `Requests` |
15 |
16 | ## Introduction
17 |
18 | This project crawls job and company data from job-seeking websites, cleans the data, models and converts it, and stores it in a database. It then uses [Echarts](https://echarts.apache.org/en/index.html) and [Bootstrap](https://getbootstrap.com/) to build a front-end page that displays IT job statistics, showing the latest requirements and trends of the IT job market.
19 |
20 | ## Demo
21 |
22 | You can type a keyword you are interested in, such as "Python", into the search box, then click the search button, and the statistics for this keyword will be displayed.
23 |
24 | * The first chart, `Years of Working (工作年限要求)`, shows the experience requirements for `Python` jobs. According to the data, "3 ~ 5 years" is the most frequent requirement, followed by `1 ~ 3 years` ([Chart Source Code](https://github.com/JustForFunnnn/webspider/blob/8664fdd135d0ee4322169e484ba9a35bc46032bf/webspider/web/templates/work-year-chart-module.html))
25 |
26 | * The second chart, `Salary Range (薪水分布)`, shows the salary distribution for `Python` jobs. According to the data, "11k ~ 20k" is the most frequently offered range, followed by `21k ~ 35k` ([Chart Source Code](https://github.com/JustForFunnnn/webspider/blob/8664fdd135d0ee4322169e484ba9a35bc46032bf/webspider/web/templates/salary-chart-module.html))
27 |
28 | There are also charts for:
29 | * [Education Requirement Data Chart](https://github.com/JustForFunnnn/webspider/blob/8664fdd135d0ee4322169e484ba9a35bc46032bf/webspider/web/templates/education-chart-module.html)
30 | * [City Job Count Chart](https://github.com/JustForFunnnn/webspider/blob/8664fdd135d0ee4322169e484ba9a35bc46032bf/webspider/web/templates/city-jobs-count-chart-module.html)
31 | * [Job Count Change Chart](https://github.com/JustForFunnnn/webspider/blob/8664fdd135d0ee4322169e484ba9a35bc46032bf/webspider/web/templates/per-day-jobs-count-chart-module.html)
32 | * [Company Finance Stage Chart](https://github.com/JustForFunnnn/webspider/blob/8664fdd135d0ee4322169e484ba9a35bc46032bf/webspider/web/templates/finance-stage-chart-module.html)
33 |
34 | Python Charts Example:
35 |
36 | 
37 |
38 | ## Quick Start
39 | > This tutorial is based on `Linux - Ubuntu`; for other systems, please use the corresponding commands
40 |
41 | * Clone the project
42 |
43 | ```bash
44 | git clone git@github.com:JustForFunnnn/webspider.git
45 | ```
46 |
47 | * Install `MySQL`, `Redis`, `Python3`
48 |
49 | ```bash
50 | # install Redis
51 | apt-get install redis-server
52 |
53 | # run Redis in background
54 | nohup redis-server &
55 |
56 | # install Python3
57 | apt-get install python3
58 |
59 | # install MySQL
60 | apt-get install mysql-server
61 |
62 | # start MySQL
63 | sudo service mysql start
64 | ```
65 |
66 | * Config database and table
67 | ```sql
68 | # create database
69 | CREATE DATABASE `spider` CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci;
70 | ```
71 | We still need to create the tables: copy the table definition SQL from `tests/schema.sql` and run it in MySQL (see the example below).
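For example, assuming a local MySQL `root` user (adjust the credentials and host to your setup), the schema can be loaded from the shell in one step:

```bash
# create the tables defined in tests/schema.sql in the spider database
mysql -u root -p spider < tests/schema.sql
```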
72 |
73 | * Build project
74 | ```bash
75 | # after a successful build, some executable commands will be generated under env/bin
76 | make
77 | ```
78 |
79 | * Run unit-test
80 | ```bash
81 | make test
82 | ```
83 |
84 | * Run code style check
85 | ```bash
86 | make flake8
87 | ```
88 |
89 | * Start web service
90 | ```bash
91 | env/bin/web
92 | ```
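If the service starts correctly, the statistics page should be reachable locally. A quick check (this assumes the web app listens on port 8000, like the demo site above):

```bash
curl http://localhost:8000/
```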
93 |
94 | * Start crawler
95 | ```bash
96 | # run task scheduler/dispatcher
97 | env/bin/celery_beat
98 | # run celery worker for job data
99 | env/bin/celery_lg_jobs_data_worker
100 | # run celery worker for job count
101 | env/bin/celery_lg_jobs_count_worker
102 | ```
103 |
104 | * Other jobs
105 | ```bash
106 | # start crawling job count immediately
107 | env/bin/crawl_lg_jobs_count
108 | # start crawling job data immediately
109 | env/bin/crawl_lg_data
110 | # start celery monitoring
111 | env/bin/celery_flower
112 | ```
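> Note: `celery_flower` starts [Flower](https://flower.readthedocs.io/), Celery's monitoring UI. Assuming the default configuration, its dashboard is served on port 5555 (http://localhost:5555).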
113 |
114 | * Clean
115 | ```bash
116 | # clean the existing build result
117 | make clean
118 | ```
119 |
--------------------------------------------------------------------------------
/job-chart.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JustForFunnnn/webspider/e41e94fb4f269ecab61cc19c76331c7667151cfb/job-chart.jpg
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | # All requirements put in this file
2 | # You **MUST** specify the package version in this file
3 |
4 | tornado == 4.5.3
5 | gevent == 1.2.2
6 | gunicorn == 19.7.1
7 | lxml == 4.1.0
8 | requests == 2.18.4
9 | mysqlclient == 1.3.12
10 | sqlalchemy == 1.2.2
11 | redis == 2.10.6
12 | python-redis == 0.1.7
13 | retrying == 1.3.3
14 | celery == 4.0.2
15 |
16 | flower == 0.9.2
17 | ipython == 6.2.1
18 | nose == 1.3.7
19 | coverage == 4.4.2
20 | flake8 == 3.5.0
21 | codecov == 2.0.15
--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
1 | [flake8]
2 | ignore = W291
3 | max-line-length = 120
4 | exclude =
5 | .git,
6 | eggs,
7 | env,
8 | tests
9 |
10 | [nosetests]
11 | logging-clear-handlers = 1
12 | with-coverage = 1
13 | cover-package = webspider
14 | cover-erase = 1
15 | logging-level = DEBUG
16 | cover-xml = 1
17 | cover-xml-file = coverage.xml
18 |
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | import os
4 | from setuptools import find_packages, setup
5 |
6 | from webspider import __version__
7 |
8 | # get the dependencies and installs
9 | here = os.path.abspath(os.path.dirname(__file__))
10 | with open(os.path.join(here, 'requirements.txt')) as f:
11 | all_requirements = f.read().split('\n')
12 |
13 | setup(
14 | name='webspider',
15 | version=__version__,
16 | license='MIT',
17 | author='JustForFunnn',
18 | author_email='',
19 | description='web spider',
20 | url='https://github.com/JustForFunnnn/webspider',
21 | packages=find_packages(exclude=['tests']),
22 | package_data={'webspider': ['README.md']},
23 | zip_safe=False,
24 | install_requires=all_requirements,
25 | entry_points={
26 | 'console_scripts': [
27 | 'web = webspider.web.app:main',
28 | 'production_web = webspider.quickly_cmd:run_web_app_by_gunicorn',
29 | 'crawl_lg_data = webspider.tasks.actor.lg_data:crawl_lg_data_task',
30 | 'crawl_lg_jobs_count = webspider.tasks.actor.lg_jobs_count:crawl_lg_jobs_count_task',
31 | # beat
32 | 'celery_beat = webspider.quickly_cmd:run_celery_beat',
33 |         'celery_flower = webspider.quickly_cmd:run_celery_flower',
34 | # worker
35 | 'celery_default_worker = webspider.quickly_cmd:run_celery_default_worker',
36 | 'celery_lg_data_worker = webspider.quickly_cmd:run_celery_lg_data_worker',
37 | 'celery_lg_jobs_data_worker = webspider.quickly_cmd:run_celery_lg_jobs_data_worker',
38 | 'celery_lg_jobs_count_worker = webspider.quickly_cmd:run_celery_lg_jobs_count_worker',
39 | ],
40 | }
41 | )
42 |
--------------------------------------------------------------------------------
/tests/__init__.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | import os
4 | from unittest import TestCase
5 |
6 | from webspider.utils.sql import get_session
7 | from tests.util import execute_sql_file, get_current_database_name
8 |
9 | here_dir = os.path.dirname(__file__)
10 |
11 |
12 | class BaseTestCase(TestCase):
13 | session = get_session()
14 |
15 | def setUp(self):
16 | test_db_name = 'test_spider'
17 |         # drop the test database if it exists
18 | self.session.execute("DROP DATABASE IF EXISTS {db_name};".format(db_name=test_db_name))
19 |         # create the test database
20 | self.session.execute("CREATE DATABASE {db_name} CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci;".format(
21 | db_name=test_db_name))
22 |         # switch to the test database test_spider
23 | self.session.execute("USE {db_name};".format(db_name=test_db_name))
24 |
25 | path = os.path.dirname(__file__)
26 |         # create the tables
27 | execute_sql_file(
28 | file_paths=[os.path.join(path, "schema.sql"), ],
29 | db_session=self.session,
30 | predictive_db_name=test_db_name
31 | )
32 | fixture_path = os.path.join(path, 'fixture')
33 |         # load the fixture data
34 | fixture_file_paths = [os.path.join(fixture_path, file) for file in os.listdir(fixture_path)]
35 | execute_sql_file(
36 | file_paths=fixture_file_paths,
37 | db_session=self.session,
38 | predictive_db_name=test_db_name
39 | )
40 | assert get_current_database_name(self.session) == test_db_name
41 |
42 |     def tearDown(self):
43 |         # drop the test database when the test finishes
44 | self.session.execute('DROP DATABASE test_spider;')
45 |
--------------------------------------------------------------------------------
/tests/base.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | import os
4 | from unittest import TestCase
5 |
6 | from webspider.utils.sql import get_session
7 | from tests.util import create_test_db, drop_test_db
8 |
9 | here_dir = os.path.dirname(__file__)
10 |
11 |
12 | class BaseTestCase(TestCase):
13 | session = get_session()
14 |
15 | def setUp(self):
16 | create_test_db(session=self.session)
17 |
18 | def tearDown(self):
19 |         # drop the test database when the test finishes
20 | drop_test_db(session=self.session)
21 |
--------------------------------------------------------------------------------
/tests/fixture/city.sql:
--------------------------------------------------------------------------------
1 | INSERT INTO `test_spider`.`city`(`id`, `name`)
2 | VALUES (2, '北京'),
3 | (3, '上海'),
4 | (4, '广州');
--------------------------------------------------------------------------------
/tests/fixture/company.sql:
--------------------------------------------------------------------------------
1 | INSERT INTO `company` (`id`, `lg_company_id`, `city_id`, `shortname`, `fullname`, `finance_stage`, `size`, `address`, `features`, `process_rate`, `introduce`, `advantage`, `created_at`, `updated_at`)
2 | VALUES
3 | (1, 168219, 2, '贝壳金控', '贝壳金控控股集团有限公司', 1, 5, '2017年5月,贝壳正式独立运作,是国内首家聚焦于居住领域的消费金融服务平台','不知道', 100, '我是简介', '[\"\\u4e13\\u9879\\u5956\\u91d1\", \"\\u5e74\\u7ec8\\u5206\\u7ea2\", \"\\u5b9a\\u671f\\u4f53\\u68c0\", \"\\u7ee9\\u6548\\u5956\\u91d1\", \"\\u5348\\u9910\\u8865\\u52a9\", \"\\u4ea4\\u901a\\u8865\\u52a9\"]', '2018-01-28 15:26:13', '2018-01-28 15:35:19'),
4 | (2, 142800, 2, '猫眼电影', '北京猫眼文化传媒有限公司', 1, 5, '北京朝阳区望京东路4号恒电大厦B座8层', '一网打尽好电影', 100, '猫眼电影简介\n猫眼电影(网站经营者:北京猫眼文化传媒有限公司)是美团。。。', '[]', '2018-01-28 15:26:13', '2018-01-28 15:35:19'),
5 | (3, 107435, 2, '熊猫直播', '上海熊猫互娱文化有限公司北京分公司', 3, 5, '北京朝阳区望京soho塔3,A座18层', '王思聪任CEO的视频直播平台', 100, '熊猫直播成立于2015年7月,由王思聪先生亲任CEO,并聚集了国内众多一线视频主播资源。', '[\"\\u5e74\\u5e95\\u53cc\\u85aa\", \"\\u5e26\\u85aa\\u5e74\\u5047\", \"\\u5348\\u9910\\u8865\\u52a9\", \"\\u7ee9\\u6548\\u5956\\u91d1\", \"\\u80a1\\u7968\\u671f\\u6743\"]', '2018-01-28 15:26:13', '2018-01-28 15:35:19');
6 |
--------------------------------------------------------------------------------
/tests/fixture/industry.sql:
--------------------------------------------------------------------------------
1 | INSERT INTO `test_spider`.`industry` (`id`, `name`, `created_at`, `updated_at`)
2 | VALUES
3 | (1000001, '开网吧', '2018-01-29 19:07:52', '2018-01-29 19:07:52'),
4 | (1000002, '开餐厅', '2018-01-29 19:07:52', '2018-01-29 19:07:52');
5 |
--------------------------------------------------------------------------------
/tests/fixture/job.sql:
--------------------------------------------------------------------------------
1 | INSERT INTO `test_spider`.`job` (`id`, `lg_job_id`, `city_id`, `company_id`, `title`, `work_year`, `department`, `salary`, `education`, `nature`, `description`, `advantage`, `created_at`, `updated_at`)
2 | VALUES
3 | (1, 10001, 2, 1, '高级前端开发工程师', 5, '贝壳金控交易研发部-交易前端组招聘', '15k-30k', 3, 1, '职位介绍A', '15薪,工作居住证,六险一金,双休', '2018-01-29 19:11:33', '2018-01-30 17:22:30'),
4 | (2, 10002, 4, 2, '前端开发工程师', 6, '贝壳金控技术产品中心招聘', '20k-40k', 3, 1, '职位介绍B', '高薪,大牛,六险一金,成长空间大', '2018-01-29 19:11:33', '2018-01-30 17:22:30'),
5 | (3, 10003, 4, 3, 'DBA工程师', 5, '贝壳金控运维技术部招聘', '15k-30k', 3, 1, '职位介绍C', '大牛,高薪,成长空间大,团队氛围好', '2018-01-29 19:11:33', '2018-01-30 17:22:30');
6 |
--------------------------------------------------------------------------------
/tests/fixture/job_keyword.sql:
--------------------------------------------------------------------------------
1 | INSERT INTO `job_keyword` (`id`, `job_id`, `keyword_id`, `created_at`, `updated_at`)
2 | VALUES
3 | (1, 1, 100, '2018-01-28 15:36:12', '2018-01-28 15:36:12'),
4 | (2, 1, 101, '2018-01-28 15:36:12', '2018-01-28 15:36:12'),
5 | (3, 2, 100, '2018-01-28 15:36:12', '2018-01-28 15:36:12'),
6 | (4, 2, 101, '2018-01-28 15:36:12', '2018-01-28 15:36:12'),
7 | (5, 2, 102, '2018-01-28 15:36:12', '2018-01-28 15:36:12'),
8 | (6, 3, 100, '2018-01-28 15:36:12', '2018-01-28 15:36:12');
9 |
--------------------------------------------------------------------------------
/tests/fixture/jobs_count.sql:
--------------------------------------------------------------------------------
1 | INSERT INTO `jobs_count` (`id`, `date`, `keyword_id`, `all_city`, `beijing`, `guangzhou`, `shenzhen`,
2 | `shanghai`, `hangzhou`, `chengdu`, `created_at`, `updated_at`)
3 | VALUES
4 | (1, 20180128, 100, 576, 198, 35, 93, 80, 41, 26, '2018-01-28 17:01:04', '2018-01-28 17:01:04'),
5 | (2, 20180129, 100, 580, 200, 36, 100, 82, 44, 30, '2018-01-28 17:01:04', '2018-01-28 17:01:04');
6 |
--------------------------------------------------------------------------------
/tests/fixture/keyword.sql:
--------------------------------------------------------------------------------
1 | INSERT INTO `test_spider`.`keyword` (`id`, `name`)
2 | VALUES
3 | (100, 'python'),
4 | (101, 'java'),
5 | (102, 'qt'),
6 | (103, '前端');
7 |
--------------------------------------------------------------------------------
/tests/fixture/keyword_statistic.sql:
--------------------------------------------------------------------------------
1 | INSERT INTO `keyword_statistic` (`id`, `keyword_id`, `educations`, `city_jobs_count`, `salary`, `financing_stage`, `work_years`, `created_at`, `updated_at`)
2 | VALUES
3 | (1, 100,
4 | '{"不限": 1, "大专": 2, "本科": 3, "本科": 4, "硕士": 5, "博士": 6, "unknown": 7}',
5 | '{"北京": 8, "深圳": 9, "广州": 10}',
6 | '{"10k以下": 11, "11k-20k": 12, "21k-35k": 13, "36k-60k": 14, "61k以上": 15}',
7 | '{"未融资": 16, "天使轮": 17, "A轮": 18, "B轮": 19, "C轮": 20, "D轮及以上": 21, "上市公司": 22, "不需要融资": 23, "unknown": 24}',
8 | '{"不限": 25, "应届毕业生": 26, "1年以下": 27, "1-3年": 28, "3-5年": 29, "5-10年": 30, "10年以上": 31, "unknown": 32}',
9 | '2018-02-01 19:01:44', '2018-02-05 01:01:48');
10 |
--------------------------------------------------------------------------------
/tests/schema.sql:
--------------------------------------------------------------------------------
1 | -- CREATE DATABASE `spider` CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci;
2 |
3 | CREATE TABLE IF NOT EXISTS `job` (
4 | `id` INT UNSIGNED NOT NULL PRIMARY KEY AUTO_INCREMENT,
5 |   `lg_job_id` INT UNSIGNED NOT NULL COMMENT '所使用的职位id',
6 | `city_id` INT UNSIGNED NOT NULL COMMENT '城市 id',
7 | `company_id` INT UNSIGNED NOT NULL COMMENT '公司 id',
8 | `title` VARCHAR(64) NOT NULL COMMENT '职位标题',
9 | `work_year` TINYINT NOT NULL DEFAULT 0 COMMENT '工作年限要求',
10 | `department` VARCHAR(64) NOT NULL DEFAULT '' COMMENT '招聘部门',
11 | `salary` VARCHAR(32) NOT NULL DEFAULT '' COMMENT '薪水',
12 | `education` TINYINT NOT NULL DEFAULT 0 COMMENT '教育背景要求',
13 | `nature` TINYINT NOT NULL DEFAULT 0 COMMENT '工作性质',
14 | `description` VARCHAR(2048) NOT NULL DEFAULT '' COMMENT '额外描述',
15 | `advantage` VARCHAR(256) NOT NULL DEFAULT '' COMMENT '职位优势',
16 | `created_at` TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP COMMENT '创建时间',
17 | `updated_at` TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP COMMENT '最后更新时间',
18 | UNIQUE KEY (`lg_job_id`),
19 | KEY `idx_company_id` (`company_id`)
20 | ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COMMENT='职位表';
21 |
22 |
23 | CREATE TABLE IF NOT EXISTS `company` (
24 | `id` INT UNSIGNED NOT NULL PRIMARY KEY AUTO_INCREMENT,
25 | `lg_company_id` INT UNSIGNED NOT NULL COMMENT '所使用的公司id',
26 | `city_id` INT UNSIGNED NOT NULL COMMENT '所在城市 id',
27 | `shortname` VARCHAR(64) NOT NULL COMMENT '公司名称',
28 | `fullname` VARCHAR(128) NOT NULL COMMENT '公司全称',
29 | `finance_stage` TINYINT NOT NULL DEFAULT 0 COMMENT '融资阶段',
30 | `size` TINYINT NOT NULL DEFAULT 0 COMMENT '公司规模',
31 | `address` VARCHAR(128) NOT NULL DEFAULT '' COMMENT '公司地址',
32 | `features` VARCHAR(128) NOT NULL DEFAULT '' COMMENT '公司特点',
33 | `process_rate` TINYINT NOT NULL DEFAULT 0 COMMENT '简历处理率',
34 | `introduce` VARCHAR(2048) NOT NULL DEFAULT '' COMMENT '公司简介',
35 | `advantage` VARCHAR(256) NOT NULL DEFAULT '' COMMENT '公司优势',
36 | `created_at` TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP COMMENT '创建时间',
37 | `updated_at` TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP COMMENT '最后更新时间',
38 | UNIQUE KEY (`lg_company_id`),
39 | KEY `idx_city_id` (`city_id`)
40 | ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COMMENT='公司表';
41 |
42 |
43 | CREATE TABLE IF NOT EXISTS `city` (
44 | `id` INT UNSIGNED NOT NULL PRIMARY KEY AUTO_INCREMENT,
45 | `name` VARCHAR(64) NOT NULL COMMENT '城市名',
46 | `created_at` TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP COMMENT '创建时间',
47 | `updated_at` TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP COMMENT '最后更新时间',
48 | UNIQUE KEY (`name`)
49 | ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COMMENT='城市表';
50 |
51 |
52 | CREATE TABLE IF NOT EXISTS `industry` (
53 | `id` INT UNSIGNED NOT NULL PRIMARY KEY AUTO_INCREMENT,
54 | `name` VARCHAR(64) NOT NULL COMMENT '行业名称',
55 | `created_at` TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP COMMENT '创建时间',
56 | `updated_at` TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP COMMENT '最后更新时间',
57 | UNIQUE KEY (`name`)
58 | ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COMMENT='行业表';
59 |
60 |
61 | CREATE TABLE IF NOT EXISTS `company_industry` (
62 | `id` INT UNSIGNED NOT NULL PRIMARY KEY AUTO_INCREMENT,
63 | `company_id` INT UNSIGNED NOT NULL COMMENT '公司 id',
64 | `industry_id` INT UNSIGNED NOT NULL COMMENT '行业 id',
65 | `created_at` TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP COMMENT '创建时间',
66 | `updated_at` TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP COMMENT '最后更新时间',
67 | UNIQUE KEY(`company_id`, `industry_id`)
68 | ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COMMENT='公司行业表';
69 |
70 |
71 | -- preset industry types
72 | INSERT INTO `industry` (`id`, `name`)
73 | VALUES
74 | (24,'移动互联网'),
75 | (25,'电子商务'),
76 | (26,'社交网络'),
77 | (27,'企业服务'),
78 | (28,'O2O'),
79 | (29,'教育'),
80 | (31,'游戏'),
81 | (32,'旅游'),
82 | (33,'金融'),
83 | (34,'医疗健康'),
84 | (35,'生活服务'),
85 | (38,'信息安全'),
86 | (41,'数据服务'),
87 | (43,'广告营销'),
88 | (45,'文化娱乐'),
89 | (47,'硬件'),
90 | (48,'分类信息'),
91 | (49,'招聘'),
92 | (10594,'其他');
93 |
94 |
95 | CREATE TABLE IF NOT EXISTS `keyword` (
96 | `id` INT UNSIGNED NOT NULL PRIMARY KEY AUTO_INCREMENT,
97 | `name` VARCHAR(64) NOT NULL COMMENT '关键词名称',
98 | `created_at` TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP COMMENT '创建时间',
99 | `updated_at` TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP COMMENT '最后更新时间',
100 | UNIQUE KEY (`name`)
101 | ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COMMENT='关键词';
102 |
103 |
104 | CREATE TABLE IF NOT EXISTS `job_keyword` (
105 | `id` INT UNSIGNED NOT NULL PRIMARY KEY AUTO_INCREMENT,
106 | `job_id` INT NOT NULL COMMENT '工作 id',
107 | `keyword_id` INT NOT NULL COMMENT '关键词 id',
108 | `created_at` TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP COMMENT '创建时间',
109 | `updated_at` TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP COMMENT '最后更新时间',
110 | UNIQUE KEY(`job_id`, `keyword_id`),
111 | KEY `idx_keyword_id` (`keyword_id`)
112 | ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COMMENT='职位关键词';
113 |
114 |
115 | CREATE TABLE IF NOT EXISTS `jobs_count` (
116 | `id` INT UNSIGNED NOT NULL PRIMARY KEY AUTO_INCREMENT,
117 | `date` INT NOT NULL COMMENT '日期',
118 | `keyword_id` INT NOT NULL COMMENT '关键词 id',
119 | `all_city` INT NOT NULL DEFAULT 0 COMMENT '全国岗位数量',
120 | `beijing` INT NOT NULL DEFAULT 0 COMMENT '北京岗位数量',
121 | `guangzhou` INT NOT NULL DEFAULT 0 COMMENT '广州岗位数量',
122 | `shenzhen` INT NOT NULL DEFAULT 0 COMMENT '深圳岗位数量',
123 | `shanghai` INT NOT NULL DEFAULT 0 COMMENT '上海岗位数量',
124 | `hangzhou` INT NOT NULL DEFAULT 0 COMMENT '杭州岗位数量',
125 | `chengdu` INT NOT NULL DEFAULT 0 COMMENT '成都岗位数量',
126 | `created_at` TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP COMMENT '创建时间',
127 | `updated_at` TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP COMMENT '最后更新时间',
128 | UNIQUE KEY(`date`, `keyword_id`),
129 | KEY `idx_keyword_id` (`keyword_id`)
130 | ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COMMENT='职位每日数量统计';
131 |
132 |
133 | CREATE TABLE IF NOT EXISTS `keyword_statistic` (
134 | `id` INT UNSIGNED NOT NULL PRIMARY KEY AUTO_INCREMENT,
135 | `keyword_id` INT UNSIGNED NOT NULL COMMENT '关键词 id',
136 | `educations` VARCHAR(2048) NOT NULL DEFAULT '' COMMENT '教育背景要求情况',
137 |   `city_jobs_count` VARCHAR(2048) NOT NULL DEFAULT '' COMMENT '城市职位数量情况',
138 | `salary` VARCHAR(2048) NOT NULL DEFAULT '' COMMENT '薪水分布情况',
139 | `financing_stage` VARCHAR(2048) NOT NULL DEFAULT '' COMMENT '公司融资阶段情况',
140 | `work_years` VARCHAR(2048) NOT NULL DEFAULT '' COMMENT '要求的工作年限情况',
141 | `created_at` TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP COMMENT '创建时间',
142 | `updated_at` TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP COMMENT '最后更新时间',
143 | UNIQUE KEY(`keyword_id`)
144 | ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COMMENT='关键词分析表';
145 |
--------------------------------------------------------------------------------
/tests/test_controllers/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JustForFunnnn/webspider/e41e94fb4f269ecab61cc19c76331c7667151cfb/tests/test_controllers/__init__.py
--------------------------------------------------------------------------------
/tests/test_controllers/test_city_ctl.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | from tests import BaseTestCase
3 | from webspider.controllers import city_ctl
4 | from webspider.models import CityModel
5 |
6 |
7 | class TestCityController(BaseTestCase):
8 | def test_get_city_id_by_name(self):
9 | city_id = city_ctl.get_city_id_by_name(name='北京')
10 | self.assertEqual(city_id, 2)
11 |
12 | with self.assertRaises(ValueError):
13 | city_ctl.get_city_id_by_name(name='通利福尼亚')
14 |
15 | def test_insert_city_if_not_exist(self):
16 | city_id = city_ctl.insert_city_if_not_exist('湛江')
17 | self.assertTrue(city_id > 0)
18 | city = CityModel.get_by_pk(pk=city_id)
19 | self.assertEqual(city.name, '湛江')
20 |
21 | self.assertIsNone(city_ctl.insert_city_if_not_exist('湛江'))
22 |
23 | def test_get_city_name_dict(self):
24 | city_name_dict = city_ctl.get_city_name_dict()
25 | self.assertDictEqual(city_name_dict, {'北京': 2, '上海': 3, '广州': 4})
26 |
--------------------------------------------------------------------------------
/tests/test_controllers/test_industry_ctl.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | from tests import BaseTestCase
3 | from webspider.controllers import industry_ctl
4 | from webspider.models import IndustryModel
5 |
6 |
7 | class TestIndustryController(BaseTestCase):
8 | def test_get_industry_id_by_name(self):
9 | industry_id = industry_ctl.get_industry_id_by_name(name='开网吧')
10 | self.assertEqual(industry_id, 1000001)
11 |
12 | with self.assertRaises(ValueError):
13 | industry_ctl.get_industry_id_by_name(name='开飞机')
14 |
15 | def test_insert_industry_if_not_exist(self):
16 | industry_name = '开飞机'
17 | industry_id = industry_ctl.insert_industry_if_not_exist(industry_name)
18 | self.assertTrue(industry_id > 0)
19 | industry = IndustryModel.get_by_pk(pk=industry_id)
20 | self.assertEqual(industry.name, industry_name)
21 |
22 | self.assertIsNone(industry_ctl.insert_industry_if_not_exist(industry_name))
23 |
--------------------------------------------------------------------------------
/tests/test_controllers/test_job_ctl.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | from unittest import TestCase
3 |
4 | from webspider.controllers import job_ctl
5 |
6 |
7 | class TestJobController(TestCase):
8 |
9 | def test_get_salary_section(self):
10 | salary = '15k-25k'
11 | left, right = job_ctl.get_salary_section(salary)
12 | self.assertEqual(left, 15)
13 | self.assertEqual(right, 25)
14 |
15 | salary = '15k以上'
16 | left, right = job_ctl.get_salary_section(salary)
17 | self.assertEqual(left, 15)
18 | self.assertEqual(right, 20)
19 |
20 | salary = '15k以下'
21 | left, right = job_ctl.get_salary_section(salary)
22 | self.assertEqual(left, 10)
23 | self.assertEqual(right, 15)
24 |
25 | with self.assertRaises(ValueError):
26 | left, right = job_ctl.get_salary_section('15k30k')
27 |
--------------------------------------------------------------------------------
/tests/test_controllers/test_job_keyword_ctl.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | from tests import BaseTestCase
3 | from webspider.controllers import job_keyword_ctl
4 |
5 |
6 | class TestJobKeywordController(BaseTestCase):
7 | def test_get_most_frequently_keyword_ids(self):
8 | keyword_ids = job_keyword_ctl.get_most_frequently_keyword_ids()
9 | self.assertEqual(keyword_ids, [100, 101, 102])
10 |
11 | keyword_ids = job_keyword_ctl.get_most_frequently_keyword_ids(limit=2)
12 | self.assertEqual(keyword_ids, [100, 101])
13 |
14 | keyword_ids = job_keyword_ctl.get_most_frequently_keyword_ids(offset=1)
15 | self.assertEqual(keyword_ids, [101, 102])
16 |
17 | keyword_ids = job_keyword_ctl.get_most_frequently_keyword_ids(limit=1, offset=1)
18 | self.assertEqual(keyword_ids, [101])
19 |
--------------------------------------------------------------------------------
/tests/test_controllers/test_keyword_ctl.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | from tests import BaseTestCase
3 | from webspider.controllers import keyword_ctl
4 | from webspider.models import KeywordModel
5 |
6 |
7 | class TestKeywordController(BaseTestCase):
8 | def test_get_keyword_name_by_id(self):
9 | keyword_name = keyword_ctl.get_keyword_name_by_id(keyword_id=100)
10 | self.assertEqual(keyword_name, 'python')
11 |
12 | with self.assertRaises(ValueError):
13 | keyword_ctl.get_keyword_name_by_id(keyword_id=10001)
14 |
15 | def test_get_keyword_id_by_name(self):
16 | keyword_id = keyword_ctl.get_keyword_id_by_name(name='python')
17 | self.assertEqual(keyword_id, 100)
18 |
19 | with self.assertRaises(ValueError):
20 | keyword_ctl.get_keyword_id_by_name(name='go')
21 |
22 | def test_insert_keyword_if_not_exist(self):
23 | keyword_name = 'C--'
24 | keyword_id = keyword_ctl.insert_keyword_if_not_exist(keyword_name)
25 | self.assertTrue(keyword_id > 0)
26 | keyword = KeywordModel.get_by_pk(pk=keyword_id)
27 | self.assertEqual(keyword.name, keyword_name)
28 |
29 | self.assertIsNone(keyword_ctl.insert_keyword_if_not_exist(keyword_name))
30 |
--------------------------------------------------------------------------------
/tests/test_controllers/test_keyword_statistic_ctl.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | from tests import BaseTestCase
3 | from webspider.controllers import keyword_statistic_ctl
4 | from webspider.models import JobModel
5 | from webspider.constants import EDUCATION_REQUEST_DICT, WORK_YEARS_REQUEST_DICT
6 |
7 |
8 | class TestKeywordStatisticController(BaseTestCase):
9 | def test_get_salary_statistic(self):
10 | test_jobs_model = [JobModel(salary='5k-9k'), JobModel(salary='10-15k'), JobModel(salary='15k-20k'),
11 | JobModel(salary='16-18k'), JobModel(salary='20k-30k'), JobModel(salary='30k-35k'),
12 | JobModel(salary='20k以上'), JobModel(salary='60k-100k'), JobModel(salary='40k-42k')]
13 | salary_statistic = keyword_statistic_ctl.get_salary_statistic(test_jobs_model)
14 | self.assertDictEqual(salary_statistic, {
15 | '10k及以下': 2,
16 | '11k-20k': 5,
17 | '21k-35k': 3,
18 | '36k-60k': 2,
19 | '61k以上': 1,
20 | })
21 |
22 | def test_get_finance_stage_statistic(self):
23 | test_jobs_model = [JobModel(company_id=1), JobModel(company_id=2), JobModel(company_id=3)]
24 | finance_stage_statistic = keyword_statistic_ctl.get_finance_stage_statistic(test_jobs_model)
25 | self.assertDictEqual(finance_stage_statistic, {
26 | '未融资': 2,
27 | 'A轮': 1,
28 | })
29 |
30 | def test_get_educations_statistic(self):
31 | test_jobs_model = [JobModel(education=EDUCATION_REQUEST_DICT['大专']),
32 | JobModel(education=EDUCATION_REQUEST_DICT['本科']),
33 | JobModel(education=EDUCATION_REQUEST_DICT['本科'])]
34 | educations_statistic = keyword_statistic_ctl.get_educations_statistic(test_jobs_model)
35 | self.assertDictEqual(educations_statistic, {
36 | '本科': 2,
37 | '大专': 1,
38 | })
39 |
40 | def test_get_work_years_statistic(self):
41 | test_jobs_model = [JobModel(work_year=WORK_YEARS_REQUEST_DICT['应届毕业生']),
42 | JobModel(work_year=WORK_YEARS_REQUEST_DICT['应届毕业生']),
43 | JobModel(work_year=WORK_YEARS_REQUEST_DICT['1-3年'])]
44 | work_years_statistic = keyword_statistic_ctl.get_work_years_statistic(test_jobs_model)
45 | self.assertDictEqual(work_years_statistic, {
46 | '应届毕业生': 2,
47 | '1-3年': 1,
48 | })
49 |
50 | def test_get_city_jobs_count_statistic(self):
51 | test_jobs_model = [JobModel(city_id=2), JobModel(city_id=2), JobModel(city_id=2), JobModel(city_id=2),
52 | JobModel(city_id=3), JobModel(city_id=3), JobModel(city_id=3),
53 | JobModel(city_id=4), JobModel(city_id=4)]
54 | sorted_city_jobs_count_statistic = keyword_statistic_ctl.get_city_jobs_count_statistic(test_jobs_model)
55 | self.assertDictEqual(sorted_city_jobs_count_statistic, {
56 | '北京': 4,
57 | '上海': 3,
58 | '广州': 2,
59 | })
60 |
61 | sorted_city_jobs_count_statistic = keyword_statistic_ctl.get_city_jobs_count_statistic(test_jobs_model, 2)
62 | self.assertDictEqual(sorted_city_jobs_count_statistic, {
63 | '北京': 4,
64 | '上海': 3
65 | })
66 |
--------------------------------------------------------------------------------
/tests/test_models/test_job.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | from datetime import datetime
3 |
4 | from sqlalchemy import and_
5 |
6 | from tests import BaseTestCase
7 | from webspider.models import JobModel, CityModel
8 |
9 | test_job_dict = dict(id=1,
10 | lg_job_id=10001,
11 | city_id=2,
12 | company_id=1,
13 | title='高级前端开发工程师',
14 | work_year=5,
15 | department='贝壳金控交易研发部-交易前端组招聘',
16 | salary='15k-30k',
17 | education=3,
18 | nature=1,
19 | description='职位介绍A',
20 | advantage='15薪,工作居住证,六险一金,双休',
21 | created_at=datetime.strptime('2018-01-29 19:11:33', '%Y-%m-%d %H:%M:%S'),
22 | updated_at=datetime.strptime('2018-01-30 17:22:30', '%Y-%m-%d %H:%M:%S'))
23 |
24 |
25 | class TestJobModel(BaseTestCase):
26 | def test_pk_name(self):
27 | self.assertEqual(JobModel.pk_name, 'id')
28 |
29 | def test_pk(self):
30 | self.assertEqual(JobModel.pk, JobModel.id)
31 |
32 | def test_model_instance_to_dict(self):
33 | job = JobModel.get_by_pk(pk=1).dict()
34 | self.assertTrue(isinstance(job, dict))
35 | self.assertDictEqual(job, test_job_dict)
36 |
37 | def test_get_by_pk(self):
38 | job = JobModel.get_by_pk(pk=1)
39 | self.assertDictEqual(job.dict(), test_job_dict)
40 |
41 | def test_count(self):
42 | jobs_count = JobModel.count()
43 | self.assertEqual(jobs_count, 3)
44 |
45 | jobs_count = JobModel.count(filter_by={'city_id': 4})
46 | self.assertEqual(jobs_count, 2)
47 |
48 | jobs_count = JobModel.count(filter=(and_(JobModel.city_id == 4, JobModel.company_id == 3)))
49 | self.assertEqual(jobs_count, 1)
50 |
51 | jobs_count = JobModel.count(filter=(JobModel.id == 1))
52 | self.assertEqual(jobs_count, 1)
53 |
54 | def test_is_exist(self):
55 | is_exist = JobModel.is_exist(filter=(JobModel.id == 1))
56 | self.assertEqual(is_exist, True)
57 |
58 | def test_add(self):
59 | to_add_data_dict = dict(lg_job_id=10004,
60 | city_id=3,
61 | company_id=1,
62 | title='Python 开发工程师',
63 | work_year=5,
64 | department='吖吖项目组',
65 | salary='15k-35k',
66 | education=2,
67 | nature=1,
68 | description='职位介绍D',
69 | advantage='16薪,工作居住证,六十八险一金,双休', )
70 | job_id = JobModel.add(**to_add_data_dict)
71 | self.assertTrue(job_id > 0)
72 | job = JobModel.get_by_pk(pk=job_id)
73 | self.assertDictContainsSubset(to_add_data_dict, job.dict())
74 |
75 | def test_get_one(self):
76 | job = JobModel.get_one(filter_by={'id': 1})
77 | self.assertDictEqual(job.dict(), test_job_dict)
78 |
79 | job = JobModel.get_one(filter=(JobModel.id == 1))
80 | self.assertDictEqual(job.dict(), test_job_dict)
81 |
82 | def test_list(self):
83 | # test list
84 | jobs = JobModel.list()
85 | self.assertEqual(len(jobs), 3)
86 | self.assertDictEqual(jobs[0].dict(), test_job_dict)
87 |
88 | # test list limit
89 | jobs = JobModel.list(limit=1)
90 | self.assertEqual(len(jobs), 1)
91 |
92 | # test list offset
93 | jobs = JobModel.list(offset=1)
94 | self.assertEqual(len(jobs), 2)
95 |
96 | # test list filter_by
97 | jobs = JobModel.list(filter_by={'id': 1})
98 | self.assertEqual(len(jobs), 1)
99 | self.assertEqual(jobs[0].dict(), test_job_dict)
100 |
101 | def test_update(self):
102 | init_job_data_dict = JobModel.get_by_pk(pk=1).dict()
103 | to_update_data_dict = dict(title=u'后端吃饭工程师',
104 | work_year=1,
105 | city_id=1,
106 | company_id=1,
107 | department='飞天面条神教招聘',
108 | salary='20k-32k',
109 | education=2,
110 | description=u'日常工作:吃饭!')
111 |
112 | affect_rows = JobModel.update(filter_by={'id': 1}, values=to_update_data_dict)
113 | self.assertEqual(affect_rows, 1)
114 |
115 |         # expected result after the update
116 | init_job_data_dict.update(**to_update_data_dict)
117 | predictive_job_data_dict = init_job_data_dict
118 | init_updated_at = init_job_data_dict.pop('updated_at')
119 |
120 | new_job_data_dict = JobModel.get_by_pk(pk=1).dict()
121 | self.assertDictContainsSubset(predictive_job_data_dict, new_job_data_dict)
122 |         self.assertGreater(new_job_data_dict['updated_at'], init_updated_at)
123 |
124 |         # other records are unaffected
125 | self.assertEqual(JobModel.get_by_pk(pk=2).title, u'前端开发工程师')
126 |
127 |         # batch update
128 | affect_rows = JobModel.update(filter_by={'city_id': 4}, values={'title': '测试'})
129 | self.assertEqual(affect_rows, 2)
130 | jobs = JobModel.list(filter_by={'city_id': 4})
131 | self.assertTrue(all([job.title == u'测试' for job in jobs]))
132 |
133 | def test_update_by_pk(self):
134 | affect_rows = JobModel.update_by_pk(pk=1, values={'title': '你好啊啊'})
135 | self.assertEqual(affect_rows, 1)
136 | self.assertEqual(JobModel.get_by_pk(pk=1).title, u'你好啊啊')
137 |
138 | def test_execute_sql_string(self):
139 | job_rows = JobModel.execute_sql_string(
140 | 'SELECT id, title FROM job WHERE id = :id', {'id': 1})
141 | self.assertEqual(len(job_rows), 1)
142 | self.assertEqual(job_rows[0][0], 1)
143 | self.assertEqual(job_rows[0][1], u'高级前端开发工程师')
144 |
145 | job_rows = JobModel.execute_sql_string('SELECT id, title FROM job')
146 | self.assertEqual(len(job_rows), 3)
147 | self.assertEqual(job_rows[0][0], 1)
148 | self.assertEqual(job_rows[0][1], u'高级前端开发工程师')
149 |
150 | affect_rows = JobModel.execute_sql_string(
151 | "UPDATE job SET title = '测试' WHERE id = :id", {'id': 1})
152 | self.assertEqual(affect_rows, 1)
153 | job = JobModel.get_by_pk(pk=1)
154 | self.assertEqual(job.title, u'测试')
155 |
156 | def test_batch_add(self):
157 |         # the batch contains an instance of another model class
158 | init_jobs_count = JobModel.count()
159 | model_instances = [CityModel(name='你好'),
160 | JobModel(title='招聘资深前端工程师', city_id=1, company_id=2, lg_job_id=100056),
161 | JobModel(title='招聘资深中端工程师', city_id=1, company_id=2, lg_job_id=100055), ]
162 |
163 | with self.assertRaises(ValueError):
164 | JobModel.batch_add(model_instances)
165 |
166 | self.assertEqual(JobModel.count(), init_jobs_count)
167 |
168 | model_instances = [JobModel(title='招聘资深前端工程师', city_id=1, company_id=2, lg_job_id=100056),
169 | JobModel(title='招聘资深中端工程师', city_id=1, company_id=2, lg_job_id=100055), ]
170 |
171 | JobModel.batch_add(model_instances)
172 |
173 | self.assertEqual(JobModel.count(), init_jobs_count + 2)
174 |
--------------------------------------------------------------------------------
/tests/test_utils/test_cache.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | import time
3 | from unittest import TestCase
4 |
5 | from webspider.utils.cache import simple_cache, cache_clear, redis_instance
6 |
7 | test_number = 0
8 |
9 |
10 | @simple_cache()
11 | def incr_then_return_test_number(keyword=None):
12 | global test_number
13 | test_number += 1
14 | return test_number
15 |
16 |
17 | @simple_cache(ex=1)
18 | def incr_then_return_test_number_with_ex(keyword=None):
19 | global test_number
20 | test_number += 1
21 | return test_number
22 |
23 |
24 | class TestClass(object):
25 | def __init__(self, name):
26 | self.name = name
27 |
28 |
29 | class TestUtilCache(TestCase):
30 |
31 | def setUp(self):
32 | keys = redis_instance.keys('*incr_then_return_test_number*')
33 | if keys:
34 | redis_instance.delete(*keys)
35 |
36 | keys = redis_instance.keys('*return_what_you_put*')
37 | if keys:
38 | redis_instance.delete(*redis_instance.keys('*return_what_you_put*'))
39 |
40 | def test_simple_cache(self):
41 |         """test the simple cache"""
42 | global test_number
43 | test_number = 0
44 | self.assertEqual(1, incr_then_return_test_number('test'))
45 | self.assertEqual(1, incr_then_return_test_number('test'))
46 | self.assertEqual(2, incr_then_return_test_number('test_1'))
47 | self.assertEqual(2, incr_then_return_test_number('test_1'))
48 | self.assertEqual(3, incr_then_return_test_number('test_2'))
49 |
50 | with self.assertRaises(ValueError):
51 | incr_then_return_test_number(keyword='test')
52 |
53 | def test_simple_cache_with_ex(self):
54 |         """test a cache entry with an expiration time"""
55 | global test_number
56 | test_number = 0
57 | self.assertEqual(1, incr_then_return_test_number_with_ex('test'))
58 | self.assertEqual(1, incr_then_return_test_number_with_ex('test'))
59 | time.sleep(1.1)
60 | self.assertEqual(2, incr_then_return_test_number_with_ex('test'))
61 |
62 | def test_cache_clear(self):
63 |         """test clearing the cache"""
64 | global test_number
65 | test_number = 0
66 | self.assertEqual(1, incr_then_return_test_number('test'))
67 | self.assertEqual(2, incr_then_return_test_number('test_1'))
68 |         # clear all cached results for the function
69 | cache_clear(incr_then_return_test_number)
70 | self.assertEqual(3, incr_then_return_test_number('test'))
71 | self.assertEqual(4, incr_then_return_test_number('test_1'))
72 |
73 |         # clear the cached result for a specific argument
74 | cache_clear(incr_then_return_test_number, 'test_1')
75 | self.assertEqual(3, incr_then_return_test_number('test'))
76 | self.assertEqual(5, incr_then_return_test_number('test_1'))
77 |
78 | def test_cache_class_instance(self):
79 |         """test caching a class instance"""
80 |
81 | @simple_cache()
82 | def return_what_you_input(whatever):
83 | return whatever
84 |
85 | instance = TestClass('测试类实例')
86 | # cache class
87 | instance = return_what_you_input(instance)
88 | # get result from redis
89 | cache_instance = return_what_you_input(instance)
90 | self.assertTrue(instance is not cache_instance)
91 | self.assertTrue(isinstance(cache_instance, TestClass))
92 | self.assertEqual(cache_instance.name, '测试类实例')
93 |
94 | def tearDown(self):
95 | keys = redis_instance.keys('*incr_then_return_test_number*')
96 | if keys:
97 | redis_instance.delete(*redis_instance.keys('*incr_then_return_test_number*'))
98 |
99 | keys = redis_instance.keys('*return_what_you_put*')
100 | if keys:
101 | redis_instance.delete(*redis_instance.keys('*return_what_you_put*'))
102 |
--------------------------------------------------------------------------------
/tests/test_utils/test_classproperty.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | from unittest import TestCase
3 |
4 | from webspider.utils.classproperty import classproperty
5 |
6 |
7 | class TestClass(object):
8 | _name = '阿河'
9 |
10 | @classproperty
11 | def name(cls):
12 | return cls._name
13 |
14 |
15 | class TestUtilClassProperty(TestCase):
16 | def test_read_class_property(self):
17 | self.assertEqual(TestClass.name, '阿河')
18 |
--------------------------------------------------------------------------------
/tests/test_utils/test_common.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | from unittest import TestCase
3 |
4 | from webspider.utils.common import get_key_from_dict_by_value, get_field_statistics
5 |
6 |
7 | class TestUtilCommon(TestCase):
8 | def test_get_key_from_dict_by_value(self):
9 | dictionary = {
10 | '全国': 1,
11 | '北京': 2,
12 | '广州': 3,
13 | }
14 | key = get_key_from_dict_by_value(1, dictionary)
15 | self.assertEqual(key, '全国')
16 |
17 | # no key
18 | with self.assertRaises(ValueError):
19 | get_key_from_dict_by_value(4, dictionary)
20 |
21 | dictionary = {
22 | '全国': 1,
23 | '北京': 1,
24 | '广州': 3,
25 | }
26 | key = get_key_from_dict_by_value(3, dictionary)
27 | self.assertEqual(key, '广州')
28 | # multi key
29 | with self.assertRaises(AttributeError):
30 | get_key_from_dict_by_value(1, dictionary)
31 |
32 | def test_get_field_statistics(self):
33 | statistics = get_field_statistics([0, 0, 0, 1, 1], {'男': 0, '女': 1, '不明': 2})
34 | self.assertDictEqual(statistics, {'男': 3, '女': 2})
35 |
--------------------------------------------------------------------------------
/tests/test_utils/test_convert.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | from unittest import TestCase
3 |
4 | from webspider.constants import WORK_YEARS_REQUEST_DICT, JOB_NATURE_DICT, COMPANY_SIZE_DICT
5 | from webspider.utils.convert import convert_dict_field_to_constants, convert_field_to_constants
6 |
7 |
8 | class TestUtilConvert(TestCase):
9 | def test_convert_dict_field_to_constants(self):
10 | init_dict = {
11 | 'work_year': '应届毕业生',
12 | 'size': '没有人',
13 | 'nature': '全职',
14 | 'name': '沙师弟',
15 | 'id': 3,
16 | 'value': None
17 | }
18 | convert_dict_field_to_constants(init_dict)
19 | self.assertDictEqual(init_dict, {
20 | 'work_year': WORK_YEARS_REQUEST_DICT['应届毕业生'],
21 | 'size': COMPANY_SIZE_DICT['unknown'],
22 | 'nature': JOB_NATURE_DICT['全职'],
23 | 'name': '沙师弟',
24 | 'id': 3,
25 | 'value': None
26 | })
27 |
28 | def test_convert_field_to_constants(self):
29 | constant_value = convert_field_to_constants(field_name='work_year', field_value='应届毕业生')
30 | self.assertEqual(constant_value, WORK_YEARS_REQUEST_DICT['应届毕业生'])
31 |
32 | constant_value = convert_field_to_constants(field_name='work_year', field_value='家里蹲')
33 | self.assertEqual(constant_value, WORK_YEARS_REQUEST_DICT['unknown'])
34 |
35 | with self.assertRaises(ValueError):
36 | convert_field_to_constants(field_name='dinner', field_value='牛肉饭')
37 |
--------------------------------------------------------------------------------
/tests/test_utils/test_http_tools.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | from unittest import TestCase, mock
3 |
4 | from webspider.utils.http_tools import generate_http_request_headers, requests_get, requests_post
5 |
6 |
7 | class TestUtilHttpTools(TestCase):
8 | def test_generate_http_request_headers(self):
9 | header = generate_http_request_headers()
10 | self.assertTrue(isinstance(header, dict))
11 |
12 | header = generate_http_request_headers(referer='https://www.zhihu.com')
13 | self.assertEqual(header['Referer'], 'https://www.zhihu.com')
14 |
15 | @mock.patch('requests.get')
16 | def test_request_get(self, mock_get):
17 | mock_get.return_value = '200'
18 | response = requests_get(url='https://baidu.com', need_sleep=False)
19 | self.assertEqual(response, '200')
20 |
21 | @mock.patch('requests.post')
22 | def test_request_post(self, mock_post):
23 | mock_post.return_value = '200'
24 | response = requests_post(url='https://baidu.com', need_sleep=False)
25 | self.assertEqual(response, '200')
26 |
--------------------------------------------------------------------------------
/tests/test_utils/test_pagination.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | from unittest import TestCase
3 |
4 | from webspider.utils.pagination import Pagination
5 |
6 |
7 | class TestUtilPagination(TestCase):
8 | def test_pagination(self):
9 | pagination = Pagination(page=2, total=20, per_page=6)
10 | self.assertEqual(pagination.pages, 4)
11 | self.assertEqual(pagination.prev_num, 1)
12 | self.assertEqual(pagination.has_prev, True)
13 | self.assertEqual(pagination.next_num, 3)
14 | self.assertEqual(pagination.has_next, True)
15 | self.assertEqual([page for page in pagination.iter_pages], [1, 2, 3, 4])
16 |
17 | def test_pagination_no_pages(self):
18 | pagination = Pagination(page=2, total=20, per_page=0)
19 | self.assertEqual(pagination.pages, 0)
20 |
21 | def test_pagination_no_pre(self):
22 | pagination = Pagination(page=1, total=20, per_page=6)
23 | self.assertEqual(pagination.has_prev, False)
24 | self.assertEqual(pagination.prev_num, None)
25 | self.assertEqual(pagination.has_next, True)
26 | self.assertEqual(pagination.next_num, 2)
27 |
28 | def test_pagination_no_next(self):
29 | pagination = Pagination(page=4, total=20, per_page=6)
30 | self.assertEqual(pagination.has_prev, True)
31 | self.assertEqual(pagination.prev_num, 3)
32 | self.assertEqual(pagination.has_next, False)
33 | self.assertEqual(pagination.next_num, None)
34 |
--------------------------------------------------------------------------------
/tests/test_utils/test_text.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | from unittest import TestCase
3 |
4 | from webspider.utils.text import to_plaintext
5 |
6 |
7 | class TestUtilText(TestCase):
8 | def test_to_plaintext(self):
9 |         init_text = '<br/>abcd \n '
10 | self.assertEqual(to_plaintext(content=init_text, strip=False), 'abcd ')
11 |
12 |         init_text = '<br/>abcd \n '
13 | self.assertEqual(to_plaintext(content=init_text, strip=True), 'abcd')
14 |
15 |         init_text = '<br/>abcd \n '
16 | self.assertEqual(to_plaintext(content=init_text, pattern=u'a|b', strip=False), 'cd \n ')
17 |
--------------------------------------------------------------------------------
/tests/test_utils/test_time_tools.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | from datetime import datetime
3 | from unittest import TestCase
4 |
5 | from webspider.utils.time_tools import (datetime_to_timestamp, timestamp_to_datetime, timestamp_to_datetime_str)
6 |
7 |
8 | class TestUtilTimeTools(TestCase):
9 | def test_datetime_to_timestamp(self):
10 | datetime_obj = datetime(year=2017, month=5, day=10)
11 | timestamp = datetime_to_timestamp(datetime_obj)
12 | self.assertEqual(int(datetime_obj.timestamp()), timestamp)
13 |
14 | def test_timestamp_to_datetime(self):
15 | timestamp = int(datetime(year=2017, month=5, day=10).timestamp())
16 | datetime_obj = timestamp_to_datetime(timestamp=timestamp)
17 | self.assertEqual(datetime_obj.isoformat(), '2017-05-10T00:00:00')
18 |
19 | def test_timestamp_to_datetime_str(self):
20 | timestamp = int(datetime(year=2017, month=5, day=10).timestamp())
21 | datetime_str = timestamp_to_datetime_str(ts=timestamp)
22 | self.assertEqual(datetime_str, '2017-05-10')
23 |
24 | timestamp = int(datetime(year=2018, month=2, day=1, hour=19, minute=46, second=57).timestamp())
25 | datetime_str = timestamp_to_datetime_str(ts=timestamp, time_format='%Y/%m/%d %H:%M:%S')
26 | self.assertEqual(datetime_str, '2018/02/01 19:46:57')
27 |
--------------------------------------------------------------------------------
/tests/test_web/base.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | import logging
4 | from urllib.parse import urlencode
5 |
6 | from tornado.testing import AsyncHTTPTestCase
7 | from tornado.escape import json_encode, json_decode
8 |
9 | from webspider.utils.sql import get_session
10 | from webspider.web.app import make_web_app
11 | from tests.util import create_test_db, drop_test_db
12 |
13 | logger = logging.getLogger(__file__)
14 |
15 |
16 | class BaseHandlerTestCase(AsyncHTTPTestCase):
17 | session = get_session()
18 |
19 | def setUp(self):
20 | create_test_db(self.session)
21 | super(BaseHandlerTestCase, self).setUp()
22 |
23 | def tearDown(self):
24 | drop_test_db(self.session)
25 | super(BaseHandlerTestCase, self).tearDown()
26 |
27 | def get_app(self):
28 | return make_web_app()
29 |
30 | def request(self, method, url, headers=None, data=None, json=None, form=None, **kwargs):
31 | if not headers:
32 | headers = {}
33 |
34 | if json is not None:
35 | headers['Content-Type'] = 'application/json'
36 | data = json_encode(json)
37 |
38 | elif form is not None:
39 | headers['Content-Type'] = 'application/x-www-form-urlencoded'
40 | data = urlencode(form)
41 |
42 | response = self.fetch(url, method=method, headers=headers, body=data, allow_nonstandard_methods=True,
43 | **kwargs)
44 |
45 |         if response.code // 100 != 2:
46 | logger.error(response.body)
47 |
48 | return response
49 |
50 | def get(self, url, **kwargs):
51 | return self.request(url=url, method="GET", **kwargs)
52 |
53 | def post(self, url, **kwargs):
54 | return self.request(url=url, method="POST", **kwargs)
55 |
56 | def put(self, url, **kwargs):
57 | return self.request(url=url, method="PUT", **kwargs)
58 |
59 | def fetch_json(self, path, **kwargs):
60 | response = self.request('GET', path, **kwargs)
61 |         if response.code // 100 != 2:
62 | raise ValueError('fetch json expect http code 2xx, got {}'.format(response.code))
63 | return json_decode(response.body)
64 |
--------------------------------------------------------------------------------
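
A subclass only needs to issue requests; a short sketch against the statistics API exercised in the next file:

    class ExampleStatisticApiTestCase(BaseHandlerTestCase):
        def test_statistics(self):
            # fetch_json raises ValueError unless the status code is 2xx
            data = self.fetch_json('/api/statistics?keyword_name=python')
            self.assertIn('educations', data)
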
/tests/test_web/test_formatter.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | from unittest import TestCase
3 |
4 | from webspider.exceptions import DowngradeException
5 | from webspider.web.formatter.base import Field, Downgrade, Formatter
6 |
7 | """
8 | Prepare the test fixtures
9 | """
10 |
11 |
12 | class SampleFormatter(Formatter):
13 | FIELDS = [
14 | Field('name', converter=lambda name: 'Mr.' + name),
15 | Field('value', converter=lambda value: int(value), downgrade=Downgrade(0)),
16 | Field('count'),
17 | ]
18 |
19 |
20 | class TestModel(object):
21 | def __init__(self, name=None, value=None, count=None):
22 | self.name = name
23 | self.value = value
24 | self.count = count
25 |
26 |
27 | class TestModelB(object):
28 | pass
29 |
30 |
31 | formatter_mappings = {
32 |     TestModel: SampleFormatter
33 | }
34 |
35 | """end"""
36 |
37 |
38 | class TestFormatter(TestCase):
39 |
40 | def test_register_formatter(self):
41 | Formatter.register_formatter(formatter_mappings)
42 | self.assertDictContainsSubset(formatter_mappings, Formatter._FORMATTER_MAPS)
43 |
44 | def test_get_formatter(self):
45 | Formatter.register_formatter(formatter_mappings)
46 |
47 | formatter = Formatter.get_formatter(TestModel)
48 | self.assertTrue(formatter is formatter_mappings[TestModel])
49 |
50 | formatter = Formatter.get_formatter(TestModel())
51 | self.assertTrue(formatter is formatter_mappings[TestModel])
52 |
53 | formatter = Formatter.get_formatter(TestModelB)
54 | self.assertTrue(formatter is None)
55 |
56 | def test_downgrade(self):
57 |         # test the downgrade path
58 | Formatter.register_formatter(formatter_mappings)
59 | test_model = TestModel(name='He', value='10a', count=100)
60 | format_result = Formatter.format(test_model)
61 | self.assertDictEqual(format_result, {
62 | 'name': 'Mr.He',
63 | 'value': 0,
64 | 'count': 100
65 | })
66 |
67 | def test_field(self):
68 | with self.assertRaises(DowngradeException):
69 | Field(name='hi', downgrade=0)
70 |
71 | def test_format(self):
72 | Formatter.register_formatter(formatter_mappings)
73 |
74 | test_model = TestModel(name='He', value='10', count=100)
75 | format_result = Formatter.format(test_model)
76 | self.assertDictEqual(format_result, {
77 | 'name': 'Mr.He',
78 | 'value': 10,
79 | 'count': 100
80 | })
81 |
82 |         # test formatting a list of models
83 | test_models = [TestModel(name='He', value='10', count=100),
84 | TestModel(name='Wei', value='20', count=1)]
85 | format_result = Formatter.format(test_models)
86 | self.assertDictEqual(format_result[0], {
87 | 'name': 'Mr.He',
88 | 'value': 10,
89 | 'count': 100
90 | })
91 | self.assertDictEqual(format_result[1], {
92 | 'name': 'Mr.Wei',
93 | 'value': 20,
94 | 'count': 1
95 | })
96 |
97 |         # test nested formatting
98 | test_models = TestModel(name='He', value='10', count=TestModel(name='child', value='20', count=1))
99 | format_result = Formatter.format(test_models)
100 | self.assertDictEqual(format_result, {
101 | 'name': 'Mr.He',
102 | 'value': 10,
103 | 'count': {
104 | 'name': 'Mr.child',
105 | 'value': 20,
106 | 'count': 1,
107 | }
108 | })
109 |
--------------------------------------------------------------------------------
/tests/test_web/test_keyword_statistic.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | from datetime import datetime
3 |
4 | from tornado.escape import json_decode
5 |
6 | from tests.test_web.base import BaseHandlerTestCase
7 | from webspider.utils.time_tools import datetime_to_timestamp
8 |
9 | predictive_keyword_statistic_dict = {
10 | 'educations': {'不限': 1, '大专': 2, '本科': 4, '硕士': 5, '博士': 6, 'unknown': 7},
11 | 'city_jobs_count': {'北京': 8, '深圳': 9, '广州': 10},
12 | 'salary': {'10k以下': 11, '11k-20k': 12, '21k-35k': 13, '36k-60k': 14, '61k以上': 15},
13 | 'financing_stage': {'未融资': 16, '天使轮': 17, 'A轮': 18, 'B轮': 19, 'C轮': 20,
14 | 'D轮及以上': 21, '上市公司': 22, '不需要融资': 23, 'unknown': 24},
15 | 'work_years': {'不限': 25, '应届毕业生': 26, '1年以下': 27, '1-3年': 28, '3-5年': 29,
16 | '5-10年': 30, '10年以上': 31, 'unknown': 32},
17 | 'per_day_jobs_count': [
18 | {
19 | 'date': 20180128, 'all_city': 576, 'beijing': 198, 'guangzhou': 35, 'shenzhen': 93, 'shanghai': 80,
20 | 'hangzhou': 41, 'chengdu': 26,
21 | 'created_at': datetime_to_timestamp(datetime.strptime('2018-01-28 17:01:04', '%Y-%m-%d %H:%M:%S')),
22 | 'updated_at': datetime_to_timestamp(datetime.strptime('2018-01-28 17:01:04', '%Y-%m-%d %H:%M:%S'))
23 | },
24 | {
25 | 'date': 20180129, 'all_city': 580, 'beijing': 200, 'guangzhou': 36, 'shenzhen': 100, 'shanghai': 82,
26 | 'hangzhou': 44, 'chengdu': 30,
27 | 'created_at': datetime_to_timestamp(datetime.strptime('2018-01-28 17:01:04', '%Y-%m-%d %H:%M:%S')),
28 | 'updated_at': datetime_to_timestamp(datetime.strptime('2018-01-28 17:01:04', '%Y-%m-%d %H:%M:%S'))
29 | }],
30 | 'created_at': datetime_to_timestamp(datetime.strptime('2018-02-01 19:01:44', '%Y-%m-%d %H:%M:%S')),
31 | 'updated_at': datetime_to_timestamp(datetime.strptime('2018-02-05 01:01:48', '%Y-%m-%d %H:%M:%S')),
32 | }
33 |
34 |
35 | class TestKeywordStatisticsApiHandler(BaseHandlerTestCase):
36 |
37 | def test_get(self):
38 | response = self.fetch_json('/api/statistics?keyword_name=python')
39 | self.assertDictEqual(predictive_keyword_statistic_dict, response)
40 |
41 | def test_get_when_error(self):
42 | response = self.get('/api/statistics')
43 | self.assertEqual(response.code, 404)
44 | predictive_response_content = {
45 | u"error": {
46 | u"message": u"请输入关键词",
47 | u"code": 4041,
48 | u"name": u"ResourceNotFoundWebException",
49 | u'data': '',
50 | u'debug_message': '',
51 | }
52 | }
53 | self.assertDictEqual(predictive_response_content, json_decode(response.body))
54 |
55 | response = self.get('/api/statistics?keyword_name=种田')
56 | self.assertEqual(response.code, 404)
57 | predictive_response_content = {
58 | u"error": {
59 | u"message": u"找不到该关键词",
60 | u"code": 4041,
61 | u"name": u"ResourceNotFoundWebException",
62 | u'data': '',
63 | u'debug_message': '',
64 | }
65 | }
66 | self.assertDictEqual(predictive_response_content, json_decode(response.body))
67 |
68 | response = self.get('/api/statistics?keyword_name=java')
69 | self.assertEqual(response.code, 404)
70 | predictive_response_content = {
71 | u"error": {
72 | u"message": u"暂无该关键词的统计结果",
73 | u"code": 4041,
74 | u"name": u"ResourceNotFoundWebException",
75 | u'data': '',
76 | u'debug_message': '',
77 | }
78 | }
79 | self.assertDictEqual(predictive_response_content, json_decode(response.body))
80 |
81 |
82 | class TestKeywordStatisticsPageHandler(BaseHandlerTestCase):
83 |
84 | def test_get(self):
85 | response = self.get('/statistics?keyword_name=python')
86 | self.assertEqual(response.code, 200)
87 |
88 | def test_get_when_error(self):
89 | response = self.get('/api/statistics')
90 | self.assertEqual(response.code, 404)
91 |
92 | response = self.get('/api/statistics?keyword_name=种田')
93 | self.assertEqual(response.code, 404)
94 |
95 | response = self.get('/api/statistics?keyword_name=java')
96 | self.assertEqual(response.code, 404)
97 |
--------------------------------------------------------------------------------
/tests/util.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | import os
3 |
4 | from sqlalchemy import text
5 |
6 |
7 | def execute_sql_file(file_paths, db_session, predictive_db_name=''):
8 | if predictive_db_name:
9 | assert get_current_database_name(db_session) == predictive_db_name
10 |     for file_path in file_paths:
11 |         with open(file_path, 'r') as sql_file:
12 | 
13 |             sql_command = ''
14 | 
15 |             for line in sql_file:
16 |                 if not line.startswith('--'):
17 |                     sql_command += line.strip('\n')
18 | 
19 |                     if sql_command.endswith(';'):
20 |                         db_session.execute(text(sql_command))
21 |                         db_session.flush()
22 |                         sql_command = ''
23 |
24 |
25 | def get_current_database_name(db_session):
26 | return db_session.execute('select database();').scalar()
27 |
28 |
29 | def create_test_db(session, db_name='test_spider'):
30 |     """Create the test database and load its fixtures"""
31 |     # drop any leftover test database
32 |     drop_test_db(session)
33 |     # create the test database
34 | session.execute("CREATE DATABASE {db_name} CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci;".format(
35 | db_name=db_name))
36 |     # switch to the test database
37 | session.execute("USE {db_name};".format(db_name=db_name))
38 |
39 | path = os.path.dirname(__file__)
40 |     # create the tables
41 | execute_sql_file(
42 | file_paths=[os.path.join(path, "schema.sql"), ],
43 | db_session=session,
44 | predictive_db_name=db_name
45 | )
46 | fixture_path = os.path.join(path, 'fixture')
47 |     # load fixture data into the tables
48 | fixture_file_paths = [os.path.join(fixture_path, file) for file in os.listdir(fixture_path)]
49 | execute_sql_file(
50 | file_paths=fixture_file_paths,
51 | db_session=session,
52 | predictive_db_name=db_name
53 | )
54 |     assert get_current_database_name(session) == db_name
55 |
56 |
57 | def drop_test_db(session, db_name='test_spider'):
58 |     # drop the test database
59 | session.execute("DROP DATABASE IF EXISTS {db_name};".format(db_name=db_name))
60 |
--------------------------------------------------------------------------------
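
execute_sql_file buffers lines until one ends with a semicolon, so each multi-line statement reaches the session as a single string. Note that lines are joined without any separator, so fixture files must keep whitespace where SQL tokens would otherwise fuse. A self-contained illustration of the splitting loop (the file content is invented):

    lines = [
        '-- seed data\n',
        'INSERT INTO city (id, name)\n',
        "    VALUES (1, 'test');\n",
        'DELETE FROM city WHERE id = 1;\n',
    ]

    statements, buf = [], ''
    for line in lines:
        if not line.startswith('--'):   # comment lines are dropped
            buf += line.strip('\n')
            if buf.endswith(';'):       # a complete statement is ready
                statements.append(buf)
                buf = ''

    assert statements == ["INSERT INTO city (id, name)    VALUES (1, 'test');",
                          'DELETE FROM city WHERE id = 1;']
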
/webspider/__init__.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | __version__ = '0.0.2'
3 |
--------------------------------------------------------------------------------
/webspider/constants.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # flake8: noqa
3 | import os
4 |
5 | """
6 | Job types
7 | """
8 |
9 |
10 | class LGJobType(object):
11 | all = '全部'
12 | technology = '技术'
13 | product = '产品'
14 | design = '设计'
15 | operation = '运营'
16 | sell_and_market = '市场与销售'
17 | function = '职能'
18 |
19 |
20 | """
21 | Company finance stages
22 | """
23 | FINANCE_STAGE_DICT = {
24 | 'unknown': 0,
25 | '未融资': 1,
26 | '天使轮': 2,
27 | 'A轮': 3,
28 | 'B轮': 4,
29 | 'C轮': 5,
30 | 'D轮及以上': 6,
31 | '上市公司': 7,
32 | '不需要融资': 8,
33 | }
34 |
35 | """
36 | Job nature
37 | """
38 | JOB_NATURE_DICT = {
39 | 'unknown': 0,
40 | '全职': 1,
41 | '兼职': 2,
42 | '实习': 3,
43 | }
44 |
45 | """
46 | Work experience requirements
47 | """
48 | WORK_YEARS_REQUEST_DICT = {
49 | 'unknown': 0,
50 | '不限': 1,
51 | '应届毕业生': 2,
52 | '1年以下': 3,
53 | '1-3年': 4,
54 | '3-5年': 5,
55 | '5-10年': 6,
56 | '10年以上': 7,
57 | }
58 |
59 | """
60 | Education requirements
61 | """
62 | EDUCATION_REQUEST_DICT = {
63 | 'unknown': 0,
64 | '不限': 1,
65 | '大专': 2,
66 | '本科': 3,
67 | '硕士': 4,
68 | '博士': 5,
69 | }
70 |
71 | """
72 | Company sizes
73 | """
74 | COMPANY_SIZE_DICT = {
75 | 'unknown': 0,
76 | '少于15人': 1,
77 | '15-50人': 2,
78 | '50-150人': 3,
79 | '150-500人': 4,
80 | '500-2000人': 5,
81 | '2000人以上': 6,
82 | }
83 |
84 | """
85 | Other constants
86 | """
87 |
88 | DEBUG = (os.environ.get('ENV', 'dev') == 'dev')
89 |
90 | SECONDS_OF_DAY = 60 * 60 * 24
91 |
92 | REQUEST_TIMEOUT = 4
93 |
94 | # minimum crawler sleep time (seconds)
95 | MIN_SLEEP_SECS = 3
96 |
97 | # maximum crawler sleep time (seconds)
98 | MAX_SLEEP_SECS = 5
99 |
100 | """
101 | Redis key constants
102 | """
103 |
104 | CRAWLED_COMPANY_JOBS_REDIS_KEY = 'crawled_company_jobs_{lg_company_id}'
105 |
106 | """
107 | Field length limits
108 | """
109 | COMPANY_INTRODUCE_MAX_LEN = 2048
110 | COMPANY_ADVANTAGE_MAX_LEN = 256
111 | JOB_DESCRIPTION_MAX_LEN = 2048
112 | JOB_ADVANTAGE_MAX_LEN = 256
113 |
114 | """
115 | Retry settings
116 | """
117 | # maximum number of attempts before giving up
118 | RETRY_TIMES = 3
119 | # maximum total duration of a retried call (milliseconds)
120 | STOP_MAX_DELAY = 1000 * 30
121 | # fixed wait between two retries (milliseconds)
122 | WAIT_FIXED = 1000 * 2
123 |
124 | """
125 | HTTP constants (real values hidden to reduce the load on the lg site)
126 | """
127 | HTTP_HEADER = {}
128 |
129 | USER_AGENT_LIST = ['for_test']
130 |
131 | """
132 | Related URLs (real values hidden to reduce the load on the lg site)
133 | """
134 |
135 | JOB_JSON_URL = ''
136 |
137 | JOB_DETAIL_URL = ''
138 |
139 | COMPANY_DETAIL_URL = ''
140 |
141 | ALL_CITY_URL = ''
142 |
143 | COMPANIES_URL = ''
144 |
145 | COMPANY_JOBS_URL = ''
146 |
147 | # COMPANIES_URL sort field
148 | SORTED_BY_JOBS_COUNT = 1
149 |
150 | # load the real constant values in production and personal dev environments
151 | if os.environ.get('ENV', '') in ('production', 'dev'):
152 | from webspider.security_constants import *
153 |
--------------------------------------------------------------------------------
/webspider/controllers/__init__.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 |
--------------------------------------------------------------------------------
/webspider/controllers/city_ctl.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 |
3 | from sqlalchemy.exc import IntegrityError
4 |
5 | from webspider.models.city import CityModel
6 |
7 |
8 | def get_city_id_by_name(name):
9 | city = CityModel.get_one(filter_by={'name': name})
10 | if not city:
11 | raise ValueError('Get None when city name is {}'.format(name))
12 | return city.id
13 |
14 |
15 | def insert_city_if_not_exist(name):
16 | if CityModel.is_exist(filter_by={'name': name}):
17 | return
18 | try:
19 | city_id = CityModel.add(name=name)
20 | return city_id
21 | except IntegrityError:
22 | pass
23 |
24 |
25 | def get_city_name_dict():
26 | """
27 | :return: dict{city_name: city_id, ....} eg: {'北京': 2, '上海':3, ......}
28 | """
29 | cities = CityModel.list()
30 | return {city.name: city.id for city in cities}
31 |
--------------------------------------------------------------------------------
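
insert_city_if_not_exist is deliberately race-tolerant: if another worker inserts the same name between the is_exist check and the add, the IntegrityError raised by the (presumed) unique index on name is swallowed instead of aborting the crawl. A typical call-site sketch, where crawled_cities stands for the dicts returned by the city crawler:

    from webspider.controllers import city_ctl

    for city in crawled_cities:
        city_ctl.insert_city_if_not_exist(name=city.name)
    city_id = city_ctl.get_city_id_by_name('北京')  # raises ValueError when the name is absent
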
/webspider/controllers/industry_ctl.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | from sqlalchemy.exc import IntegrityError
3 |
4 | from webspider.models.industry import IndustryModel
5 |
6 |
7 | def insert_industry_if_not_exist(name):
8 | if IndustryModel.is_exist(filter_by={'name': name}):
9 | return
10 | try:
11 | industry_id = IndustryModel.add(name=name)
12 | return industry_id
13 | except IntegrityError:
14 | pass
15 |
16 |
17 | def get_industry_id_by_name(name):
18 | industry = IndustryModel.get_one(filter_by={'name': name})
19 | if not industry:
20 | raise ValueError('Get None when industry name is {}'.format(name))
21 | return industry.id
22 |
--------------------------------------------------------------------------------
/webspider/controllers/job_ctl.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import re
3 |
4 |
5 | def get_salary_section(string):
6 | """
7 |     e.g.:
8 | 15k-25k -> (15, 25)
9 | 15k以上 -> (15, 20)
10 | 15k以下 -> (10, 15)
11 | :param string: 15k-25k
12 | :return: 15,25
13 | """
14 | pattern = r'K|k|以上|以下'
15 | replace_char = ''
16 |
17 | if string.find('-') != -1:
18 | string = re.sub(pattern=pattern, repl=replace_char, string=string)
19 | start, end = string.split('-')
20 | elif string.endswith('以下'):
21 | string = re.sub(pattern=pattern, repl=replace_char, string=string)
22 | start, end = int(string) - 5 if int(string) - 5 >= 0 else 1, string
23 | elif string.endswith('以上'):
24 | string = re.sub(pattern=pattern, repl=replace_char, string=string)
25 | start, end = string, int(string) + 5
26 | else:
27 |         raise ValueError('error salary: ' + string)
28 |
29 | return int(start), int(end)
30 |
--------------------------------------------------------------------------------
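
A few concrete calls matching the docstring examples, with the open-ended rules spelled out:

    from webspider.controllers.job_ctl import get_salary_section

    assert get_salary_section('15k-25k') == (15, 25)
    assert get_salary_section('15k以上') == (15, 20)  # '以上': end = start + 5
    assert get_salary_section('15k以下') == (10, 15)  # '以下': start = end - 5, or 1 if that is negative
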
/webspider/controllers/job_keyword_ctl.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | from sqlalchemy import func
3 |
4 | from webspider.models.job_keyword import JobKeywordModel
5 |
6 |
7 | def get_most_frequently_keyword_ids(limit=None, offset=None):
8 |     """
9 |     Get the ids of the most frequently occurring keywords
10 |     :param limit:
11 |     :param offset:
12 |     :return: list of keyword ids
13 |     :rtype: List[int]
14 |     """
15 | result = JobKeywordModel.list(columns=JobKeywordModel.keyword_id, group_by=JobKeywordModel.keyword_id,
16 | order_by=func.count(JobKeywordModel.id).desc(), limit=limit, offset=offset)
17 | return [item[0] for item in result]
18 |
--------------------------------------------------------------------------------
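
The helper compiles down to a grouped count; a usage sketch with the SQL it roughly emits:

    from webspider.controllers.job_keyword_ctl import get_most_frequently_keyword_ids

    top_keyword_ids = get_most_frequently_keyword_ids(limit=10)
    # roughly equivalent to:
    #   SELECT keyword_id FROM job_keyword
    #   GROUP BY keyword_id
    #   ORDER BY COUNT(id) DESC
    #   LIMIT 10
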
/webspider/controllers/keyword_ctl.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 |
3 | from sqlalchemy.exc import IntegrityError
4 |
5 | from webspider.models.keyword import KeywordModel
6 |
7 |
8 | def insert_keyword_if_not_exist(name):
9 | if KeywordModel.is_exist(filter_by={'name': name}):
10 | return
11 | try:
12 | keyword_id = KeywordModel.add(name=name)
13 | return keyword_id
14 | except IntegrityError:
15 | pass
16 |
17 |
18 | def get_keyword_name_by_id(keyword_id):
19 | keyword = KeywordModel.get_by_pk(keyword_id)
20 | if not keyword:
21 | raise ValueError('Get None when keyword id is {}'.format(keyword_id))
22 | return keyword.name
23 |
24 |
25 | def get_keyword_id_by_name(name):
26 | keyword = KeywordModel.get_one(filter_by={'name': name})
27 | if not keyword:
28 |         raise ValueError('Get None when keyword name is {}'.format(name))
29 | return keyword.id
30 |
--------------------------------------------------------------------------------
/webspider/controllers/keyword_statistic_ctl.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | from collections import Counter
3 |
4 | from webspider import utils
5 | from webspider import constants
6 | from webspider.models import CompanyModel
7 | from webspider.controllers import city_ctl, job_ctl
8 |
9 |
10 | def get_salary_statistic(jobs):
11 |     """
12 |     Get salary statistics
13 | 
14 |     :param jobs: webspider.models.JobModel instances list
15 |     :return: collections.Counter
16 |     """
17 | salary_statistic = Counter()
18 | for job in jobs:
19 | start_salary, end_salary = job_ctl.get_salary_section(job.salary)
20 | if start_salary <= 10:
21 | salary_statistic['10k及以下'] += 1
22 | if start_salary <= 20 and end_salary >= 11:
23 | salary_statistic['11k-20k'] += 1
24 | if start_salary <= 35 and end_salary >= 21:
25 | salary_statistic['21k-35k'] += 1
26 | if start_salary <= 60 and end_salary >= 36:
27 | salary_statistic['36k-60k'] += 1
28 | if end_salary >= 61:
29 | salary_statistic['61k以上'] += 1
30 | return salary_statistic
31 |
32 |
33 | def get_finance_stage_statistic(jobs):
34 |     """
35 |     Get finance-stage statistics for the companies behind the given jobs
36 | 
37 |     :param jobs: webspider.models.JobModel instances list
38 |     :return: collections.Counter
39 |     """
40 | company_ids = [job.company_id for job in jobs]
41 | companies = CompanyModel.list(filter=CompanyModel.id.in_(company_ids))
42 |
43 | finance_stage_statistic = utils.common.get_field_statistics(values=[company.finance_stage for company in companies],
44 | constants_dict=constants.FINANCE_STAGE_DICT)
45 | return finance_stage_statistic
46 |
47 |
48 | def get_educations_statistic(jobs):
49 |     """
50 |     Get education requirement statistics
51 | 
52 |     :param jobs: webspider.models.JobModel instances list
53 |     :return: collections.Counter
54 |     """
55 | return utils.common.get_field_statistics(values=[job.education for job in jobs],
56 | constants_dict=constants.EDUCATION_REQUEST_DICT)
57 |
58 |
59 | def get_work_years_statistic(jobs):
60 |     """
61 |     Get work-experience requirement statistics
62 | 
63 |     :param jobs: webspider.models.JobModel instances list
64 |     :return: collections.Counter
65 |     """
66 | return utils.common.get_field_statistics(values=[job.work_year for job in jobs],
67 | constants_dict=constants.WORK_YEARS_REQUEST_DICT)
68 |
69 |
70 | def get_city_jobs_count_statistic(jobs, limit=10):
71 |     """
72 |     Get per-city jobs count statistics
73 |     :param jobs: webspider.models.JobModel instances list
74 |     :param limit: keep only the top `limit` cities by jobs count
75 |     :return: collections.Counter
76 |     """
77 | city_name_dict = city_ctl.get_city_name_dict()
78 | city_job_count = utils.common.get_field_statistics(values=[job.city_id for job in jobs],
79 | constants_dict=city_name_dict)
80 | city_job_count = sorted(city_job_count.items(), key=lambda x: x[1], reverse=True)
81 | if limit:
82 | city_job_count = city_job_count[:limit]
83 | return Counter({item[0]: item[1] for item in city_job_count})
84 |
--------------------------------------------------------------------------------
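
Note that the if-branches in get_salary_statistic are not exclusive: a salary range is counted in every bucket it overlaps. A self-contained check, using a namedtuple in place of JobModel:

    from collections import Counter, namedtuple

    from webspider.controllers.keyword_statistic_ctl import get_salary_statistic

    FakeJob = namedtuple('FakeJob', ['salary'])

    jobs = [FakeJob(salary='15k-25k'),  # (15, 25) overlaps two buckets
            FakeJob(salary='8k以下')]    # (3, 8) falls in a single bucket
    assert get_salary_statistic(jobs) == Counter(
        {'11k-20k': 1, '21k-35k': 1, '10k及以下': 1})
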
/webspider/crawlers/__init__.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | from webspider.crawlers.lagou_cites import get_cites_from_lg
3 | from webspider.crawlers.lagou_companies import (get_companies_pagination_from_lg, get_companies_from_lg,
4 |                                                 get_company_detail_from_lg, clean_lg_company_data, )
5 | from webspider.crawlers.lagou_jobs import (get_jobs_pagination_from_lg, get_jobs_from_lg,
6 |                                            get_job_detail_from_lg, clean_lg_job_data, )
7 | from webspider.crawlers.lagou_jobs_count import get_jobs_count_from_lg
8 |
9 | __all__ = ['get_cites_from_lg', 'get_companies_pagination_from_lg', 'get_companies_from_lg',
10 | 'get_company_detail_from_lg', 'clean_lg_company_data', 'get_jobs_pagination_from_lg',
11 | 'get_jobs_from_lg', 'get_job_detail_from_lg', 'clean_lg_job_data', 'get_jobs_count_from_lg']
12 |
--------------------------------------------------------------------------------
/webspider/crawlers/lagou_cites.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | import re
3 | import logging
4 |
5 | import requests
6 | from lxml import etree
7 | from tornado.util import ObjectDict
8 |
9 | from webspider import constants
10 |
11 | logger = logging.getLogger(__name__)
12 |
13 |
14 | def get_cites_from_lg():
15 |     """
16 |     Crawl city data
17 | 
18 |     Keys of the returned dicts:
19 |         id:
20 |             type: int
21 |             meaning: city id
22 |             eg: 1
23 |         name:
24 |             type: str
25 |             meaning: city name
26 |             eg: 北京
27 | 
28 |     :return: list of city data
29 |     :rtype: List[tornado.util.ObjectDict]
30 |     """
31 | logger.info(u'begin crawl cities info......')
32 |
33 | response_html = etree.HTML(requests.get(constants.ALL_CITY_URL).text)
34 | cities_html_list = response_html.xpath("//ul[@class='city_list']/li/a")
35 |
36 | cities_dicts = []
37 | for city_html in cities_html_list:
38 | city_name = city_html.xpath('./text()')[0]
39 |         city_id = int(re.findall(pattern=r'/(\d+)-\d+-\d+', string=city_html.xpath('./@href')[0])[0])
40 | cities_dicts.append(ObjectDict(id=city_id, name=city_name))
41 |
42 |     logger.info(u'crawl cities info finished! cities quantity is {cities_count}'.format(
43 | cities_count=len(cities_dicts)))
44 | return cities_dicts
45 |
--------------------------------------------------------------------------------
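
The regex above pulls the numeric city id out of each link target; a quick check against a hypothetical href of the expected shape:

    import re

    href = '/allCity/2-0-0.html'  # invented example; only the /<digits>-<digits>-<digits> part matters
    assert re.findall(pattern=r'/(\d+)-\d+-\d+', string=href) == ['2']
    # findall yields strings, hence the int() cast in the crawler above
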
/webspider/crawlers/lagou_companies.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | import re
3 | import json
4 | import logging
5 |
6 | from lxml import etree
7 | from tornado.util import ObjectDict
8 |
9 | from webspider import utils
10 | from webspider import constants
11 |
12 | logger = logging.getLogger(__name__)
13 |
14 |
15 | def get_companies_pagination_from_lg(city_id=0, finance_stage_id=0, industry_id=0, page_no=1):
16 |     """
17 |     Crawl company pagination data
18 | 
19 |     :param city_id: city id
20 |     :param finance_stage_id: finance stage id
21 |     :param industry_id: industry id
22 |     :param page_no: page number
23 |     :return: company pagination data
24 |     :rtype: utils.pagination.Pagination
25 |     """
26 | url = constants.COMPANIES_URL.format(city_id=city_id,
27 | finance_stage_id=finance_stage_id,
28 | industry_id=industry_id)
29 |
30 | params = {'pn': page_no, 'sortField': constants.SORTED_BY_JOBS_COUNT}
31 | response_json = utils.http_tools.requests_get(url=url, params=params).json()
32 | pagination = utils.pagination.Pagination(per_page=int(response_json['pageSize']),
33 | total=int(response_json['totalCount']))
34 |
35 | return pagination
36 |
37 |
38 | def get_companies_from_lg(city_id=0, finance_stage_id=0, industry_id=0, page_no=1):
39 |     """
40 |     Crawl company data
41 | 
42 |     Keys of the returned dicts:
43 |         lg_company_id:
44 |             type: int
45 |             meaning: company id used by the API
46 |             eg: 1
47 |         city_name:
48 |             type: str
49 |             meaning: city name
50 |             eg: 北京
51 |         shortname:
52 |             type: str
53 |             meaning: company short name
54 |             eg: 知乎
55 |         fullname:
56 |             type: str
57 |             meaning: company full name
58 |             eg: 智者四海北京科技有限公司
59 |         finance_stage:
60 |             type: str
61 |             meaning: finance stage
62 |             eg: D轮
63 |         features:
64 |             type: str
65 |             meaning: company slogan, one-sentence introduction
66 |             eg: 发现更大的世界
67 |         process_rate:
68 |             type: int
69 |             meaning: resume processing rate
70 |             eg: 94
71 |         industries:
72 |             type: str
73 |             meaning: industries the company belongs to
74 |             eg: '互联网,社交' or '互联网'
75 |         advantage:
76 |             type: List[str]
77 |             meaning: company advantages
78 |             eg: ['双休', '五险一金', ......]
79 |         address:
80 |             type: str
81 |             meaning: company address
82 |             eg: 北京市海淀区学院路768创意园
83 |         size:
84 |             type: str
85 |             meaning: company size
86 |             eg: 2000人以上
87 |         introduce:
88 |             type: List[str]
89 |             meaning: company introduction
90 |             eg: ['我们的愿景:', 'blablabla', '我们处于一个知识 balala...']
91 | 
92 |     note: advantage, address, size and introduce are fetched from the
93 |     company detail page (see get_company_detail_from_lg)
94 | 
95 |     :param city_id: city id
96 |     :param finance_stage_id: finance stage id
97 |     :param industry_id: industry id
98 |     :param page_no: page number
99 |     :return: list of company data
100 |     :rtype: List[tornado.util.ObjectDict]
101 | 
102 |     """
103 | url = constants.COMPANIES_URL.format(city_id=city_id,
104 | finance_stage_id=finance_stage_id,
105 | industry_id=industry_id)
106 | params = {'pn': page_no, 'sortField': constants.SORTED_BY_JOBS_COUNT}
107 | companies = utils.http_tools.requests_get(url=url, params=params).json()['result']
108 |
109 | companies_dicts = []
110 | for company in companies:
111 | lg_company_id = int(company.get('companyId'))
112 |
113 | company_detail = get_company_detail_from_lg(lg_company_id=lg_company_id)
114 | companies_dicts.append(ObjectDict(
115 | lg_company_id=lg_company_id,
116 | city_name=company.get('city'),
117 | shortname=company.get('companyShortName'),
118 | fullname=company.get('companyFullName'),
119 | finance_stage=company.get('financeStage'),
120 | features=company.get('companyFeatures'),
121 | process_rate=company.get('processRate'),
122 | industries=company.get('industryField'),
123 | # company detail
124 | advantage=company_detail.get('advantage'),
125 | address=company_detail.get('address'),
126 | size=company_detail.get('size'),
127 | introduce=company_detail.get('introduce')
128 | ))
129 | return companies_dicts
130 |
131 |
132 | def get_company_detail_from_lg(lg_company_id):
133 |     """
134 |     Crawl company detail page data
135 | 
136 |     Keys of the returned dict:
137 |         advantage:
138 |             type: List[str]
139 |             meaning: company advantages
140 |             eg: ['双休', '五险一金', ......]
141 |         address:
142 |             type: str
143 |             meaning: company address
144 |             eg: 北京市海淀区学院路768创意园
145 |         size:
146 |             type: str
147 |             meaning: company size
148 |             eg: 2000人以上
149 |         introduce:
150 |             type: List[str]
151 |             meaning: company introduction
152 |             eg: ['我们的愿景:', 'blablabla', '我们处于一个知识 balala...']
153 | 
154 |     :param lg_company_id: company id used by the API
155 |     :return: company detail page data
156 |     :rtype: tornado.util.ObjectDict
157 |     """
158 | response = utils.http_tools.requests_get(
159 | url=constants.COMPANY_DETAIL_URL.format(lg_company_id=lg_company_id))
160 | company_detail_html = etree.HTML(response.text)
161 |
162 | advantage = company_detail_html.xpath('//div[@id="tags_container"]//li/text()')
163 | sizes = company_detail_html.xpath('//div[@id="basic_container"]//li[3]/span/text()')
164 | address = company_detail_html.xpath('//p[@class="mlist_li_desc"]/text()')
165 | introduces = company_detail_html.xpath('//span[@class="company_content"]//text()')
166 |
167 | if not sizes:
168 | logger.error(
169 | 'can not get size by lg_company_id = {}, html code is \n{}'.format(lg_company_id, response.text))
170 |
171 | return ObjectDict(
172 | advantage=advantage,
173 | address=address[0] if address else '',
174 | size=sizes[0] if sizes else '',
175 | introduce=introduces,
176 | )
177 |
178 |
179 | def clean_lg_company_data(company_dict):
180 | """
181 |     Clean the crawled company data
182 |
183 | :param company_dict: tornado.util.ObjectDict
184 | """
185 | if 'size' in company_dict:
186 | company_dict.size = company_dict.size.strip()
187 | if 'finance_stage' in company_dict:
188 | company_dict.finance_stage = company_dict.finance_stage.strip()
189 | if 'features' in company_dict:
190 | company_dict.features = utils.text.to_plaintext(company_dict.features)
191 | if 'address' in company_dict:
192 | company_dict.address = utils.text.to_plaintext(company_dict.address)
193 | if 'introduce' in company_dict:
194 | company_dict.introduce = ''.join(company_dict.introduce) if company_dict.introduce else ''
195 | company_dict.introduce = company_dict.introduce[:constants.COMPANY_INTRODUCE_MAX_LEN]
196 | if 'advantage' in company_dict:
197 | company_dict.advantage = list(map(utils.text.to_plaintext, company_dict.advantage))
198 | company_dict.advantage = json.dumps(company_dict.advantage)[
199 | :constants.COMPANY_ADVANTAGE_MAX_LEN]
200 | if 'industries' in company_dict:
201 | company_dict.industries = set(re.split(r",|,|、|\s", company_dict.industries))
202 |
--------------------------------------------------------------------------------
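
A before/after sketch of the cleaning step on two of the fields (the values are invented):

    from tornado.util import ObjectDict

    from webspider.crawlers import clean_lg_company_data

    company = ObjectDict(size=' 2000人以上 ', industries='互联网,社交')
    clean_lg_company_data(company)
    assert company.size == '2000人以上'
    assert company.industries == {'互联网', '社交'}  # split on ',', ',', '、' or whitespace
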
/webspider/crawlers/lagou_jobs.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | import logging
3 |
4 | from lxml import etree
5 | from tornado.util import ObjectDict
6 |
7 | from webspider import utils
8 | from webspider import constants
9 |
10 | logger = logging.getLogger(__name__)
11 |
12 |
13 | def get_jobs_pagination_from_lg(lg_company_id, job_type, page_no=1, is_school_job=False):
14 | """
15 |     Crawl job pagination data
16 | 
17 |     :param lg_company_id: company id used by the API
18 |     :param job_type: job type
19 |     :param page_no: page number
20 |     :param is_school_job: whether to crawl campus-recruitment jobs
21 |     :return: job pagination data (utils.pagination.Pagination)
22 | """
23 | params = {
24 | 'companyId': lg_company_id,
25 | 'positionFirstType': job_type,
26 | 'schoolJob': is_school_job,
27 | 'pageNo': page_no,
28 | 'pageSize': 10,
29 | }
30 | response_json = utils.http_tools.requests_get(
31 | url=constants.COMPANY_JOBS_URL, params=params).json()
32 | pagination = utils.pagination.Pagination(per_page=int(response_json['content']['data']['page']['pageSize']),
33 | total=int(response_json['content']['data']['page']['totalCount']))
34 |
35 | return pagination
36 |
37 |
38 | def get_jobs_from_lg(lg_company_id, job_type, page_no=1, is_school_job=False):
39 |     """
40 |     Crawl job data
41 | 
42 |     Keys of the returned dicts:
43 |         lg_job_id:
44 |             type: int
45 |             meaning: job id used by the API
46 |             eg: 1
47 |         city_name:
48 |             type: str
49 |             meaning: city name
50 |             eg: 北京
51 |         title:
52 |             type: str
53 |             meaning: job title
54 |             eg: 招聘后端工程师
55 |         salary:
56 |             type: str
57 |             meaning: salary range
58 |             eg: '10k~20k'
59 |         education:
60 |             type: str
61 |             meaning: education requirement
62 |             eg: 本科或以上
63 |         nature:
64 |             type: str
65 |             meaning: job nature
66 |             eg: 全职
67 |         work_year:
68 |             type: str
69 |             meaning: work experience requirement
70 |             eg: 1~3年
71 |         advantage:
72 |             type: str
73 |             meaning: job advantages
74 |             eg: 大平台,五险一金
75 |         department:
76 |             type: str
77 |             meaning: hiring department
78 |             eg: 商业部
79 |         keywords:
80 |             type: List[str]
81 |             meaning: job keywords
82 |             eg: ['后端', 'Web', 'Python']
83 |         description:
84 |             type: List[str]
85 |             meaning: job description
86 |             eg: ['职位要求:', 'blablabla', '.......']
87 | 
88 |     note: department, keywords and description are fetched from the job detail page
89 |     :param lg_company_id: company id used by the API
90 |     :param job_type: job type
91 |     :param page_no: page number
92 |     :param is_school_job: whether to crawl campus-recruitment jobs
93 |     :return: list of job data
94 |     :rtype: List[tornado.util.ObjectDict]
95 |     """
96 | params = {
97 | 'companyId': lg_company_id,
98 | 'positionFirstType': job_type,
99 | 'schoolJob': is_school_job,
100 | 'pageNo': page_no,
101 | 'pageSize': 10,
102 | }
103 | response_json = utils.http_tools.requests_get(
104 | url=constants.COMPANY_JOBS_URL, params=params).json()
105 | jobs = response_json['content']['data']['page']['result']
106 |
107 | jobs_dicts = []
108 | for job in jobs:
109 | lg_job_id = job['positionId']
110 | job_detail = get_job_detail_from_lg(lg_job_id=lg_job_id)
111 | jobs_dicts.append(ObjectDict(
112 | lg_job_id=lg_job_id,
113 | city_name=job.get('city'),
114 | title=job.get('positionName'),
115 | salary=job.get('salary'),
116 | education=job.get('education'),
117 | nature=job.get('jobNature'),
118 | work_year=job.get('workYear'),
119 | advantage=job.get('positionAdvantage', ''),
120 | # job detail
121 | department=job_detail.get('department'),
122 | keywords=job_detail.get('keywords'),
123 | description=job_detail.get('description'),
124 | ))
125 | return jobs_dicts
126 |
127 |
128 | def get_job_detail_from_lg(lg_job_id):
129 |     """
130 |     Crawl job detail page data
131 | 
132 |     Keys of the returned dict:
133 |         department:
134 |             type: str
135 |             meaning: hiring department
136 |             eg: 商业部
137 |         keywords:
138 |             type: List[str]
139 |             meaning: job keywords
140 |             eg: ['后端', 'Web', 'Python']
141 |         description:
142 |             type: List[str]
143 |             meaning: job description
144 |             eg: ['职位要求:', 'blablabla', '.......']
145 | 
146 |     :param lg_job_id: job id used by the API
147 |     :return: job detail page data
148 |     :rtype: tornado.util.ObjectDict
149 |     """
150 | response = utils.http_tools.requests_get(
151 | url=constants.JOB_DETAIL_URL.format(lg_job_id=lg_job_id))
152 | job_detail_html = etree.HTML(response.text)
153 |
154 | department = job_detail_html.xpath('//div[@class="job-name"]/div[@class="company"]/text()')
155 | description = job_detail_html.xpath('//dd[@class="job_bt"]/div//text()')
156 | keywords = job_detail_html.xpath('//dd[@class="job_request"]//li[@class="labels"]/text()')
157 |
158 | if not department:
159 | logger.error('can not get department by lg_job_id = {}, html is \n {}'.format(
160 | lg_job_id, response.text))
161 |
162 | return ObjectDict(
163 | department=department[0] if department else '',
164 | description=description,
165 | keywords=keywords,
166 | )
167 |
168 |
169 | def clean_lg_job_data(job_dict):
170 | """
171 |     Clean the crawled job data
172 |
173 | :param job_dict: tornado.util.ObjectDict
174 | """
175 | if 'keywords' in job_dict:
176 | job_dict.keywords = set(map(lambda keyword: keyword.strip().lower(), job_dict.keywords))
177 | if 'description' in job_dict:
178 | job_dict.description = ''.join(job_dict.description) if job_dict.description else ''
179 | job_dict.description = job_dict.description[:constants.JOB_DESCRIPTION_MAX_LEN]
180 | if 'advantage' in job_dict:
181 | job_dict.advantage = job_dict.advantage[:constants.JOB_ADVANTAGE_MAX_LEN]
182 |
--------------------------------------------------------------------------------
/webspider/crawlers/lagou_jobs_count.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | from webspider.constants import JOB_JSON_URL
3 | from webspider.utils.http_tools import requests_post, generate_http_request_headers
4 |
5 |
6 | def get_jobs_count_from_lg(city_name, keyword_name):
7 | """
8 |     Crawl jobs count
9 | 
10 |     :param city_name: city name
11 |     :param keyword_name: keyword name
12 |     :return: jobs count for the keyword in the given city, e.g. the number of python jobs in Beijing
13 | :rtype: int
14 | """
15 | query_string = {'needAddtionalResult': False}
16 | if city_name != '全国':
17 | query_string['city'] = city_name
18 | form_data = {
19 | 'first': False,
20 | 'pn': 1,
21 | 'kd': keyword_name
22 | }
23 | headers = generate_http_request_headers(
24 | referer='https://www.lg.com/jobs/list_java?labelWords=&fromSearch=true')
25 | response_json = requests_post(url=JOB_JSON_URL, params=query_string,
26 | data=form_data, headers=headers).json()
27 | return int(response_json['content']['positionResult']['totalCount'])
28 |
--------------------------------------------------------------------------------
/webspider/exceptions.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | __all__ = ['BaseException', 'ResourceNotFoundWebException', 'DowngradeException']
4 |
5 |
6 | class BaseException(Exception):
7 | ERROR_CODE = None
8 | STATUS_CODE = 200
9 |
10 | def __init__(self, message, data=None, debug_message=None):
11 | if self.ERROR_CODE is None:
12 | raise NotImplementedError()
13 | self._message = message
14 | self._data = dict(data) if data else None
15 | self._debug_message = debug_message
16 |
17 | @property
18 | def code(self):
19 | return self.ERROR_CODE
20 |
21 | @property
22 | def message(self):
23 | return self._message
24 |
25 | @property
26 | def data(self):
27 | return self._data
28 |
29 | @property
30 | def debug_message(self):
31 | return self._debug_message
32 |
33 | def __str__(self):
34 | return "Exception: code={code}, message={message}, data={data}, debug_message={debug_message}".format(
35 | code=self.code, message=self.message, data=self.data, debug_message=self.debug_message)
36 |
37 | def __repr__(self):
38 | return self.__str__()
39 |
40 |
41 | class ResourceNotFoundWebException(BaseException):
42 | """
43 | Corresponding to HTTP code 404
44 | """
45 | ERROR_CODE = 4041
46 | STATUS_CODE = 404
47 |
48 | def __init__(self, message=u'资源不存在', data=None, debug_message=None):
49 | super(ResourceNotFoundWebException, self).__init__(message, data, debug_message)
50 |
51 |
52 | class DowngradeException(BaseException):
53 | ERROR_CODE = 101
54 |
55 | def __init__(self, message=u'降级异常', data=None, debug_message=None):
56 | super(DowngradeException, self).__init__(message, data, debug_message)
57 |
--------------------------------------------------------------------------------
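
Adding a new error type only requires an ERROR_CODE (plus a STATUS_CODE when it maps onto an HTTP status); a hypothetical example subclass:

    from webspider.exceptions import BaseException  # the module's class, not the builtin

    class RateLimitedWebException(BaseException):
        ERROR_CODE = 4291
        STATUS_CODE = 429

        def __init__(self, message=u'too many requests', data=None, debug_message=None):
            super(RateLimitedWebException, self).__init__(message, data, debug_message)
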
/webspider/models/__init__.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | from webspider.models.city import CityModel
3 | from webspider.models.job import JobModel
4 | from webspider.models.jobs_count import JobsCountModel
5 | from webspider.models.company import CompanyModel
6 | from webspider.models.company_industry import CompanyIndustryModel
7 | from webspider.models.industry import IndustryModel
8 | from webspider.models.job_keyword import JobKeywordModel
9 | from webspider.models.keyword import KeywordModel
10 | from webspider.models.keyword_statistic import KeywordStatisticModel
11 |
12 | __all__ = ['CityModel', 'JobModel', 'JobsCountModel', 'CompanyModel', 'CompanyIndustryModel', 'IndustryModel',
13 | 'JobKeywordModel', 'KeywordModel', 'KeywordStatisticModel']
14 |
--------------------------------------------------------------------------------
/webspider/models/base.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 |
3 | import logging
4 |
5 | from sqlalchemy import MetaData, inspect, func, text
6 | from sqlalchemy.ext.declarative import declarative_base
7 | from tornado.util import ObjectDict
8 |
9 | from webspider.utils import sql
10 | from webspider.utils.classproperty import classproperty
11 |
12 | __all__ = ['BaseModel']
13 |
14 | logger = logging.getLogger(__name__)
15 |
16 | _Base = declarative_base()
17 |
18 |
19 | class BaseModel(_Base):
20 | __abstract__ = True
21 | __table_args__ = {
22 | 'mysql_engine': 'InnoDB',
23 | 'mysql_charset': 'utf8mb4',
24 | 'extend_existing': True,
25 | }
26 |
27 | metadata = MetaData(bind=sql.db_engine, reflect=True)
28 |
29 | @classproperty
30 | def session(cls):
31 | return sql.get_session()
32 |
33 | @classproperty
34 | def pk_name(cls):
35 |         """Primary key name"""
36 | return inspect(cls).primary_key[0].name
37 |
38 | @classproperty
39 | def pk(cls):
40 |         """Primary key column of the table"""
41 | return getattr(cls, cls.pk_name)
42 |
43 | def dict(self):
44 | """sqlalchemy object -> dict"""
45 | columns = self.__table__.columns.keys()
46 | return ObjectDict((column, getattr(self, column)) for column in columns)
47 |
48 | @classmethod
49 | def count(cls, filter=None, filter_by=None):
50 | """
51 |         Get the number of records in the database
52 | :param filter: apply the given filtering criterion to a copy of this Query,
53 | using SQL expressions.
54 | :param filter_by: apply the given filtering criterion to a copy of this Query,
55 | using keyword expressions as a dict.
56 | :return:
57 | """
58 | query = cls.session.query(func.count(cls.pk))
59 |
60 | if filter is not None:
61 | query = query.filter(filter)
62 | if filter_by is not None:
63 | query = query.filter_by(**filter_by)
64 |
65 | return query.scalar()
66 |
67 | @classmethod
68 | def add(cls, **values):
69 |         """Add a record"""
70 | obj = cls(**values)
71 | cls.session.add(obj)
72 | cls.session.flush()
73 | return getattr(obj, obj.pk_name)
74 |
75 | @classmethod
76 | def get_by_pk(cls, pk):
77 |         """Get a record by its primary key value"""
78 | query = cls.session.query(cls).filter(cls.pk == pk)
79 | return query.scalar()
80 |
81 | @classmethod
82 | def get_one(cls, filter=None, filter_by=None):
83 | """
84 |         Get a single record
85 | :param filter: apply the given filtering criterion to a copy of this Query,
86 | using SQL expressions.
87 | :param filter_by: apply the given filtering criterion to a copy of this Query,
88 | using keyword expressions as a dict.
89 | :return:
90 | """
91 | query = cls.session.query(cls)
92 |
93 | if filter is not None:
94 | query = query.filter(filter)
95 | if filter_by is not None:
96 | query = query.filter_by(**filter_by)
97 |
98 | return query.first()
99 |
100 | @classmethod
101 | def list(cls, columns=None, filter=None, filter_by=None, order_by=None, group_by=None, offset=None, limit=None):
102 | """
103 |         Get records in batch
104 | :param columns: the columns you want to query, SQL expression, column, or mapped entity expected
105 | :param filter: apply the given filtering criterion to a copy of this Query,
106 | using SQL expressions.
107 | :param filter_by: apply the given filtering criterion to a copy of this Query,
108 | using keyword expressions as a dict.
109 | :param order_by: apply one or more ORDER BY criterion to the query and return
110 | the newly resulting ``Query``
111 | :param group_by: apply one or more GROUP BY criterion to the query and return
112 | the newly resulting :class:`.Query`
113 | :param offset: Apply an ``OFFSET`` to the query and return the newly resulting
114 | ``Query``.
115 | :param limit: Apply a ``LIMIT`` to the query and return the newly resulting
116 | ``Query``.
117 | :return:
118 | """
119 | query = cls.session.query(cls)
120 | if columns:
121 | query = cls.session.query(columns)
122 | if filter is not None:
123 | query = query.filter(filter)
124 | if filter_by is not None:
125 | query = query.filter_by(**filter_by)
126 | if group_by is not None:
127 | query = query.group_by(group_by)
128 | if order_by is not None:
129 | query = query.order_by(order_by)
130 | if offset is not None:
131 | query = query.offset(offset)
132 | if limit is not None:
133 | query = query.limit(limit)
134 |
135 | result = query.all()
136 |
137 | return result
138 |
139 | @classmethod
140 | def is_exist(cls, filter=None, filter_by=None):
141 | """
142 |         Check whether a matching record exists
143 | :param filter: apply the given filtering criterion to a copy of this Query,
144 | using SQL expressions.
145 | :param filter_by: apply the given filtering criterion to a copy of this Query,
146 | using keyword expressions as a dict.
147 | :return: boolean
148 | """
149 |
150 | return cls.count(filter=filter, filter_by=filter_by) != 0
151 |
152 | @classmethod
153 | def update(cls, filter=None, filter_by=None, values=None):
154 | """更新数据
155 | :param filter: apply the given filtering criterion to a copy of this Query,
156 | using SQL expressions.
157 | :param filter_by: apply the given filtering criterion to a copy of this Query,
158 | using keyword expressions as a dict.
159 | :param values: values to update
160 | :return: type: int, affected rows
161 | """
162 | query = cls.session.query(cls)
163 |
164 | if filter is not None:
165 | query = query.filter(filter)
166 |
167 | if filter_by is not None:
168 | query = query.filter_by(**filter_by)
169 |
170 | affect_rows = query.update(values)
171 | return affect_rows
172 |
173 | @classmethod
174 | def update_by_pk(cls, pk, values):
175 |         """Update a record by its primary key
176 | 
177 |         :param pk: primary key value
178 |         :param values: dict of values to update, in key=value form
179 |         :return: number of affected rows
180 | """
181 | return cls.update(filter=(cls.pk == pk), values=values)
182 |
183 | @classmethod
184 | def execute_sql_string(cls, sql_string, parameters_dict=None):
185 | """
186 |         Execute a raw SQL statement directly
187 | eg:
188 | sql_string = 'select * from temp where id = :numbers' and parameters_dict = {'numbers': 1}
189 | >> select * from temp where id = 1
190 | :param sql_string: the sql string you want to execute
191 | :param parameters_dict: parameters
192 | :return: if query returns_rows return rows(List(tuple)) else return affect_rows(int)
193 | """
194 | query = cls.session.execute(text(sql_string), parameters_dict)
195 | if query.returns_rows:
196 | return query.fetchall()
197 | else:
198 | return query.rowcount
199 |
200 | @classmethod
201 | def batch_add(cls, instances):
202 |         """Batch-add records"""
203 | if not all([isinstance(instance, cls) for instance in instances]):
204 | raise ValueError('all instances must be {table_name} model instance'.format(table_name=cls.__tablename__))
205 | cls.session.bulk_save_objects(instances)
206 |
--------------------------------------------------------------------------------
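
Every model inherits these helpers, so call sites stay uniform; a sketch using CityModel (defined in the next file):

    from webspider.models import CityModel

    beijing = CityModel.get_one(filter_by={'name': '北京'})  # keyword-style filtering
    cities = CityModel.list(filter=CityModel.id > 100,       # SQL-expression filtering
                            order_by=CityModel.id.desc(), limit=10)
    affected_rows = CityModel.update_by_pk(pk=1, values={'name': '北京'})
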
/webspider/models/city.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | from datetime import datetime
3 |
4 | from sqlalchemy import Column
5 | from sqlalchemy.dialects.mysql import INTEGER, VARCHAR, TIMESTAMP
6 |
7 | from webspider.models.base import BaseModel
8 |
9 |
10 | class CityModel(BaseModel):
11 | __tablename__ = 'city'
12 |
13 | id = Column(INTEGER, primary_key=True, nullable=False, autoincrement=True)
14 |     name = Column(VARCHAR(64), nullable=False, doc=u'city name')
15 |     created_at = Column(TIMESTAMP, nullable=False, default=datetime.now, doc=u'created time')
16 |     updated_at = Column(TIMESTAMP, nullable=False, default=datetime.now, onupdate=datetime.now, doc=u'last updated time')
17 |
--------------------------------------------------------------------------------
/webspider/models/company.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | from datetime import datetime
3 |
4 | from sqlalchemy import Column
5 | from sqlalchemy.dialects.mysql import INTEGER, VARCHAR, TIMESTAMP, TINYINT
6 |
7 | from webspider import constants
8 | from webspider.models.base import BaseModel
9 |
10 |
11 | class CompanyModel(BaseModel):
12 | __tablename__ = 'company'
13 |
14 | id = Column(INTEGER, nullable=False, primary_key=True, autoincrement=True)
15 |     lg_company_id = Column(INTEGER, nullable=False, doc=u'company id used by the API')
16 |     city_id = Column(INTEGER, nullable=False, doc=u'id of the city the company is located in')
17 |     shortname = Column(VARCHAR(64), nullable=False, doc=u'company short name')
18 |     fullname = Column(VARCHAR(128), nullable=False, doc=u'company full name')
19 |     finance_stage = Column(TINYINT, nullable=False, doc=u'finance stage')
20 |     size = Column(TINYINT, nullable=False, doc=u'company size')
21 |     address = Column(VARCHAR(128), nullable=False, doc=u'company address')
22 |     features = Column(VARCHAR(128), nullable=False, doc=u'company features')
23 |     process_rate = Column(TINYINT, nullable=False, doc=u'resume processing rate')
24 |     introduce = Column(VARCHAR(constants.COMPANY_INTRODUCE_MAX_LEN), nullable=False, doc=u'company introduction')
25 |     advantage = Column(VARCHAR(constants.COMPANY_ADVANTAGE_MAX_LEN), nullable=False, doc=u'company advantages')
26 |     created_at = Column(TIMESTAMP, nullable=False, default=datetime.now, doc=u'created time')
27 |     updated_at = Column(TIMESTAMP, nullable=False, default=datetime.now,
28 |                         onupdate=datetime.now, doc=u'last updated time')
29 |
--------------------------------------------------------------------------------
/webspider/models/company_industry.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | from datetime import datetime
3 |
4 | from sqlalchemy import Column
5 | from sqlalchemy.dialects.mysql import INTEGER, TIMESTAMP
6 |
7 | from webspider.models.base import BaseModel
8 |
9 |
10 | class CompanyIndustryModel(BaseModel):
11 | __tablename__ = 'company_industry'
12 |
13 | id = Column(INTEGER, nullable=False, primary_key=True, autoincrement=True)
14 |     company_id = Column(INTEGER, nullable=False, doc=u'company id')
15 |     industry_id = Column(INTEGER, nullable=False, doc=u'industry id')
16 |     created_at = Column(TIMESTAMP, nullable=False, default=datetime.now, doc=u'created time')
17 |     updated_at = Column(TIMESTAMP, nullable=False, default=datetime.now, onupdate=datetime.now, doc=u'last updated time')
18 |
--------------------------------------------------------------------------------
/webspider/models/industry.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | from datetime import datetime
3 |
4 | from sqlalchemy import Column
5 | from sqlalchemy.dialects.mysql import INTEGER, VARCHAR, TIMESTAMP
6 |
7 | from webspider.models.base import BaseModel
8 |
9 |
10 | class IndustryModel(BaseModel):
11 | __tablename__ = 'industry'
12 |
13 | id = Column(INTEGER, nullable=False, primary_key=True, autoincrement=True)
14 |     name = Column(VARCHAR(64), nullable=False, doc=u'industry name')
15 |     created_at = Column(TIMESTAMP, nullable=False, default=datetime.now, doc=u'created time')
16 |     updated_at = Column(TIMESTAMP, nullable=False, default=datetime.now, onupdate=datetime.now, doc=u'last updated time')
17 |
--------------------------------------------------------------------------------
/webspider/models/job.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | from datetime import datetime
3 |
4 | from sqlalchemy import Column
5 | from sqlalchemy.dialects.mysql import INTEGER, VARCHAR, TINYINT, TIMESTAMP
6 |
7 | from webspider import constants
8 | from webspider.models.base import BaseModel
9 |
10 |
11 | class JobModel(BaseModel):
12 | __tablename__ = 'job'
13 |
14 | id = Column(INTEGER, nullable=False, primary_key=True, autoincrement=True)
15 |     lg_job_id = Column(INTEGER, nullable=False, doc=u'job id used by the API')
16 |     city_id = Column(INTEGER, nullable=False, doc=u'city id')
17 |     company_id = Column(INTEGER, nullable=False, doc=u'company id')
18 |     title = Column(VARCHAR(64), nullable=False, default='', doc=u'job title')
19 |     work_year = Column(TINYINT, nullable=False, doc=u'work experience requirement')
20 |     department = Column(VARCHAR(64), nullable=False, doc=u'hiring department')
21 |     salary = Column(VARCHAR(32), nullable=False, doc=u'salary')
22 |     education = Column(TINYINT, nullable=False, doc=u'education requirement')
23 |     nature = Column(TINYINT, nullable=False, doc=u'job nature')
24 |     description = Column(VARCHAR(constants.JOB_DESCRIPTION_MAX_LEN), nullable=False, doc=u'job description')
25 |     advantage = Column(VARCHAR(constants.JOB_ADVANTAGE_MAX_LEN), nullable=False, doc=u'job advantages')
26 |     created_at = Column(TIMESTAMP, nullable=False, default=datetime.now, doc=u'created time')
27 |     updated_at = Column(TIMESTAMP, nullable=False, default=datetime.now,
28 |                         onupdate=datetime.now, doc=u'last updated time')
29 |
--------------------------------------------------------------------------------
/webspider/models/job_keyword.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | from datetime import datetime
3 |
4 | from sqlalchemy import Column
5 | from sqlalchemy.dialects.mysql import INTEGER, TIMESTAMP
6 |
7 | from webspider.models.base import BaseModel
8 |
9 |
10 | class JobKeywordModel(BaseModel):
11 | __tablename__ = 'job_keyword'
12 |
13 | id = Column(INTEGER, nullable=False, primary_key=True, autoincrement=True)
14 |     job_id = Column(INTEGER, nullable=False, doc=u'job id')
15 |     keyword_id = Column(INTEGER, nullable=False, doc=u'keyword id')
16 |     created_at = Column(TIMESTAMP, nullable=False, default=datetime.now, doc=u'created time')
17 |     updated_at = Column(TIMESTAMP, nullable=False, default=datetime.now, onupdate=datetime.now, doc=u'last updated time')
18 |
--------------------------------------------------------------------------------
/webspider/models/jobs_count.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | from datetime import datetime
3 |
4 | from sqlalchemy import Column
5 | from sqlalchemy.dialects.mysql import INTEGER, TIMESTAMP
6 |
7 | from webspider.models.base import BaseModel
8 |
9 |
10 | class JobsCountModel(BaseModel):
11 | __tablename__ = 'jobs_count'
12 |
13 | id = Column(INTEGER, nullable=False, primary_key=True, autoincrement=True)
14 |     date = Column(INTEGER, nullable=False, doc=u'date')
15 |     keyword_id = Column(INTEGER, nullable=False, doc=u'keyword id')
16 |     all_city = Column(INTEGER, nullable=False, default=0, doc=u'jobs count nationwide')
17 |     beijing = Column(INTEGER, nullable=False, default=0, doc=u'jobs count in Beijing')
18 |     guangzhou = Column(INTEGER, nullable=False, default=0, doc=u'jobs count in Guangzhou')
19 |     shenzhen = Column(INTEGER, nullable=False, default=0, doc=u'jobs count in Shenzhen')
20 |     shanghai = Column(INTEGER, nullable=False, default=0, doc=u'jobs count in Shanghai')
21 |     hangzhou = Column(INTEGER, nullable=False, default=0, doc=u'jobs count in Hangzhou')
22 |     chengdu = Column(INTEGER, nullable=False, default=0, doc=u'jobs count in Chengdu')
23 |     created_at = Column(TIMESTAMP, nullable=False, default=datetime.now, doc=u'created time')
24 |     updated_at = Column(TIMESTAMP, nullable=False, default=datetime.now, onupdate=datetime.now, doc=u'last updated time')
25 |
--------------------------------------------------------------------------------
/webspider/models/keyword.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | from datetime import datetime
3 |
4 | from sqlalchemy import Column
5 | from sqlalchemy.dialects.mysql import INTEGER, VARCHAR, TIMESTAMP
6 |
7 | from webspider.models.base import BaseModel
8 |
9 |
10 | class KeywordModel(BaseModel):
11 | __tablename__ = 'keyword'
12 |
13 | id = Column(INTEGER, nullable=False, primary_key=True, autoincrement=True)
14 |     name = Column(VARCHAR(64), nullable=False, doc=u'keyword name')
15 |     created_at = Column(TIMESTAMP, nullable=False, default=datetime.now, doc=u'created time')
16 |     updated_at = Column(TIMESTAMP, nullable=False, default=datetime.now, onupdate=datetime.now, doc=u'last updated time')
17 |
--------------------------------------------------------------------------------
/webspider/models/keyword_statistic.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | from datetime import datetime
3 |
4 | from sqlalchemy import Column
5 | from sqlalchemy.dialects.mysql import INTEGER, VARCHAR, TIMESTAMP
6 |
7 | from webspider.models.base import BaseModel
8 | from webspider.models.jobs_count import JobsCountModel
9 |
10 |
11 | class KeywordStatisticModel(BaseModel):
12 | __tablename__ = 'keyword_statistic'
13 |
14 | id = Column(INTEGER, nullable=False, primary_key=True, autoincrement=True)
15 |     keyword_id = Column(INTEGER, nullable=False, doc=u'keyword id')
16 |     educations = Column(VARCHAR(2048), nullable=False, doc=u'education requirement statistics')
17 |     city_jobs_count = Column(VARCHAR(2048), nullable=False, doc=u'per-city jobs count statistics')
18 |     salary = Column(VARCHAR(2048), nullable=False, doc=u'salary distribution statistics')
19 |     financing_stage = Column(VARCHAR(2048), nullable=False, doc=u'finance stage statistics of hiring companies')
20 |     work_years = Column(VARCHAR(2048), nullable=False, doc=u'work experience requirement statistics')
21 |     created_at = Column(TIMESTAMP, nullable=False, default=datetime.now, doc=u'created time')
22 |     updated_at = Column(TIMESTAMP, nullable=False, default=datetime.now, onupdate=datetime.now, doc=u'last updated time')
23 |
24 | @property
25 | def per_day_jobs_count(self):
26 | return JobsCountModel.list(filter_by={'keyword_id': self.keyword_id}, order_by=JobsCountModel.date.asc())
27 |
--------------------------------------------------------------------------------
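The statistic columns above are plain VARCHAR(2048) fields holding JSON-encoded counters. A minimal round-trip sketch of that convention, with hypothetical numbers:

    import json
    from collections import Counter

    # what the update task writes into e.g. the `educations` column
    educations = Counter({'bachelor': 120, 'master': 30})
    stored = json.dumps(educations)

    # what the formatter / page handler recovers with json.loads
    assert json.loads(stored) == {'bachelor': 120, 'master': 30}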
/webspider/quickly_cmd.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # flake8: noqa
3 | import os
4 | import logging
5 |
6 | from tornado.options import options, define
7 |
8 | from webspider import constants
9 |
10 | logger = logging.getLogger(__name__)
11 |
12 |
13 | def run_web_app_by_gunicorn():
14 | define(name='port', default=8000, type=int, help='run on the given port')
15 |     logger.info(
16 |         '\n================ spider web server (requires gunicorn and gevent) is starting ================')
17 |     logger.info('\n server starts at port -> {}, debug mode = {} '.format(options.port,
18 |                                                                           constants.DEBUG))
19 |     os.system(
20 |         "env/bin/gunicorn 'webspider.web.app:make_wsgi_app()' -b 0.0.0.0:{port} -w 1 -k gevent".format(
21 |             port=options.port
22 |         )
23 |     )
24 |
25 |
26 | def run_celery_default_worker():
27 | os.system(
28 | u'env/bin/celery worker -A webspider.tasks.celery_app -Q default -n default_worker --loglevel=debug')
29 |
30 |
31 | def run_celery_lg_data_worker():
32 | os.system(
33 | u'env/bin/celery worker -A webspider.tasks.celery_app -Q lg_data -n lg_data_worker --loglevel=debug')
34 |
35 |
36 | def run_celery_lg_jobs_data_worker():
37 | os.system(
38 | u'env/bin/celery worker -A webspider.tasks.celery_app -Q lg_jobs_data -n lg_jobs_data_worker --loglevel=debug')
39 |
40 |
41 | def run_celery_lg_jobs_count_worker():
42 | os.system(
43 | u'env/bin/celery worker -A webspider.tasks.celery_app -Q lg_jobs_count -n lg_jobs_count_worker --loglevel=debug ')
44 |
45 |
46 | def run_celery_beat():
47 | os.system(u'env/bin/celery -A webspider.tasks.celery_app beat --loglevel=debug')
48 |
49 |
50 | def run_celery_flower():
51 | os.system(u'env/bin/celery flower --broker=redis://localhost:6379/0 --broker_api=redis://localhost:6379/0')
52 |
--------------------------------------------------------------------------------
/webspider/setting.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | import os
3 |
4 | BASE_DIR = os.path.dirname(os.path.abspath(__file__))
5 |
6 | # smtp
7 | SMTP_HOST = os.environ.get('SMTP_HOST')
8 | SMTP_PORT = os.environ.get('SMTP_PORT')
9 |
10 | # email
11 | MAIL_USER_NAME = os.environ.get('MAIL_USER_NAME')
12 | MAIL_USER_PASSWORD = os.environ.get('MAIL_USER_PASSWORD')
13 | FROM_EMAIL_ADDRESS = os.environ.get('FROM_EMAIL_ADDRESS')
14 | TO_EMAIL_ADDRESS = os.environ.get('TO_EMAIL_ADDRESS')
15 |
16 | # MYSQL
17 | MYSQL_USERNAME = os.environ.get('MYSQL_USERNAME', 'root')
18 | MYSQL_PASSWORD = os.environ.get('MYSQL_PASSWORD', '')
19 | DB_HOST = os.environ.get('DB_HOST', 'localhost')
20 | DB_PORT = os.environ.get('DB_PORT', '3306')
21 | DB_NAME = os.environ.get('DB_NAME', 'spider')
22 | DB_CONNECT_STRING_FORMAT = 'mysql+mysqldb://{username}:{password}@{db_host}:{db_port}/{db_name}?charset=utf8mb4'
23 |
24 | # REDIS
25 | REDIS_HOST = os.environ.get('REDIS_HOST', 'localhost')
26 | REDIS_PORT = os.environ.get('REDIS_PORT', '6379')
27 |
28 | # MySQL configuration
29 | MYSQL_CONF = {
30 | 'connect_string': DB_CONNECT_STRING_FORMAT.format(
31 | username=MYSQL_USERNAME,
32 | password=MYSQL_PASSWORD,
33 | db_host=DB_HOST,
34 | db_port=DB_PORT,
35 | db_name=DB_NAME
36 | ),
37 | 'host': DB_HOST,
38 | 'port': DB_PORT,
39 | 'username': MYSQL_USERNAME,
40 | 'password': MYSQL_PASSWORD,
41 | }
42 |
43 | SMTP_CONF = {
44 | 'host': SMTP_HOST,
45 | 'port': SMTP_PORT,
46 | 'from_email': FROM_EMAIL_ADDRESS,
47 | 'to_email': TO_EMAIL_ADDRESS,
48 | }
49 |
50 | MAIL_CONF = {
51 | 'username': MAIL_USER_NAME,
52 | 'password': MAIL_USER_PASSWORD,
53 | }
54 |
55 | REDIS_CONF = {
56 | 'host': REDIS_HOST,
57 | 'port': REDIS_PORT
58 | }
59 |
--------------------------------------------------------------------------------
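Every connection setting above is read from the environment with a local-development default. A sketch of overriding them, assuming the variables are set before webspider.setting is first imported:

    import os

    # hypothetical credentials - must be in the environment before the import below
    os.environ['MYSQL_USERNAME'] = 'spider'
    os.environ['MYSQL_PASSWORD'] = 'secret'

    from webspider import setting
    print(setting.MYSQL_CONF['connect_string'])
    # mysql+mysqldb://spider:secret@localhost:3306/spider?charset=utf8mb4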
/webspider/tasks/__init__.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 |
--------------------------------------------------------------------------------
/webspider/tasks/actor/__init__.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 |
--------------------------------------------------------------------------------
/webspider/tasks/actor/keyword_statistic.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | import logging
3 | import json
4 |
5 | from webspider.tasks.celery_app import celery_app
6 | from webspider.controllers import keyword_statistic_ctl
7 | from webspider.models import (KeywordModel, JobModel, JobKeywordModel, KeywordStatisticModel)
8 |
9 | logger = logging.getLogger(__name__)
10 |
11 |
12 | @celery_app.task()
13 | def update_keywords_statistic_task():
14 | """更新关键词统计任务"""
15 | keywords = KeywordModel.list()
16 | for keyword in keywords:
17 | update_single_keyword_statistic_task.delay(keyword.id)
18 |
19 |
20 | @celery_app.task()
21 | def update_single_keyword_statistic_task(keyword_id):
22 | """更新关键词统计任务"""
23 |
24 | job_keywords = JobKeywordModel.list(filter_by={'keyword_id': keyword_id})
25 | jobs = JobModel.list(filter=(JobModel.id.in_([job_keyword.job_id for job_keyword in job_keywords])))
26 | if not jobs:
27 | return
28 |
29 | educations_statistic = keyword_statistic_ctl.get_educations_statistic(jobs=jobs)
30 | finance_stage_statistic = keyword_statistic_ctl.get_finance_stage_statistic(jobs=jobs)
31 | city_jobs_count_statistic = keyword_statistic_ctl.get_city_jobs_count_statistic(jobs=jobs)
32 | salary_statistic = keyword_statistic_ctl.get_salary_statistic(jobs=jobs)
33 | work_years_statistic = keyword_statistic_ctl.get_work_years_statistic(jobs=jobs)
34 |
35 | statistic_values = dict(
36 | keyword_id=keyword_id,
37 | educations=json.dumps(educations_statistic),
38 | city_jobs_count=json.dumps(city_jobs_count_statistic),
39 | salary=json.dumps(salary_statistic),
40 | financing_stage=json.dumps(finance_stage_statistic),
41 | work_years=json.dumps(work_years_statistic)
42 | )
43 |
44 | if KeywordStatisticModel.is_exist(filter_by={'keyword_id': keyword_id}):
45 | KeywordStatisticModel.update(filter_by={'keyword_id': keyword_id}, values=statistic_values)
46 | else:
47 | KeywordStatisticModel.add(**statistic_values)
48 |
--------------------------------------------------------------------------------
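A usage sketch for the two tasks above; the fan-out call assumes the Redis broker and a worker are running, while the direct call hits the database synchronously:

    from webspider.tasks.actor.keyword_statistic import (
        update_keywords_statistic_task,
        update_single_keyword_statistic_task,
    )

    # fan out one sub-task per keyword through the broker
    update_keywords_statistic_task.delay()

    # or refresh a single keyword synchronously while debugging (hypothetical id)
    update_single_keyword_statistic_task(keyword_id=1)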
/webspider/tasks/actor/lagou_data.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | import logging
3 |
4 | from webspider import utils
5 | from webspider import crawlers
6 | from webspider import constants
7 | from webspider.utils.cache import redis_instance
8 | from webspider.tasks.celery_app import celery_app
9 | from webspider.controllers import industry_ctl, keyword_ctl, city_ctl
10 | from webspider.models import (CityModel, CompanyModel,
11 | CompanyIndustryModel, JobModel, JobKeywordModel)
12 |
13 | logger = logging.getLogger(__name__)
14 |
15 |
16 | @celery_app.task()
17 | def crawl_lg_data_task():
18 | """爬取数据任务"""
19 |
20 | # 清除抓取记录
21 | keys = redis_instance.keys('crawled_company_jobs*')
22 | if keys:
23 | redis_instance.delete(*keys)
24 |
25 | crawl_lg_city_data_task.delay()
26 |     # only these cities are crawled for now - nationwide: 0, Beijing: 2, Shanghai: 3, Hangzhou: 6, Shenzhen: 215, Guangzhou: 213, Chengdu: 252
27 | lg_city_ids = [2, 3, 6, 215, 213, 252]
28 | lg_finance_stage_ids = [1, 2, 3, 4, 5, 6, 7, 8]
29 | lg_industry_ids = [24, 25, 33, 27, 29, 45, 31, 28,
30 | 47, 34, 35, 43, 32, 41, 26, 48, 38, 49, 10594]
31 |     # crawl company data
32 | for industry_id in lg_industry_ids:
33 | for city_id in lg_city_ids:
34 | for finance_stage_id in lg_finance_stage_ids:
35 | crawl_lg_company_data_task.delay(city_id=city_id, finance_stage_id=finance_stage_id,
36 | industry_id=industry_id)
37 |
38 |
39 | @celery_app.task()
40 | def crawl_lg_city_data_task():
41 | """爬取城市数据任务"""
42 | city_dicts = crawlers.get_cites_from_lg()
43 | for city_dict in city_dicts:
44 | if CityModel.is_exist(filter_by={'id': city_dict.id}):
45 | CityModel.update_by_pk(pk=city_dict.id, values=city_dict)
46 | else:
47 | CityModel.add(**city_dict)
48 |
49 |
50 | @celery_app.task()
51 | def crawl_lg_company_data_task(city_id, finance_stage_id, industry_id):
52 | """爬取公司数据任务"""
53 | companies_pagination = crawlers.get_companies_pagination_from_lg(city_id=city_id,
54 | finance_stage_id=finance_stage_id,
55 | industry_id=industry_id)
56 | for page_no in companies_pagination.iter_pages:
57 | company_dicts = crawlers.get_companies_from_lg(city_id=city_id,
58 | finance_stage_id=finance_stage_id,
59 | industry_id=industry_id,
60 | page_no=page_no)
61 | if not company_dicts:
62 | break
63 | for company_dict in company_dicts:
64 | crawlers.clean_lg_company_data(company_dict)
65 | utils.convert.convert_dict_field_to_constants(company_dict)
66 |
67 | industries = company_dict.pop('industries')
68 | city_name = company_dict.pop('city_name')
69 |
70 | city_ctl.insert_city_if_not_exist(city_name)
71 | company_dict['city_id'] = city_ctl.get_city_id_by_name(city_name)
72 |
73 |             company = CompanyModel.get_one(
74 |                 filter_by={'lg_company_id': company_dict.lg_company_id})
75 |             if company:
76 |                 CompanyModel.update_by_pk(pk=company.id, values=company_dict)
77 |             # keep the primary key for the industry links below
78 |             company_id = company.id if company else CompanyModel.add(**company_dict)
79 |
80 | for industry in industries:
81 | industry_ctl.insert_industry_if_not_exist(name=industry)
82 | industry_id = industry_ctl.get_industry_id_by_name(name=industry)
83 | CompanyIndustryModel.add(industry_id=industry_id, company_id=company_id)
84 |
85 | crawl_lg_job_data_task.delay(company_dict.lg_company_id)
86 |
87 |
88 | @celery_app.task()
89 | def crawl_lg_job_data_task(lg_company_id):
90 | """爬取职位数据任务"""
91 | # 过滤本轮已经爬取过职位的公司
92 | if not redis_instance.setnx(constants.CRAWLED_COMPANY_JOBS_REDIS_KEY.format(lg_company_id=lg_company_id), 1):
93 | return
94 | jobs_pagination = crawlers.get_jobs_pagination_from_lg(lg_company_id=lg_company_id,
95 | job_type=constants.LGJobType.technology)
96 | for page_no in jobs_pagination.iter_pages:
97 | job_dicts = crawlers.get_jobs_from_lg(lg_company_id=lg_company_id,
98 | job_type=constants.LGJobType.technology,
99 | page_no=page_no)
100 | if not job_dicts:
101 | break
102 | for job_dict in job_dicts:
103 | crawlers.clean_lg_job_data(job_dict)
104 | utils.convert.convert_dict_field_to_constants(job_dict)
105 |
106 | keywords = job_dict.pop('keywords')
107 | city_name = job_dict.pop('city_name')
108 |
109 | city_ctl.insert_city_if_not_exist(city_name)
110 | job_dict['city_id'] = city_ctl.get_city_id_by_name(city_name)
111 | company = CompanyModel.get_one(filter_by={'lg_company_id': lg_company_id})
112 | job_dict['company_id'] = company.id
113 |
114 |             job = JobModel.get_one(filter_by={'lg_job_id': job_dict.lg_job_id})
115 |             if job:
116 |                 JobModel.update_by_pk(pk=job.id, values=job_dict)
117 |             # keep the primary key for the keyword links below
118 |             job_id = job.id if job else JobModel.add(**job_dict)
119 |
120 | for keyword in keywords:
121 | keyword_ctl.insert_keyword_if_not_exist(name=keyword)
122 | keyword_id = keyword_ctl.get_keyword_id_by_name(name=keyword)
123 | JobKeywordModel.add(keyword_id=keyword_id, job_id=job_id)
124 |
--------------------------------------------------------------------------------
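The per-round dedupe in crawl_lg_job_data_task hinges on SETNX: only the first caller to set the key proceeds. A self-contained sketch, assuming a local Redis on the default port and a hypothetical key:

    import redis

    r = redis.Redis()
    key = 'crawled_company_jobs:42'   # hypothetical company id
    if r.setnx(key, 1):
        print('first visit this round - crawl the jobs')
    else:
        print('already crawled this round - skip')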
/webspider/tasks/actor/lagou_jobs_count.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | import logging
3 | from datetime import datetime
4 |
5 | from webspider import crawlers
6 | from webspider.tasks.celery_app import celery_app
7 | from webspider.controllers import keyword_ctl, job_keyword_ctl
8 | from webspider.models import JobsCountModel
9 |
10 | logger = logging.getLogger(__name__)
11 |
12 |
13 | @celery_app.task()
14 | def crawl_lg_jobs_count_task():
15 | keyword_ids = job_keyword_ctl.get_most_frequently_keyword_ids(limit=1000)
16 | for keyword_id in keyword_ids:
17 | crawl_lg_keyword_jobs_count_task.delay(keyword_id)
18 |
19 |
20 | @celery_app.task()
21 | def crawl_lg_keyword_jobs_count_task(keyword_id):
22 | cities_name_map = {
23 | 'all_city': u'全国',
24 | 'beijing': u'北京',
25 | 'shanghai': u'上海',
26 | 'guangzhou': u'广州',
27 | 'shenzhen': u'深圳',
28 | 'hangzhou': u'杭州',
29 | 'chengdu': u'成都',
30 | }
31 | keyword_name = keyword_ctl.get_keyword_name_by_id(keyword_id)
32 | jobs_count_dict = dict(keyword_id=keyword_id)
33 | for city_name_key, city_name in cities_name_map.items():
34 | jobs_count_dict[city_name_key] = crawlers.get_jobs_count_from_lg(city_name=city_name,
35 | keyword_name=keyword_name)
36 | jobs_count_dict['date'] = int(datetime.today().strftime('%Y%m%d'))
37 |
38 | JobsCountModel.add(**jobs_count_dict)
39 |
--------------------------------------------------------------------------------
/webspider/tasks/celery_app.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 |
3 | from celery import Celery
4 |
5 | celery_app = Celery('tasks')
6 | celery_app.config_from_object('webspider.tasks.celery_config')
7 |
--------------------------------------------------------------------------------
/webspider/tasks/celery_config.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | from kombu import Queue
3 | from kombu import Exchange
4 |
5 | from celery.schedules import crontab
6 |
7 | BROKER_URL = 'redis://127.0.0.1:6379'  # message broker
8 | 
9 | CELERY_RESULT_BACKEND = 'redis://127.0.0.1:6379/0'  # result backend
10 | 
11 | CELERY_CREATE_MISSING_QUEUES = True  # create any queue a task names that the broker does not know yet
12 | 
13 | CELERY_TIMEZONE = 'Asia/Shanghai'  # timezone; the default is UTC
14 | 
15 | CELERYD_CONCURRENCY = 2  # number of concurrent worker processes
16 | 
17 | CELERY_ENABLE_UTC = False
18 | 
19 | CELERYD_FORCE_EXECV = True  # exec fresh worker processes to avoid deadlocks
20 | 
21 | CELERY_TASK_SERIALIZER = 'json'  # task (de)serialization format
22 | 
23 | CELERY_RESULT_SERIALIZER = 'json'  # results are read rarely, so favour readable JSON
24 | 
25 | CELERY_IGNORE_RESULT = True  # do not store task results
26 | 
27 | # CELERY_TASK_RESULT_EXPIRES = 60 * 60 * 1  # task result expiry time
28 | 
29 | CELERY_IMPORTS = (  # task modules to import
30 |     'webspider.tasks.actor.lagou_data',
31 |     'webspider.tasks.actor.lagou_jobs_count',
32 |     'webspider.tasks.actor.keyword_statistic',
33 | )
34 | 
35 | CELERY_TASK_PUBLISH_RETRY = False  # do not retry publishing task messages
36 |
37 | CELERYBEAT_SCHEDULE = {
38 | 'crawl_lg_jobs_count_task': {
39 |         'task': 'webspider.tasks.actor.lagou_jobs_count.crawl_lg_jobs_count_task',
40 | 'schedule': crontab(hour='01', minute='01', day_of_week='2, 5'),
41 | },
42 | 'crawl_lg_data_task': {
43 |         'task': 'webspider.tasks.actor.lagou_data.crawl_lg_data_task',
44 | 'schedule': crontab(hour='01', minute='01', day_of_month='1'),
45 | },
46 | 'update_keyword_statistic': {
47 | 'task': 'webspider.tasks.actor.keyword_statistic.update_keywords_statistic_task',
48 | 'schedule': crontab(hour='01', minute='01', day_of_week='1, 4'),
49 | },
50 | }
51 |
52 | default_exchange = Exchange('default', type='direct')
53 | lg_exchange = Exchange('lg', type='direct')
54 |
55 | CELERY_QUEUES = (
56 | Queue(name='default', exchange=default_exchange, routing_key='default'),
57 | Queue(name='lg_data', exchange=lg_exchange, routing_key='for_lg_data'),
58 | Queue(name='lg_jobs_data', exchange=lg_exchange, routing_key='for_lg_jobs_data'),
59 | Queue(name='lg_jobs_count', exchange=lg_exchange, routing_key='for_lg_jobs_count'),
60 | )
61 |
62 | CELERY_ROUTES = {
63 |     'webspider.tasks.actor.lagou_data.crawl_lg_job_data_task': {'exchange': 'lg',
64 |                                                                 'routing_key': 'for_lg_jobs_data'},
65 |     'webspider.tasks.actor.lagou_jobs_count.*': {'exchange': 'lg', 'routing_key': 'for_lg_jobs_count'},
66 |     'webspider.tasks.actor.lagou_data.*': {'exchange': 'lg', 'routing_key': 'for_lg_data'},
67 | 'webspider.tasks.actor.keyword_statistic.*': {'exchange': 'default', 'routing_key': 'default'}
68 | }
69 |
--------------------------------------------------------------------------------
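With the queues and routes above, a task can also be pushed onto a specific queue by name. A sketch, assuming the broker is reachable:

    from webspider.tasks.celery_app import celery_app

    # enqueue the jobs-count crawl onto its dedicated queue
    celery_app.send_task(
        'webspider.tasks.actor.lagou_jobs_count.crawl_lg_jobs_count_task',
        queue='lg_jobs_count',
    )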
/webspider/utils/__init__.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | from webspider.utils import cache
3 | from webspider.utils import classproperty
4 | from webspider.utils import common
5 | from webspider.utils import convert
6 | from webspider.utils import http_tools
7 | from webspider.utils import log
8 | from webspider.utils import pagination
9 | from webspider.utils import sql
10 | from webspider.utils import text
11 | from webspider.utils import time_tools
12 |
13 | __all__ = ['cache', 'classproperty', 'common', 'convert', 'http_tools', 'log', 'pagination', 'sql', 'text',
14 | 'time_tools']
15 |
--------------------------------------------------------------------------------
/webspider/utils/cache.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | import logging
3 | import pickle
4 | from functools import wraps
5 |
6 | import redis
7 |
8 | from webspider import setting
9 |
10 | redis_pool = redis.ConnectionPool(host=setting.REDIS_CONF['host'],
11 | port=setting.REDIS_CONF['port'])
12 | redis_instance = redis.Redis(connection_pool=redis_pool)
13 |
14 |
15 | def simple_cache(ex=None):
16 | """利用 redis 进行缓存,暂不支持 kwargs 类型的参数传入方式"""
17 |
18 | def decorator(func):
19 | @wraps(func)
20 | def wrapper(*args, **kwargs):
21 | if kwargs:
22 | raise ValueError(
23 | "args key generator does not accept kwargs arguments")
24 | redis_key = func.__name__ + '(' + ','.join(map(str, args)) + ')'
25 | result = redis_instance.get(redis_key)
26 | if result:
27 | logging.debug('cache: get func result from redis key - {}'.format(redis_key))
28 | result = pickle.loads(result)
29 | else:
30 | logging.debug('cache: get func result from func key - {}'.format(redis_key))
31 | result = func(*args)
32 | redis_instance.set(name=redis_key, value=pickle.dumps(result), ex=ex)
33 | return result
34 |
35 | return wrapper
36 |
37 | return decorator
38 |
39 |
40 | def cache_clear(func, *args):
41 | """失效缓存"""
42 | redis_key = func.__name__
43 | if args:
44 | redis_key += ('(' + ','.join(map(str, args)) + ')')
45 | logging.info('remove cache redis-key: {}'.format(redis_key))
46 | keys = redis_instance.keys('*' + redis_key + '*')
47 | if keys:
48 | remove_count = redis_instance.delete(*keys)
49 | logging.info('cache clear count {}'.format(remove_count))
50 | return remove_count
51 |
--------------------------------------------------------------------------------
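A usage sketch for the decorator and the invalidation helper, assuming a reachable Redis:

    from webspider.utils.cache import simple_cache, cache_clear

    @simple_cache(ex=60)                # cache results for 60 seconds
    def slow_square(n):
        return n * n

    slow_square(4)                      # computed, stored under key 'slow_square(4)'
    slow_square(4)                      # served from Redis until the key expires
    cache_clear(slow_square, 4)         # evict the entry early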
/webspider/utils/classproperty.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 |
3 |
4 | class ClassPropertyDescriptor(object):
5 | """类属性"""
6 | def __init__(self, fget, fset=None):
7 | self.fget = fget
8 | self.fset = fset
9 |
10 | def __get__(self, obj, obj_type=None):
11 | if obj_type is None:
12 | obj_type = type(obj)
13 | return self.fget.__get__(obj, obj_type)()
14 |
15 | def __set__(self, obj, value):
16 | raise AttributeError("can't set attribute")
17 |
18 |
19 | def classproperty(func):
20 | if not isinstance(func, (classmethod, staticmethod)):
21 | func = classmethod(func)
22 |
23 | return ClassPropertyDescriptor(func)
24 |
--------------------------------------------------------------------------------
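A usage sketch for the descriptor; reads work at class level, though assigning through the class still rebinds the attribute (a known limitation of descriptors without a metaclass):

    from webspider.utils.classproperty import classproperty

    class Config(object):
        _name = 'spider'

        @classproperty
        def name(cls):
            return cls._name

    print(Config.name)    # 'spider' - no instance required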
/webspider/utils/common.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | import logging
3 | from collections import Counter
4 |
5 | logger = logging.getLogger(__name__)
6 |
7 |
8 | def get_key_from_dict_by_value(value, dictionary):
9 | keys = [_key for (_key, _value) in dictionary.items() if _value == value]
10 | if not keys:
11 | raise ValueError(u'can not get key from dict by value {}'.format(value))
12 | if len(keys) > 1:
13 | raise AttributeError(u'get multi keys from dict by value {}'.format(value))
14 | return keys[0]
15 |
16 |
17 | def get_field_statistics(values, constants_dict):
18 | """
19 |     Get the statistics for a batch of field values
20 |     eg:
21 |     >>> get_field_statistics([0, 0, 0, 1, 1], {'male': 0, 'female': 1})
22 |     Counter({'male': 3, 'female': 2})
23 |
24 | :param values: list[int], field values list
25 | :param constants_dict: Dict
26 | :return: collections.Counter
27 | """
28 | statistics_counter = Counter()
29 | for value in values:
30 | field_name = get_key_from_dict_by_value(value=value, dictionary=constants_dict)
31 | statistics_counter[field_name] += 1
32 | return statistics_counter
33 |
--------------------------------------------------------------------------------
/webspider/utils/convert.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | import logging
3 |
4 | from webspider import constants
5 |
6 | logger = logging.getLogger(__name__)
7 |
8 | CONSTANTS_MAP = {
9 | 'finance_stage': constants.FINANCE_STAGE_DICT,
10 | 'nature': constants.JOB_NATURE_DICT,
11 | 'work_year': constants.WORK_YEARS_REQUEST_DICT,
12 | 'education': constants.EDUCATION_REQUEST_DICT,
13 | 'size': constants.COMPANY_SIZE_DICT,
14 | }
15 |
16 |
17 | def convert_dict_field_to_constants(to_converted_dict, constants_map=CONSTANTS_MAP):
18 | """
19 |     Convert the fields of a dict to the matching constants, in place
20 |     :param to_converted_dict: the dict to convert
21 |     :param constants_map: field-to-constants mapping
22 |     :return: None - the dict is mutated in place
23 | """
24 | for field_name, field_value in to_converted_dict.items():
25 | if field_name in constants_map:
26 | to_converted_dict[field_name] = convert_field_to_constants(field_name, field_value, constants_map)
27 |
28 |
29 | def convert_field_to_constants(field_name, field_value, constants_map=CONSTANTS_MAP):
30 | """
31 |     Convert a field value to the matching constant; return the 'unknown' constant or -1 when no match exists
32 |
33 | eg:
34 | convert_field_to_constants(field_name='size', field_value='2000人以上', constants_map={'size': {'2000人以上': 1}})
35 | return: 1
36 |     :param field_name: field name
37 |     :param field_value: field value
38 |     :param constants_map: field-to-constants mapping
39 | :rtype: int
40 | """
41 | if field_name not in constants_map:
42 |         raise ValueError(u'can not find the field in constants_map, field name is {}'.format(field_name))
43 |
44 | field_constant_map = constants_map[field_name]
45 |
46 | if field_value in field_constant_map:
47 | return field_constant_map[field_value]
48 | else:
49 | logger.error('error {field_name}, value is {field_value}'.format(field_name=field_name,
50 | field_value=field_value))
51 | return field_constant_map['unknown'] if 'unknown' in field_constant_map else -1
52 |
--------------------------------------------------------------------------------
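A usage sketch for the in-place conversion, passing a one-off constants_map so the example does not depend on webspider.constants:

    from webspider.utils import convert

    record = {'size': u'2000人以上', 'name': 'ACME'}   # hypothetical crawled record
    convert.convert_dict_field_to_constants(record, constants_map={'size': {u'2000人以上': 1}})
    print(record)    # {'size': 1, 'name': 'ACME'}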
/webspider/utils/http_tools.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | import time
3 | import random
4 |
5 | import requests
6 | from retrying import retry
7 |
8 | from webspider import constants
9 |
10 |
11 | def generate_http_request_headers(referer=None):
12 | """构造 HTTP 请求头"""
13 | header = constants.HTTP_HEADER
14 | header['User-Agent'] = random.choice(constants.USER_AGENT_LIST)
15 | if referer:
16 | header['Referer'] = referer
17 | return header
18 |
19 |
20 | @retry(stop_max_attempt_number=constants.RETRY_TIMES, stop_max_delay=constants.STOP_MAX_DELAY,
21 | wait_fixed=constants.WAIT_FIXED)
22 | def requests_get(url, params=None, headers=None, allow_redirects=False, timeout=constants.REQUEST_TIMEOUT,
23 | need_sleep=True, **kwargs):
24 | if need_sleep:
25 | time.sleep(random.randint(constants.MIN_SLEEP_SECS, constants.MAX_SLEEP_SECS))
26 | if not headers:
27 | headers = generate_http_request_headers()
28 | return requests.get(url=url, params=params, headers=headers, allow_redirects=allow_redirects,
29 | timeout=timeout, **kwargs)
30 |
31 |
32 | @retry(stop_max_attempt_number=constants.RETRY_TIMES, stop_max_delay=constants.STOP_MAX_DELAY,
33 | wait_fixed=constants.WAIT_FIXED)
34 | def requests_post(url, data=None, params=None, headers=None, allow_redirects=False, timeout=constants.REQUEST_TIMEOUT,
35 | need_sleep=True, **kwargs):
36 | if need_sleep:
37 | time.sleep(random.randint(constants.MIN_SLEEP_SECS, constants.MAX_SLEEP_SECS))
38 | if not headers:
39 | headers = generate_http_request_headers()
40 | return requests.post(url=url, data=data, params=params, headers=headers, allow_redirects=allow_redirects,
41 | timeout=timeout, **kwargs)
42 |
--------------------------------------------------------------------------------
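A usage sketch; this performs a real request and retries per the constants above (hypothetical URL):

    from webspider.utils.http_tools import requests_get

    response = requests_get('https://httpbin.org/get', params={'q': 'python'}, need_sleep=False)
    print(response.status_code)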
/webspider/utils/log.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 | import os
3 | import logging.config
4 |
5 | from webspider import setting
6 |
7 | LOG_FILE_PATH = os.path.join(setting.BASE_DIR, 'log', 'spider_log.txt')
8 |
9 | LOGGING_CONFIG = {
10 | 'version': 1,
11 | 'disable_existing_loggers': True,
12 |
13 | 'formatters': {
14 | 'default': {
15 | 'format': '%(asctime)s- %(module)s:%(lineno)d [%(levelname)1.1s] %(name)s: %(message)s',
16 | 'datefmt': '%Y/%m/%d %H:%M:%S'
17 | },
18 | },
19 |
20 | 'handlers': {
21 | 'console': {
22 | 'level': 'DEBUG',
23 | 'formatter': 'default',
24 | 'class': 'logging.StreamHandler'
25 | },
26 | 'smtp': {
27 | 'level': 'ERROR',
28 | 'class': 'logging.handlers.SMTPHandler',
29 | 'formatter': 'default',
30 | 'mailhost': (setting.SMTP_CONF['host'], setting.SMTP_CONF['port']),
31 | 'fromaddr': setting.SMTP_CONF['from_email'],
32 | 'toaddrs': [setting.SMTP_CONF['to_email'], ],
33 | 'subject': '爬虫系统出现异常',
34 | 'credentials': (setting.MAIL_CONF['username'], setting.MAIL_CONF['password'])
35 | },
36 | 'file': {
37 | 'level': 'ERROR',
38 | 'formatter': 'default',
39 | 'class': 'logging.handlers.RotatingFileHandler',
40 | 'filename': LOG_FILE_PATH,
41 | 'encoding': 'utf8'
42 | },
43 | },
44 |
45 | 'loggers': {
46 | '': {
47 | 'handlers': ['console', 'file'],
48 | 'level': 'DEBUG',
49 | 'propagate': False,
50 | },
51 | 'webspider': {
52 | 'handlers': ['console', 'file'],
53 | 'level': 'DEBUG',
54 | 'propagate': False,
55 | },
56 | 'tornado': {
57 | 'handlers': ['console', 'file'],
58 | 'level': 'DEBUG',
59 | 'propagate': False,
60 | },
61 | 'tornado.access': {
62 | 'handlers': ['console', 'file'],
63 | 'level': 'INFO',
64 | 'propagate': False,
65 | },
66 | 'tornado.application': {
67 | 'handlers': ['console', 'file'],
68 | 'level': 'INFO',
69 | 'propagate': False,
70 | },
71 | 'tornado.general': {
72 | 'handlers': ['console', 'file'],
73 | 'propagate': False,
74 | 'level': 'INFO',
75 | },
76 | 'sqlalchemy.engine': {
77 | 'handlers': ['console', 'file'],
78 | 'level': 'INFO',
79 | 'propagate': False,
80 | },
81 | 'gunicorn': {
82 | 'handlers': ['console', 'file'],
83 | 'level': 'INFO',
84 | 'propagate': False,
85 | },
86 | 'celery': {
87 | 'handlers': ['console', 'file'],
88 | 'level': 'DEBUG',
89 | 'propagate': False,
90 | },
91 | },
92 | }
93 |
94 |
95 | def config_logging():
96 | """配置日志"""
97 | logging.config.dictConfig(LOGGING_CONFIG)
98 |
--------------------------------------------------------------------------------
/webspider/utils/pagination.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | from math import ceil
3 |
4 |
5 | class Pagination(object):
6 | """分页"""
7 | def __init__(self, page=1, per_page=10, total=0):
8 | self.page = page
9 | self.per_page = per_page
10 | self.total = total
11 |
12 | @property
13 | def pages(self):
14 | if self.per_page == 0:
15 | pages = 0
16 | else:
17 | pages = int(ceil(self.total / float(self.per_page)))
18 | return pages
19 |
20 | @property
21 | def prev_num(self):
22 | if not self.has_prev:
23 | return None
24 | return self.page - 1
25 |
26 | @property
27 | def has_prev(self):
28 | return self.page > 1
29 |
30 | @property
31 | def has_next(self):
32 | return self.page < self.pages
33 |
34 | @property
35 | def next_num(self):
36 | if not self.has_next:
37 | return None
38 | return self.page + 1
39 |
40 | @property
41 | def iter_pages(self):
42 | for num in range(1, self.pages + 1):
43 | yield num
44 |
--------------------------------------------------------------------------------
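A usage sketch: 23 items at 10 per page yields 3 pages.

    from webspider.utils.pagination import Pagination

    p = Pagination(page=2, per_page=10, total=23)
    print(p.pages)                   # 3
    print(p.has_prev, p.prev_num)    # True 1
    print(p.has_next, p.next_num)    # True 3
    print(list(p.iter_pages))        # [1, 2, 3]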
/webspider/utils/sql.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 | import logging
3 |
4 | from sqlalchemy import create_engine
5 | from sqlalchemy.orm import sessionmaker, scoped_session
6 |
7 | from webspider import setting
8 | from webspider import constants
9 |
10 | __all__ = ['get_session', 'remove_sessions', 'db_engine']
11 |
12 | logger = logging.getLogger(__name__)
13 |
14 | db_engine = create_engine(
15 | setting.MYSQL_CONF['connect_string'],
16 | echo=constants.DEBUG, max_overflow=48,
17 | pool_timeout=0, pool_recycle=3600,
18 | logging_name='sql')
19 |
20 | _session = scoped_session(sessionmaker(bind=db_engine, autocommit=True, autoflush=True))
21 |
22 |
23 | def get_session():
24 | return _session
25 |
26 |
27 | def remove_sessions():
28 | _session.remove()
29 |
--------------------------------------------------------------------------------
/webspider/utils/text.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | import re
3 |
4 |
5 | def to_plaintext(content, pattern=r'<br>|\n', strip=True):
6 | """
7 |     Strip content matching pattern from the text
8 |     :param content: the text to clean
9 |     :param pattern: regular expression for the content to remove
10 |     :param strip: whether to strip leading and trailing whitespace
11 | :return:
12 | """
13 | plaintext = re.sub(pattern=pattern, repl='', string=content)
14 | if strip:
15 | plaintext = plaintext.strip()
16 | return plaintext
17 |
--------------------------------------------------------------------------------
/webspider/utils/time_tools.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 | import time
3 | import datetime
4 |
5 |
6 | def datetime_to_timestamp(datetime_obj):
7 | return int(time.mktime(datetime_obj.timetuple()))
8 |
9 |
10 | def timestamp_to_datetime(timestamp):
11 | return datetime.datetime.fromtimestamp(timestamp)
12 |
13 |
14 | def timestamp_to_datetime_str(ts, time_format=None):
15 | """
16 |     Convert a timestamp to a date string (1476547200 -> '2016-10-16')
17 |     :param ts: timestamp
18 |     :param time_format: date format string
19 |     :return: date string
20 | """
21 | if time_format is None or time_format == '':
22 | time_format = '%Y-%m-%d'
23 | ts = time.localtime(float(ts))
24 | return time.strftime(time_format, ts)
25 |
--------------------------------------------------------------------------------
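A usage sketch for the converters above:

    from datetime import datetime
    from webspider.utils import time_tools

    ts = time_tools.datetime_to_timestamp(datetime(2016, 10, 16))
    print(time_tools.timestamp_to_datetime_str(ts))              # 2016-10-16
    print(time_tools.timestamp_to_datetime_str(ts, '%Y%m%d'))    # 20161016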
/webspider/web/__init__.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 |
--------------------------------------------------------------------------------
/webspider/web/app.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | # flake8: noqa
4 |
5 | import os
6 | import logging.config
7 |
8 | import tornado
9 | import tornado.web
10 | import tornado.ioloop
11 | import tornado.httpserver
12 | from tornado.options import options, define, parse_command_line
13 | from tornado.wsgi import WSGIAdapter
14 |
15 | from webspider import constants
16 | from webspider.web.urls import url_handlers
17 | from webspider.utils.log import config_logging
18 |
19 | config_logging()
20 | logger = logging.getLogger(__name__)
21 |
22 |
23 | def make_wsgi_app():
24 | web_app = make_web_app()
25 |     return WSGIAdapter(web_app)
26 |
27 |
28 | def make_web_app():
29 | settings = {
30 | 'debug': constants.DEBUG,
31 | 'template_path': os.path.join(
32 | os.path.dirname(__file__), "templates"
33 | ),
34 | 'static_path': os.path.join(
35 | os.path.dirname(__file__), 'static'
36 | )
37 | }
38 |
39 | app = tornado.web.Application(url_handlers, **settings)
40 | return app
41 |
42 |
43 | def main():
44 | define(name='port', default=8000, type=int, help='run on the given port')
45 | parse_command_line()
46 | logger.info('====== web server starting at http://0.0.0.0:{} ======'.format(options.port))
47 | if constants.DEBUG:
48 | logger.info('debug mode is enabled!!!')
49 |
50 | app = make_web_app()
51 | http_server = tornado.httpserver.HTTPServer(app)
52 | http_server.listen(options.port)
53 | http_server.start()
54 |
55 | tornado.ioloop.IOLoop.instance().start()
56 |
57 |
58 | if __name__ == '__main__':
59 | main()
60 |
--------------------------------------------------------------------------------
/webspider/web/formatter/__init__.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | from webspider.models import KeywordStatisticModel, JobsCountModel
3 | from webspider.web.formatter.jobs_count import JobsCountFormatter
4 | from webspider.web.formatter.keyword_statistic import KeywordStatisticFormatter
5 |
6 | from webspider.web.formatter.base import Formatter
7 |
8 | formatter_mappings = {
9 | JobsCountModel: JobsCountFormatter,
10 | KeywordStatisticModel: KeywordStatisticFormatter,
11 | }
12 |
13 | Formatter.register_formatter(formatter_mappings)
14 |
--------------------------------------------------------------------------------
/webspider/web/formatter/base.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | from tornado.util import ObjectDict
3 |
4 | from webspider.exceptions import DowngradeException
5 |
6 |
7 | class Downgrade(object):
8 | """降级"""
9 | def __init__(self, value):
10 | self.value = value
11 |
12 |
13 | class Field(object):
14 | """Formatter字段"""
15 | def __init__(self, name, converter=None, downgrade=None):
16 | self.name = name
17 | self.converter = converter
18 | if downgrade is not None and not isinstance(downgrade, Downgrade):
19 | raise DowngradeException(u'downgrade must be Downgrade instance')
20 | self.downgrade = downgrade
21 |
22 |
23 | class Formatter(object):
24 | """Formatter 根据设定的 FORMATTER_MAPS 自动渲染"""
25 | _FORMATTER_MAPS = {}
26 | FIELDS = {}
27 |
28 | @classmethod
29 | def register_formatter(cls, mapping):
30 | cls._FORMATTER_MAPS.update(mapping)
31 |
32 | @classmethod
33 | def format(cls, data):
34 | if isinstance(data, list):
35 | return [cls.format(item) for item in data]
36 | else:
37 | formatter = cls.get_formatter(data)
38 | if not formatter:
39 | raise ValueError(u'Can not find the formatter by model {}'.format(type(data)))
40 |
41 | format_result = ObjectDict()
42 | for field in formatter.FIELDS:
43 | if not isinstance(field, Field):
44 | raise ValueError('formatter field must be Field instance')
45 | try:
46 | value = getattr(data, field.name)
47 |                     # the value may itself be renderable - format it recursively
48 | if isinstance(value, list) or cls.get_formatter(value):
49 | value = cls.format(value)
50 | if field.converter:
51 | value = field.converter(value)
52 | except Exception:
53 |                     # the Field declared a downgrade fallback
54 | if field.downgrade:
55 | value = field.downgrade.value
56 | else:
57 | raise
58 | format_result[field.name] = value
59 |
60 | return format_result
61 |
62 | @classmethod
63 | def get_formatter(cls, data):
64 | if data in cls._FORMATTER_MAPS:
65 | return cls._FORMATTER_MAPS[data]
66 | for model, formatter in cls._FORMATTER_MAPS.items():
67 | if type(data) is model:
68 | return formatter
69 |
--------------------------------------------------------------------------------
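A sketch of wiring a new model into the formatter registry; CityModel here stands in for any mapped model:

    from webspider.web.formatter.base import Field, Downgrade, Formatter

    class CityFormatter(Formatter):
        FIELDS = [
            Field('name'),
            Field('jobs_count', downgrade=Downgrade(0)),   # fall back to 0 on error
        ]

    # hypothetical registration; mirrors webspider/web/formatter/__init__.py
    # Formatter.register_formatter({CityModel: CityFormatter})
    # Formatter.format(city) -> ObjectDict({'name': ..., 'jobs_count': ...})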
/webspider/web/formatter/jobs_count.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | from webspider import utils
3 | from webspider.web.formatter.base import Field, Formatter
4 |
5 |
6 | class JobsCountFormatter(Formatter):
7 | FIELDS = [
8 | Field('date'),
9 | Field('all_city'),
10 | Field('beijing'),
11 | Field('guangzhou'),
12 | Field('shenzhen'),
13 | Field('shanghai'),
14 | Field('hangzhou'),
15 | Field('chengdu'),
16 | Field('created_at', converter=utils.time_tools.datetime_to_timestamp),
17 | Field('updated_at', converter=utils.time_tools.datetime_to_timestamp),
18 | ]
19 |
--------------------------------------------------------------------------------
/webspider/web/formatter/keyword_statistic.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import json
3 |
4 | from webspider import utils
5 | from webspider.web.formatter.base import Field, Downgrade, Formatter
6 |
7 |
8 | class KeywordStatisticFormatter(Formatter):
9 | FIELDS = [
10 | Field('educations', converter=json.loads, downgrade=Downgrade({})),
11 | Field('city_jobs_count', converter=json.loads, downgrade=Downgrade({})),
12 | Field('salary', converter=json.loads, downgrade=Downgrade({})),
13 | Field('financing_stage', converter=json.loads, downgrade=Downgrade({})),
14 | Field('work_years', converter=json.loads, downgrade=Downgrade({})),
15 | Field('per_day_jobs_count'),
16 | Field('created_at', converter=utils.time_tools.datetime_to_timestamp),
17 | Field('updated_at', converter=utils.time_tools.datetime_to_timestamp),
18 | ]
19 |
--------------------------------------------------------------------------------
/webspider/web/handlers/__init__.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | from webspider.web.handlers.keyword_statistics import KeywordStatisticsApiHandler, KeywordStatisticsPageHandler
3 |
4 | __all__ = [
5 | 'KeywordStatisticsApiHandler',
6 | 'KeywordStatisticsPageHandler'
7 | ]
8 |
--------------------------------------------------------------------------------
/webspider/web/handlers/base.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 | from tornado.escape import json_encode
3 | from tornado.web import RequestHandler
4 |
5 | from webspider import constants
6 | from webspider.exceptions import BaseException, ResourceNotFoundWebException
7 | from webspider.web.formatter import Formatter
8 | from webspider.utils.sql import remove_sessions
9 |
10 |
11 | class BaseApiHandler(RequestHandler):
12 | def write_error(self, status_code, **kwargs):
13 | exception = kwargs['exc_info'][1]
14 |
15 |         # TODO: drop this branch once the backend is a pure API
16 |         # in production, render an error page for non-API requests
17 | if not constants.DEBUG and isinstance(self, BasePageHandler):
18 | self._handler_production_page_error(exception)
19 | return
20 |
21 | if isinstance(exception, BaseException):
22 | self.render_exception(exception)
23 | else:
24 | RequestHandler.write_error(self, status_code=status_code, **kwargs)
25 |
26 | def auto_render(self, data):
27 | formatted_dict = Formatter.format(data)
28 | self.render_json(formatted_dict)
29 |
30 | def _handler_production_page_error(self, exception):
31 | """处理生产环境下页面的错误"""
32 | if isinstance(exception, ResourceNotFoundWebException):
33 | self.render('404.html')
34 | else:
35 | self.render('500.html')
36 |
37 | def render_exception(self, exception):
38 | self.set_status(
39 | status_code=exception.STATUS_CODE,
40 | reason=exception.message
41 | )
42 | error_dict = {
43 | 'error': {
44 | 'code': exception.code,
45 | 'name': exception.__class__.__name__,
46 | 'message': exception.message,
47 | 'data': exception.data if exception.data else '',
48 |                 'debug_message': exception.debug_message if exception.debug_message else ''
49 | }
50 | }
51 | self.render_json(error_dict)
52 |
53 | def render_json(self, data):
54 | self.set_header('Content-Type', 'application/json')
55 | self.finish(json_encode(data))
56 |
57 | def on_finish(self):
58 | remove_sessions()
59 |
60 |
61 | # TODO page to api
62 | class BasePageHandler(BaseApiHandler):
63 | """前后端代码混合型的页面 Handler"""
64 | pass
65 |
--------------------------------------------------------------------------------
/webspider/web/handlers/keyword_statistics.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 | import json
3 |
4 | from webspider.web.handlers.base import BasePageHandler, BaseApiHandler
5 | from webspider.exceptions import ResourceNotFoundWebException
6 | from webspider.models import KeywordModel, KeywordStatisticModel
7 |
8 |
9 | class KeywordStatisticsApiHandler(BaseApiHandler):
10 | def get(self):
11 | keyword_name = self.get_argument('keyword_name', '')
12 | if not keyword_name:
13 | raise ResourceNotFoundWebException(u'请输入关键词')
14 |
15 | keyword = KeywordModel.get_one(filter_by={'name': keyword_name})
16 | if not keyword:
17 | raise ResourceNotFoundWebException(u'找不到该关键词')
18 |
19 | keyword_statistic = KeywordStatisticModel.get_one(filter_by={'keyword_id': keyword.id})
20 | if not keyword_statistic:
21 | raise ResourceNotFoundWebException(u'暂无该关键词的统计结果')
22 |
23 | self.auto_render(keyword_statistic)
24 |
25 |
26 | class KeywordStatisticsPageHandler(BasePageHandler):
27 | def get(self):
28 | keyword_name = self.get_argument('keyword_name', '')
29 | if not keyword_name:
30 | raise ResourceNotFoundWebException(u'请输入关键词')
31 |
32 | keyword = KeywordModel.get_one(filter_by={'name': keyword_name})
33 | if not keyword:
34 | raise ResourceNotFoundWebException(u'找不到该关键词')
35 |
36 | keyword_statistic = KeywordStatisticModel.get_one(filter_by={'keyword_id': keyword.id})
37 | if not keyword_statistic:
38 | raise ResourceNotFoundWebException(u'暂无该关键词的统计结果')
39 |
40 | self.render(
41 | "statistics.html",
42 | keyword_name=keyword_name,
43 | educations_statistic=json.loads(keyword_statistic.educations),
44 | city_jobs_count_statistic=json.loads(keyword_statistic.city_jobs_count),
45 | salary_statistic=json.loads(keyword_statistic.salary),
46 | finance_stage_statistic=json.loads(keyword_statistic.financing_stage),
47 | work_years_statistic=json.loads(keyword_statistic.work_years),
48 | per_day_jobs_count_statistic=keyword_statistic.per_day_jobs_count
49 | )
50 |
--------------------------------------------------------------------------------
/webspider/web/static/__init__.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 |
--------------------------------------------------------------------------------
/webspider/web/static/bootstrap/css/bootstrap-theme.min.css:
--------------------------------------------------------------------------------
1 | /*!
2 | * Bootstrap v3.3.7 (http://getbootstrap.com)
3 | * Copyright 2011-2016 Twitter, Inc.
4 | * Licensed under MIT (https://github.com/twbs/bootstrap/blob/master/LICENSE)
5 | */.btn-danger,.btn-default,.btn-info,.btn-primary,.btn-success,.btn-warning{text-shadow:0 -1px 0 rgba(0,0,0,.2);-webkit-box-shadow:inset 0 1px 0 rgba(255,255,255,.15),0 1px 1px rgba(0,0,0,.075);box-shadow:inset 0 1px 0 rgba(255,255,255,.15),0 1px 1px rgba(0,0,0,.075)}.btn-danger.active,.btn-danger:active,.btn-default.active,.btn-default:active,.btn-info.active,.btn-info:active,.btn-primary.active,.btn-primary:active,.btn-success.active,.btn-success:active,.btn-warning.active,.btn-warning:active{-webkit-box-shadow:inset 0 3px 5px rgba(0,0,0,.125);box-shadow:inset 0 3px 5px rgba(0,0,0,.125)}.btn-danger.disabled,.btn-danger[disabled],.btn-default.disabled,.btn-default[disabled],.btn-info.disabled,.btn-info[disabled],.btn-primary.disabled,.btn-primary[disabled],.btn-success.disabled,.btn-success[disabled],.btn-warning.disabled,.btn-warning[disabled],fieldset[disabled] .btn-danger,fieldset[disabled] .btn-default,fieldset[disabled] .btn-info,fieldset[disabled] .btn-primary,fieldset[disabled] .btn-success,fieldset[disabled] .btn-warning{-webkit-box-shadow:none;box-shadow:none}.btn-danger .badge,.btn-default .badge,.btn-info .badge,.btn-primary .badge,.btn-success .badge,.btn-warning .badge{text-shadow:none}.btn.active,.btn:active{background-image:none}.btn-default{text-shadow:0 1px 0 #fff;background-image:-webkit-linear-gradient(top,#fff 0,#e0e0e0 100%);background-image:-o-linear-gradient(top,#fff 0,#e0e0e0 100%);background-image:-webkit-gradient(linear,left top,left bottom,from(#fff),to(#e0e0e0));background-image:linear-gradient(to bottom,#fff 0,#e0e0e0 100%);filter:progid:DXImageTransform.Microsoft.gradient(startColorstr='#ffffffff', endColorstr='#ffe0e0e0', GradientType=0);filter:progid:DXImageTransform.Microsoft.gradient(enabled=false);background-repeat:repeat-x;border-color:#dbdbdb;border-color:#ccc}.btn-default:focus,.btn-default:hover{background-color:#e0e0e0;background-position:0 -15px}.btn-default.active,.btn-default:active{background-color:#e0e0e0;border-color:#dbdbdb}.btn-default.disabled,.btn-default.disabled.active,.btn-default.disabled.focus,.btn-default.disabled:active,.btn-default.disabled:focus,.btn-default.disabled:hover,.btn-default[disabled],.btn-default[disabled].active,.btn-default[disabled].focus,.btn-default[disabled]:active,.btn-default[disabled]:focus,.btn-default[disabled]:hover,fieldset[disabled] .btn-default,fieldset[disabled] .btn-default.active,fieldset[disabled] .btn-default.focus,fieldset[disabled] .btn-default:active,fieldset[disabled] .btn-default:focus,fieldset[disabled] .btn-default:hover{background-color:#e0e0e0;background-image:none}.btn-primary{background-image:-webkit-linear-gradient(top,#337ab7 0,#265a88 100%);background-image:-o-linear-gradient(top,#337ab7 0,#265a88 100%);background-image:-webkit-gradient(linear,left top,left bottom,from(#337ab7),to(#265a88));background-image:linear-gradient(to bottom,#337ab7 0,#265a88 100%);filter:progid:DXImageTransform.Microsoft.gradient(startColorstr='#ff337ab7', endColorstr='#ff265a88', GradientType=0);filter:progid:DXImageTransform.Microsoft.gradient(enabled=false);background-repeat:repeat-x;border-color:#245580}.btn-primary:focus,.btn-primary:hover{background-color:#265a88;background-position:0 
-15px}.btn-primary.active,.btn-primary:active{background-color:#265a88;border-color:#245580}.btn-primary.disabled,.btn-primary.disabled.active,.btn-primary.disabled.focus,.btn-primary.disabled:active,.btn-primary.disabled:focus,.btn-primary.disabled:hover,.btn-primary[disabled],.btn-primary[disabled].active,.btn-primary[disabled].focus,.btn-primary[disabled]:active,.btn-primary[disabled]:focus,.btn-primary[disabled]:hover,fieldset[disabled] .btn-primary,fieldset[disabled] .btn-primary.active,fieldset[disabled] .btn-primary.focus,fieldset[disabled] .btn-primary:active,fieldset[disabled] .btn-primary:focus,fieldset[disabled] .btn-primary:hover{background-color:#265a88;background-image:none}.btn-success{background-image:-webkit-linear-gradient(top,#5cb85c 0,#419641 100%);background-image:-o-linear-gradient(top,#5cb85c 0,#419641 100%);background-image:-webkit-gradient(linear,left top,left bottom,from(#5cb85c),to(#419641));background-image:linear-gradient(to bottom,#5cb85c 0,#419641 100%);filter:progid:DXImageTransform.Microsoft.gradient(startColorstr='#ff5cb85c', endColorstr='#ff419641', GradientType=0);filter:progid:DXImageTransform.Microsoft.gradient(enabled=false);background-repeat:repeat-x;border-color:#3e8f3e}.btn-success:focus,.btn-success:hover{background-color:#419641;background-position:0 -15px}.btn-success.active,.btn-success:active{background-color:#419641;border-color:#3e8f3e}.btn-success.disabled,.btn-success.disabled.active,.btn-success.disabled.focus,.btn-success.disabled:active,.btn-success.disabled:focus,.btn-success.disabled:hover,.btn-success[disabled],.btn-success[disabled].active,.btn-success[disabled].focus,.btn-success[disabled]:active,.btn-success[disabled]:focus,.btn-success[disabled]:hover,fieldset[disabled] .btn-success,fieldset[disabled] .btn-success.active,fieldset[disabled] .btn-success.focus,fieldset[disabled] .btn-success:active,fieldset[disabled] .btn-success:focus,fieldset[disabled] .btn-success:hover{background-color:#419641;background-image:none}.btn-info{background-image:-webkit-linear-gradient(top,#5bc0de 0,#2aabd2 100%);background-image:-o-linear-gradient(top,#5bc0de 0,#2aabd2 100%);background-image:-webkit-gradient(linear,left top,left bottom,from(#5bc0de),to(#2aabd2));background-image:linear-gradient(to bottom,#5bc0de 0,#2aabd2 100%);filter:progid:DXImageTransform.Microsoft.gradient(startColorstr='#ff5bc0de', endColorstr='#ff2aabd2', GradientType=0);filter:progid:DXImageTransform.Microsoft.gradient(enabled=false);background-repeat:repeat-x;border-color:#28a4c9}.btn-info:focus,.btn-info:hover{background-color:#2aabd2;background-position:0 -15px}.btn-info.active,.btn-info:active{background-color:#2aabd2;border-color:#28a4c9}.btn-info.disabled,.btn-info.disabled.active,.btn-info.disabled.focus,.btn-info.disabled:active,.btn-info.disabled:focus,.btn-info.disabled:hover,.btn-info[disabled],.btn-info[disabled].active,.btn-info[disabled].focus,.btn-info[disabled]:active,.btn-info[disabled]:focus,.btn-info[disabled]:hover,fieldset[disabled] .btn-info,fieldset[disabled] .btn-info.active,fieldset[disabled] .btn-info.focus,fieldset[disabled] .btn-info:active,fieldset[disabled] .btn-info:focus,fieldset[disabled] .btn-info:hover{background-color:#2aabd2;background-image:none}.btn-warning{background-image:-webkit-linear-gradient(top,#f0ad4e 0,#eb9316 100%);background-image:-o-linear-gradient(top,#f0ad4e 0,#eb9316 100%);background-image:-webkit-gradient(linear,left top,left bottom,from(#f0ad4e),to(#eb9316));background-image:linear-gradient(to bottom,#f0ad4e 0,#eb9316 
100%);filter:progid:DXImageTransform.Microsoft.gradient(startColorstr='#fff0ad4e', endColorstr='#ffeb9316', GradientType=0);filter:progid:DXImageTransform.Microsoft.gradient(enabled=false);background-repeat:repeat-x;border-color:#e38d13}.btn-warning:focus,.btn-warning:hover{background-color:#eb9316;background-position:0 -15px}.btn-warning.active,.btn-warning:active{background-color:#eb9316;border-color:#e38d13}.btn-warning.disabled,.btn-warning.disabled.active,.btn-warning.disabled.focus,.btn-warning.disabled:active,.btn-warning.disabled:focus,.btn-warning.disabled:hover,.btn-warning[disabled],.btn-warning[disabled].active,.btn-warning[disabled].focus,.btn-warning[disabled]:active,.btn-warning[disabled]:focus,.btn-warning[disabled]:hover,fieldset[disabled] .btn-warning,fieldset[disabled] .btn-warning.active,fieldset[disabled] .btn-warning.focus,fieldset[disabled] .btn-warning:active,fieldset[disabled] .btn-warning:focus,fieldset[disabled] .btn-warning:hover{background-color:#eb9316;background-image:none}.btn-danger{background-image:-webkit-linear-gradient(top,#d9534f 0,#c12e2a 100%);background-image:-o-linear-gradient(top,#d9534f 0,#c12e2a 100%);background-image:-webkit-gradient(linear,left top,left bottom,from(#d9534f),to(#c12e2a));background-image:linear-gradient(to bottom,#d9534f 0,#c12e2a 100%);filter:progid:DXImageTransform.Microsoft.gradient(startColorstr='#ffd9534f', endColorstr='#ffc12e2a', GradientType=0);filter:progid:DXImageTransform.Microsoft.gradient(enabled=false);background-repeat:repeat-x;border-color:#b92c28}.btn-danger:focus,.btn-danger:hover{background-color:#c12e2a;background-position:0 -15px}.btn-danger.active,.btn-danger:active{background-color:#c12e2a;border-color:#b92c28}.btn-danger.disabled,.btn-danger.disabled.active,.btn-danger.disabled.focus,.btn-danger.disabled:active,.btn-danger.disabled:focus,.btn-danger.disabled:hover,.btn-danger[disabled],.btn-danger[disabled].active,.btn-danger[disabled].focus,.btn-danger[disabled]:active,.btn-danger[disabled]:focus,.btn-danger[disabled]:hover,fieldset[disabled] .btn-danger,fieldset[disabled] .btn-danger.active,fieldset[disabled] .btn-danger.focus,fieldset[disabled] .btn-danger:active,fieldset[disabled] .btn-danger:focus,fieldset[disabled] .btn-danger:hover{background-color:#c12e2a;background-image:none}.img-thumbnail,.thumbnail{-webkit-box-shadow:0 1px 2px rgba(0,0,0,.075);box-shadow:0 1px 2px rgba(0,0,0,.075)}.dropdown-menu>li>a:focus,.dropdown-menu>li>a:hover{background-color:#e8e8e8;background-image:-webkit-linear-gradient(top,#f5f5f5 0,#e8e8e8 100%);background-image:-o-linear-gradient(top,#f5f5f5 0,#e8e8e8 100%);background-image:-webkit-gradient(linear,left top,left bottom,from(#f5f5f5),to(#e8e8e8));background-image:linear-gradient(to bottom,#f5f5f5 0,#e8e8e8 100%);filter:progid:DXImageTransform.Microsoft.gradient(startColorstr='#fff5f5f5', endColorstr='#ffe8e8e8', GradientType=0);background-repeat:repeat-x}.dropdown-menu>.active>a,.dropdown-menu>.active>a:focus,.dropdown-menu>.active>a:hover{background-color:#2e6da4;background-image:-webkit-linear-gradient(top,#337ab7 0,#2e6da4 100%);background-image:-o-linear-gradient(top,#337ab7 0,#2e6da4 100%);background-image:-webkit-gradient(linear,left top,left bottom,from(#337ab7),to(#2e6da4));background-image:linear-gradient(to bottom,#337ab7 0,#2e6da4 100%);filter:progid:DXImageTransform.Microsoft.gradient(startColorstr='#ff337ab7', endColorstr='#ff2e6da4', GradientType=0);background-repeat:repeat-x}.navbar-default{background-image:-webkit-linear-gradient(top,#fff 0,#f8f8f8 
100%);background-image:-o-linear-gradient(top,#fff 0,#f8f8f8 100%);background-image:-webkit-gradient(linear,left top,left bottom,from(#fff),to(#f8f8f8));background-image:linear-gradient(to bottom,#fff 0,#f8f8f8 100%);filter:progid:DXImageTransform.Microsoft.gradient(startColorstr='#ffffffff', endColorstr='#fff8f8f8', GradientType=0);filter:progid:DXImageTransform.Microsoft.gradient(enabled=false);background-repeat:repeat-x;border-radius:4px;-webkit-box-shadow:inset 0 1px 0 rgba(255,255,255,.15),0 1px 5px rgba(0,0,0,.075);box-shadow:inset 0 1px 0 rgba(255,255,255,.15),0 1px 5px rgba(0,0,0,.075)}.navbar-default .navbar-nav>.active>a,.navbar-default .navbar-nav>.open>a{background-image:-webkit-linear-gradient(top,#dbdbdb 0,#e2e2e2 100%);background-image:-o-linear-gradient(top,#dbdbdb 0,#e2e2e2 100%);background-image:-webkit-gradient(linear,left top,left bottom,from(#dbdbdb),to(#e2e2e2));background-image:linear-gradient(to bottom,#dbdbdb 0,#e2e2e2 100%);filter:progid:DXImageTransform.Microsoft.gradient(startColorstr='#ffdbdbdb', endColorstr='#ffe2e2e2', GradientType=0);background-repeat:repeat-x;-webkit-box-shadow:inset 0 3px 9px rgba(0,0,0,.075);box-shadow:inset 0 3px 9px rgba(0,0,0,.075)}.navbar-brand,.navbar-nav>li>a{text-shadow:0 1px 0 rgba(255,255,255,.25)}.navbar-inverse{background-image:-webkit-linear-gradient(top,#3c3c3c 0,#222 100%);background-image:-o-linear-gradient(top,#3c3c3c 0,#222 100%);background-image:-webkit-gradient(linear,left top,left bottom,from(#3c3c3c),to(#222));background-image:linear-gradient(to bottom,#3c3c3c 0,#222 100%);filter:progid:DXImageTransform.Microsoft.gradient(startColorstr='#ff3c3c3c', endColorstr='#ff222222', GradientType=0);filter:progid:DXImageTransform.Microsoft.gradient(enabled=false);background-repeat:repeat-x;border-radius:4px}.navbar-inverse .navbar-nav>.active>a,.navbar-inverse .navbar-nav>.open>a{background-image:-webkit-linear-gradient(top,#080808 0,#0f0f0f 100%);background-image:-o-linear-gradient(top,#080808 0,#0f0f0f 100%);background-image:-webkit-gradient(linear,left top,left bottom,from(#080808),to(#0f0f0f));background-image:linear-gradient(to bottom,#080808 0,#0f0f0f 100%);filter:progid:DXImageTransform.Microsoft.gradient(startColorstr='#ff080808', endColorstr='#ff0f0f0f', GradientType=0);background-repeat:repeat-x;-webkit-box-shadow:inset 0 3px 9px rgba(0,0,0,.25);box-shadow:inset 0 3px 9px rgba(0,0,0,.25)}.navbar-inverse .navbar-brand,.navbar-inverse .navbar-nav>li>a{text-shadow:0 -1px 0 rgba(0,0,0,.25)}.navbar-fixed-bottom,.navbar-fixed-top,.navbar-static-top{border-radius:0}@media (max-width:767px){.navbar .navbar-nav .open .dropdown-menu>.active>a,.navbar .navbar-nav .open .dropdown-menu>.active>a:focus,.navbar .navbar-nav .open .dropdown-menu>.active>a:hover{color:#fff;background-image:-webkit-linear-gradient(top,#337ab7 0,#2e6da4 100%);background-image:-o-linear-gradient(top,#337ab7 0,#2e6da4 100%);background-image:-webkit-gradient(linear,left top,left bottom,from(#337ab7),to(#2e6da4));background-image:linear-gradient(to bottom,#337ab7 0,#2e6da4 100%);filter:progid:DXImageTransform.Microsoft.gradient(startColorstr='#ff337ab7', endColorstr='#ff2e6da4', GradientType=0);background-repeat:repeat-x}}.alert{text-shadow:0 1px 0 rgba(255,255,255,.2);-webkit-box-shadow:inset 0 1px 0 rgba(255,255,255,.25),0 1px 2px rgba(0,0,0,.05);box-shadow:inset 0 1px 0 rgba(255,255,255,.25),0 1px 2px rgba(0,0,0,.05)}.alert-success{background-image:-webkit-linear-gradient(top,#dff0d8 0,#c8e5bc 100%);background-image:-o-linear-gradient(top,#dff0d8 0,#c8e5bc 
100%);background-image:-webkit-gradient(linear,left top,left bottom,from(#dff0d8),to(#c8e5bc));background-image:linear-gradient(to bottom,#dff0d8 0,#c8e5bc 100%);filter:progid:DXImageTransform.Microsoft.gradient(startColorstr='#ffdff0d8', endColorstr='#ffc8e5bc', GradientType=0);background-repeat:repeat-x;border-color:#b2dba1}.alert-info{background-image:-webkit-linear-gradient(top,#d9edf7 0,#b9def0 100%);background-image:-o-linear-gradient(top,#d9edf7 0,#b9def0 100%);background-image:-webkit-gradient(linear,left top,left bottom,from(#d9edf7),to(#b9def0));background-image:linear-gradient(to bottom,#d9edf7 0,#b9def0 100%);filter:progid:DXImageTransform.Microsoft.gradient(startColorstr='#ffd9edf7', endColorstr='#ffb9def0', GradientType=0);background-repeat:repeat-x;border-color:#9acfea}.alert-warning{background-image:-webkit-linear-gradient(top,#fcf8e3 0,#f8efc0 100%);background-image:-o-linear-gradient(top,#fcf8e3 0,#f8efc0 100%);background-image:-webkit-gradient(linear,left top,left bottom,from(#fcf8e3),to(#f8efc0));background-image:linear-gradient(to bottom,#fcf8e3 0,#f8efc0 100%);filter:progid:DXImageTransform.Microsoft.gradient(startColorstr='#fffcf8e3', endColorstr='#fff8efc0', GradientType=0);background-repeat:repeat-x;border-color:#f5e79e}.alert-danger{background-image:-webkit-linear-gradient(top,#f2dede 0,#e7c3c3 100%);background-image:-o-linear-gradient(top,#f2dede 0,#e7c3c3 100%);background-image:-webkit-gradient(linear,left top,left bottom,from(#f2dede),to(#e7c3c3));background-image:linear-gradient(to bottom,#f2dede 0,#e7c3c3 100%);filter:progid:DXImageTransform.Microsoft.gradient(startColorstr='#fff2dede', endColorstr='#ffe7c3c3', GradientType=0);background-repeat:repeat-x;border-color:#dca7a7}.progress{background-image:-webkit-linear-gradient(top,#ebebeb 0,#f5f5f5 100%);background-image:-o-linear-gradient(top,#ebebeb 0,#f5f5f5 100%);background-image:-webkit-gradient(linear,left top,left bottom,from(#ebebeb),to(#f5f5f5));background-image:linear-gradient(to bottom,#ebebeb 0,#f5f5f5 100%);filter:progid:DXImageTransform.Microsoft.gradient(startColorstr='#ffebebeb', endColorstr='#fff5f5f5', GradientType=0);background-repeat:repeat-x}.progress-bar{background-image:-webkit-linear-gradient(top,#337ab7 0,#286090 100%);background-image:-o-linear-gradient(top,#337ab7 0,#286090 100%);background-image:-webkit-gradient(linear,left top,left bottom,from(#337ab7),to(#286090));background-image:linear-gradient(to bottom,#337ab7 0,#286090 100%);filter:progid:DXImageTransform.Microsoft.gradient(startColorstr='#ff337ab7', endColorstr='#ff286090', GradientType=0);background-repeat:repeat-x}.progress-bar-success{background-image:-webkit-linear-gradient(top,#5cb85c 0,#449d44 100%);background-image:-o-linear-gradient(top,#5cb85c 0,#449d44 100%);background-image:-webkit-gradient(linear,left top,left bottom,from(#5cb85c),to(#449d44));background-image:linear-gradient(to bottom,#5cb85c 0,#449d44 100%);filter:progid:DXImageTransform.Microsoft.gradient(startColorstr='#ff5cb85c', endColorstr='#ff449d44', GradientType=0);background-repeat:repeat-x}.progress-bar-info{background-image:-webkit-linear-gradient(top,#5bc0de 0,#31b0d5 100%);background-image:-o-linear-gradient(top,#5bc0de 0,#31b0d5 100%);background-image:-webkit-gradient(linear,left top,left bottom,from(#5bc0de),to(#31b0d5));background-image:linear-gradient(to bottom,#5bc0de 0,#31b0d5 100%);filter:progid:DXImageTransform.Microsoft.gradient(startColorstr='#ff5bc0de', endColorstr='#ff31b0d5', 
GradientType=0);background-repeat:repeat-x}.progress-bar-warning{background-image:-webkit-linear-gradient(top,#f0ad4e 0,#ec971f 100%);background-image:-o-linear-gradient(top,#f0ad4e 0,#ec971f 100%);background-image:-webkit-gradient(linear,left top,left bottom,from(#f0ad4e),to(#ec971f));background-image:linear-gradient(to bottom,#f0ad4e 0,#ec971f 100%);filter:progid:DXImageTransform.Microsoft.gradient(startColorstr='#fff0ad4e', endColorstr='#ffec971f', GradientType=0);background-repeat:repeat-x}.progress-bar-danger{background-image:-webkit-linear-gradient(top,#d9534f 0,#c9302c 100%);background-image:-o-linear-gradient(top,#d9534f 0,#c9302c 100%);background-image:-webkit-gradient(linear,left top,left bottom,from(#d9534f),to(#c9302c));background-image:linear-gradient(to bottom,#d9534f 0,#c9302c 100%);filter:progid:DXImageTransform.Microsoft.gradient(startColorstr='#ffd9534f', endColorstr='#ffc9302c', GradientType=0);background-repeat:repeat-x}.progress-bar-striped{background-image:-webkit-linear-gradient(45deg,rgba(255,255,255,.15) 25%,transparent 25%,transparent 50%,rgba(255,255,255,.15) 50%,rgba(255,255,255,.15) 75%,transparent 75%,transparent);background-image:-o-linear-gradient(45deg,rgba(255,255,255,.15) 25%,transparent 25%,transparent 50%,rgba(255,255,255,.15) 50%,rgba(255,255,255,.15) 75%,transparent 75%,transparent);background-image:linear-gradient(45deg,rgba(255,255,255,.15) 25%,transparent 25%,transparent 50%,rgba(255,255,255,.15) 50%,rgba(255,255,255,.15) 75%,transparent 75%,transparent)}.list-group{border-radius:4px;-webkit-box-shadow:0 1px 2px rgba(0,0,0,.075);box-shadow:0 1px 2px rgba(0,0,0,.075)}.list-group-item.active,.list-group-item.active:focus,.list-group-item.active:hover{text-shadow:0 -1px 0 #286090;background-image:-webkit-linear-gradient(top,#337ab7 0,#2b669a 100%);background-image:-o-linear-gradient(top,#337ab7 0,#2b669a 100%);background-image:-webkit-gradient(linear,left top,left bottom,from(#337ab7),to(#2b669a));background-image:linear-gradient(to bottom,#337ab7 0,#2b669a 100%);filter:progid:DXImageTransform.Microsoft.gradient(startColorstr='#ff337ab7', endColorstr='#ff2b669a', GradientType=0);background-repeat:repeat-x;border-color:#2b669a}.list-group-item.active .badge,.list-group-item.active:focus .badge,.list-group-item.active:hover .badge{text-shadow:none}.panel{-webkit-box-shadow:0 1px 2px rgba(0,0,0,.05);box-shadow:0 1px 2px rgba(0,0,0,.05)}.panel-default>.panel-heading{background-image:-webkit-linear-gradient(top,#f5f5f5 0,#e8e8e8 100%);background-image:-o-linear-gradient(top,#f5f5f5 0,#e8e8e8 100%);background-image:-webkit-gradient(linear,left top,left bottom,from(#f5f5f5),to(#e8e8e8));background-image:linear-gradient(to bottom,#f5f5f5 0,#e8e8e8 100%);filter:progid:DXImageTransform.Microsoft.gradient(startColorstr='#fff5f5f5', endColorstr='#ffe8e8e8', GradientType=0);background-repeat:repeat-x}.panel-primary>.panel-heading{background-image:-webkit-linear-gradient(top,#337ab7 0,#2e6da4 100%);background-image:-o-linear-gradient(top,#337ab7 0,#2e6da4 100%);background-image:-webkit-gradient(linear,left top,left bottom,from(#337ab7),to(#2e6da4));background-image:linear-gradient(to bottom,#337ab7 0,#2e6da4 100%);filter:progid:DXImageTransform.Microsoft.gradient(startColorstr='#ff337ab7', endColorstr='#ff2e6da4', GradientType=0);background-repeat:repeat-x}.panel-success>.panel-heading{background-image:-webkit-linear-gradient(top,#dff0d8 0,#d0e9c6 100%);background-image:-o-linear-gradient(top,#dff0d8 0,#d0e9c6 100%);background-image:-webkit-gradient(linear,left 
top,left bottom,from(#dff0d8),to(#d0e9c6));background-image:linear-gradient(to bottom,#dff0d8 0,#d0e9c6 100%);filter:progid:DXImageTransform.Microsoft.gradient(startColorstr='#ffdff0d8', endColorstr='#ffd0e9c6', GradientType=0);background-repeat:repeat-x}.panel-info>.panel-heading{background-image:-webkit-linear-gradient(top,#d9edf7 0,#c4e3f3 100%);background-image:-o-linear-gradient(top,#d9edf7 0,#c4e3f3 100%);background-image:-webkit-gradient(linear,left top,left bottom,from(#d9edf7),to(#c4e3f3));background-image:linear-gradient(to bottom,#d9edf7 0,#c4e3f3 100%);filter:progid:DXImageTransform.Microsoft.gradient(startColorstr='#ffd9edf7', endColorstr='#ffc4e3f3', GradientType=0);background-repeat:repeat-x}.panel-warning>.panel-heading{background-image:-webkit-linear-gradient(top,#fcf8e3 0,#faf2cc 100%);background-image:-o-linear-gradient(top,#fcf8e3 0,#faf2cc 100%);background-image:-webkit-gradient(linear,left top,left bottom,from(#fcf8e3),to(#faf2cc));background-image:linear-gradient(to bottom,#fcf8e3 0,#faf2cc 100%);filter:progid:DXImageTransform.Microsoft.gradient(startColorstr='#fffcf8e3', endColorstr='#fffaf2cc', GradientType=0);background-repeat:repeat-x}.panel-danger>.panel-heading{background-image:-webkit-linear-gradient(top,#f2dede 0,#ebcccc 100%);background-image:-o-linear-gradient(top,#f2dede 0,#ebcccc 100%);background-image:-webkit-gradient(linear,left top,left bottom,from(#f2dede),to(#ebcccc));background-image:linear-gradient(to bottom,#f2dede 0,#ebcccc 100%);filter:progid:DXImageTransform.Microsoft.gradient(startColorstr='#fff2dede', endColorstr='#ffebcccc', GradientType=0);background-repeat:repeat-x}.well{background-image:-webkit-linear-gradient(top,#e8e8e8 0,#f5f5f5 100%);background-image:-o-linear-gradient(top,#e8e8e8 0,#f5f5f5 100%);background-image:-webkit-gradient(linear,left top,left bottom,from(#e8e8e8),to(#f5f5f5));background-image:linear-gradient(to bottom,#e8e8e8 0,#f5f5f5 100%);filter:progid:DXImageTransform.Microsoft.gradient(startColorstr='#ffe8e8e8', endColorstr='#fff5f5f5', GradientType=0);background-repeat:repeat-x;border-color:#dcdcdc;-webkit-box-shadow:inset 0 1px 3px rgba(0,0,0,.05),0 1px 0 rgba(255,255,255,.1);box-shadow:inset 0 1px 3px rgba(0,0,0,.05),0 1px 0 rgba(255,255,255,.1)}
6 | /*# sourceMappingURL=bootstrap-theme.min.css.map */
--------------------------------------------------------------------------------
/webspider/web/static/bootstrap/fonts/glyphicons-halflings-regular.eot:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JustForFunnnn/webspider/e41e94fb4f269ecab61cc19c76331c7667151cfb/webspider/web/static/bootstrap/fonts/glyphicons-halflings-regular.eot
--------------------------------------------------------------------------------
/webspider/web/static/bootstrap/fonts/glyphicons-halflings-regular.ttf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JustForFunnnn/webspider/e41e94fb4f269ecab61cc19c76331c7667151cfb/webspider/web/static/bootstrap/fonts/glyphicons-halflings-regular.ttf
--------------------------------------------------------------------------------
/webspider/web/static/bootstrap/fonts/glyphicons-halflings-regular.woff:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JustForFunnnn/webspider/e41e94fb4f269ecab61cc19c76331c7667151cfb/webspider/web/static/bootstrap/fonts/glyphicons-halflings-regular.woff
--------------------------------------------------------------------------------
/webspider/web/static/bootstrap/fonts/glyphicons-halflings-regular.woff2:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JustForFunnnn/webspider/e41e94fb4f269ecab61cc19c76331c7667151cfb/webspider/web/static/bootstrap/fonts/glyphicons-halflings-regular.woff2
--------------------------------------------------------------------------------
/webspider/web/static/bootstrap/js/npm.js:
--------------------------------------------------------------------------------
1 | // This file is autogenerated via the `commonjs` Grunt task. You can require() this file in a CommonJS environment.
2 | require('../../js/transition.js')
3 | require('../../js/alert.js')
4 | require('../../js/button.js')
5 | require('../../js/carousel.js')
6 | require('../../js/collapse.js')
7 | require('../../js/dropdown.js')
8 | require('../../js/modal.js')
9 | require('../../js/tooltip.js')
10 | require('../../js/popover.js')
11 | require('../../js/scrollspy.js')
12 | require('../../js/tab.js')
13 | require('../../js/affix.js')
--------------------------------------------------------------------------------
/webspider/web/static/css/mystyle.css:
--------------------------------------------------------------------------------
1 | body {
2 | font-family: "Hiragino Sans GB", "Microsoft Yahei", SimSun, Arial, "Helvetica Neue", Helvetica;
3 | color: #333;
4 | word-wrap: break-word;
5 | -webkit-font-smoothing: antialiased;
6 | font-size: 14px;
7 | }
8 |
9 | footer {
10 | font-size: 14px;
11 | border-radius: 5px;
12 | margin: 0 auto;
13 | width: 100%;
14 | text-align: center;
15 | padding: 10px 0;
16 | }
17 |
18 | .main-body {
19 | min-height: 780px;
20 | }
21 |
22 | .chart-div {
23 | width: 550px;
24 | height: 400px;
25 | padding-top: 30px;
26 | margin: 0 auto;
27 | }
28 |
29 | .large-chart-div {
30 | width: 1000px;
31 | height: 700px;
32 | padding-top: 30px;
33 | margin: 0 auto;
34 | }
--------------------------------------------------------------------------------
/webspider/web/static/img/favicon.ico:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JustForFunnnn/webspider/e41e94fb4f269ecab61cc19c76331c7667151cfb/webspider/web/static/img/favicon.ico
--------------------------------------------------------------------------------
/webspider/web/templates/404.html:
--------------------------------------------------------------------------------
1 | {% extends "base.html" %}
2 | {% block body %}
3 |
4 | You have arrived at a wasteland without knowledge _(:з」∠)_
5 | {% end %}
--------------------------------------------------------------------------------
/webspider/web/templates/500.html:
--------------------------------------------------------------------------------
1 | {% extends "base.html" %}
2 | {% block body %}
3 |
4 | The server raised a question _(:з」∠)_
5 | {% end %}
--------------------------------------------------------------------------------
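
Tornado serves these two templates through an error hook on a base handler. The snippet below is a sketch of one common pattern, not necessarily what webspider/web/handlers/base.py actually does; the class name is made up for illustration:

# sketch: routing Tornado errors to the 404.html / 500.html templates above
from tornado.web import RequestHandler

class BaseHandler(RequestHandler):
    def write_error(self, status_code, **kwargs):
        # write_error(status_code, **kwargs) is Tornado's standard error hook;
        # use the dedicated page for 404 and fall back to 500.html otherwise
        template = '404.html' if status_code == 404 else '500.html'
        self.render(template)
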
/webspider/web/templates/base.html:
--------------------------------------------------------------------------------
8 | Web Spider|JustForFunnn
17 | {% block header %}{% end %}
38 | {% block body %}
39 | {% end %}
--------------------------------------------------------------------------------
/webspider/web/templates/city-jobs-count-chart-module.html:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/webspider/web/templates/education-chart-module.html:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/webspider/web/templates/finance-stage-chart-module.html:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/webspider/web/templates/pagination-module.html:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/webspider/web/templates/per-day-jobs-count-chart-module.html:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/webspider/web/templates/salary-chart-module.html:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/webspider/web/templates/statistics.html:
--------------------------------------------------------------------------------
1 | {% extends "base.html" %}
2 | {% block body %}
3 |
4 |
5 | about {{ keyword_name }}:
6 |
7 | {% include 'work-year-chart-module.html' %}
8 | {% include 'salary-chart-module.html' %}
9 | {% include 'city-jobs-count-chart-module.html' %}
10 | {% include 'education-chart-module.html' %}
11 | {% include 'per-day-jobs-count-chart-module.html' %}
12 | {% include 'finance-stage-chart-module.html' %}
13 |
14 |
15 | {% end %}
16 |
17 | {% block header %}
18 |
19 | {% end %}
--------------------------------------------------------------------------------
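
statistics.html expects a keyword_name variable and stitches the six chart modules together via {% include %}. The project's actual page handler lives in webspider/web/handlers/keyword_statistics.py; as a rough sketch of how a Tornado handler could feed this template (the handler name here is hypothetical):

# sketch: a minimal handler that renders statistics.html
from tornado.web import RequestHandler

class StatisticsPageHandler(RequestHandler):
    def get(self):
        # default mirrors the redirect in urls.py: /statistics?keyword_name=python
        keyword_name = self.get_argument('keyword_name', 'python')
        # render() resolves the template against the application's template_path
        self.render('statistics.html', keyword_name=keyword_name)
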
/webspider/web/templates/work-year-chart-module.html:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/webspider/web/urls.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | from tornado.web import URLSpec, RedirectHandler
3 |
4 | from webspider.web.handlers import KeywordStatisticsApiHandler, KeywordStatisticsPageHandler
5 |
6 | url_handlers = [
7 | URLSpec(r"/", RedirectHandler, {'url': '/statistics?keyword_name=python'}),
8 | URLSpec(r"/api/statistics", KeywordStatisticsApiHandler),
9 | URLSpec(r"/statistics", KeywordStatisticsPageHandler),
10 | ]
11 |
--------------------------------------------------------------------------------
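
url_handlers is consumed when the Tornado application is built in webspider/web/app.py. A minimal sketch of that wiring, with the paths and port chosen here purely for illustration:

# sketch: wiring url_handlers into a Tornado application
import os

import tornado.ioloop
import tornado.web

from webspider.web.urls import url_handlers

def make_app():
    base = os.path.dirname(__file__)
    return tornado.web.Application(
        url_handlers,
        template_path=os.path.join(base, 'templates'),  # templates/ shown above
        static_path=os.path.join(base, 'static'),
    )

if __name__ == '__main__':
    make_app().listen(8888)  # port is an assumption
    tornado.ioloop.IOLoop.current().start()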