├── .gitattributes
├── .gitignore
├── .landscape.yml
├── .travis.yml
├── LICENSE
├── Makefile
├── README.md
├── job-chart.jpg
├── requirements.txt
├── setup.cfg
├── setup.py
├── tests
    ├── __init__.py
    ├── base.py
    ├── fixture
    │   ├── city.sql
    │   ├── company.sql
    │   ├── industry.sql
    │   ├── job.sql
    │   ├── job_keyword.sql
    │   ├── jobs_count.sql
    │   ├── keyword.sql
    │   └── keyword_statistic.sql
    ├── schema.sql
    ├── test_controllers
    │   ├── __init__.py
    │   ├── test_city_ctl.py
    │   ├── test_industry_ctl.py
    │   ├── test_job_ctl.py
    │   ├── test_job_keyword_ctl.py
    │   ├── test_keyword_ctl.py
    │   └── test_keyword_statistic_ctl.py
    ├── test_models
    │   └── test_job.py
    ├── test_utils
    │   ├── test_cache.py
    │   ├── test_classproperty.py
    │   ├── test_common.py
    │   ├── test_convert.py
    │   ├── test_http_tools.py
    │   ├── test_pagination.py
    │   ├── test_text.py
    │   └── test_time_tools.py
    ├── test_web
    │   ├── base.py
    │   ├── test_formatter.py
    │   └── test_keyword_statistic.py
    └── util.py
└── webspider
    ├── __init__.py
    ├── constants.py
    ├── controllers
        ├── __init__.py
        ├── city_ctl.py
        ├── industry_ctl.py
        ├── job_ctl.py
        ├── job_keyword_ctl.py
        ├── keyword_ctl.py
        └── keyword_statistic_ctl.py
    ├── crawlers
        ├── __init__.py
        ├── lagou_cites.py
        ├── lagou_companies.py
        ├── lagou_jobs.py
        └── lagou_jobs_count.py
    ├── exceptions.py
    ├── models
        ├── __init__.py
        ├── base.py
        ├── city.py
        ├── company.py
        ├── company_industry.py
        ├── industry.py
        ├── job.py
        ├── job_keyword.py
        ├── jobs_count.py
        ├── keyword.py
        └── keyword_statistic.py
    ├── quickly_cmd.py
    ├── setting.py
    ├── tasks
        ├── __init__.py
        ├── actor
        │   ├── __init__.py
        │   ├── keyword_statistic.py
        │   ├── lagou_data.py
        │   └── lagou_jobs_count.py
        ├── celery_app.py
        └── celery_config.py
    ├── utils
        ├── __init__.py
        ├── cache.py
        ├── classproperty.py
        ├── common.py
        ├── convert.py
        ├── http_tools.py
        ├── log.py
        ├── pagination.py
        ├── sql.py
        ├── text.py
        └── time_tools.py
    └── web
        ├── __init__.py
        ├── app.py
        ├── formatter
            ├── __init__.py
            ├── base.py
            ├── jobs_count.py
            └── keyword_statistic.py
        ├── handlers
            ├── __init__.py
            ├── base.py
            └── keyword_statistics.py
        ├── static
            ├── __init__.py
            ├── bootstrap
            │   ├── css
            │   │   ├── bootstrap-theme.css
            │   │   ├── bootstrap-theme.css.map
            │   │   ├── bootstrap-theme.min.css
            │   │   ├── bootstrap-theme.min.css.map
            │   │   ├── bootstrap.css
            │   │   ├── bootstrap.css.map
            │   │   ├── bootstrap.min.css
            │   │   └── bootstrap.min.css.map
            │   ├── fonts
            │   │   ├── glyphicons-halflings-regular.eot
            │   │   ├── glyphicons-halflings-regular.svg
            │   │   ├── glyphicons-halflings-regular.ttf
            │   │   ├── glyphicons-halflings-regular.woff
            │   │   └── glyphicons-halflings-regular.woff2
            │   └── js
            │   │   ├── bootstrap.js
            │   │   ├── bootstrap.min.js
            │   │   └── npm.js
            ├── css
            │   └── mystyle.css
            ├── img
            │   └── favicon.ico
            └── js
            │   ├── echarts.js
            │   ├── echarts.min.js
            │   └── jquery.min.js
        ├── templates
            ├── 404.html
            ├── 500.html
            ├── base.html
            ├── city-jobs-count-chart-module.html
            ├── education-chart-module.html
            ├── finance-stage-chart-module.html
            ├── pagination-module.html
            ├── per-day-jobs-count-chart-module.html
            ├── salary-chart-module.html
            ├── statistics.html
            └── work-year-chart-module.html
        └── urls.py


/.gitattributes:
--------------------------------------------------------------------------------
1 | *.js linguist-language=python
2 | *.css linguist-language=python
3 | *.html linguist-language=python
4 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
  1 | # Byte-compiled / optimized / DLL files
  2 | __pycache__/
  3 | *.py[cod]
  4 | *$py.class
  5 | 
  6 | # C extensions
  7 | *.so
  8 | 
  9 | # Distribution / packaging
 10 | .Python
 11 | env/
 12 | build/
 13 | develop-eggs/
 14 | dist/
 15 | downloads/
 16 | eggs/
 17 | .eggs/
 18 | lib/
 19 | lib64/
 20 | parts/
 21 | sdist/
 22 | var/
 23 | *.egg-info/
 24 | .installed.cfg
 25 | *.egg
 26 | 
 27 | # PyInstaller
 28 | #  Usually these files are written by a python script from a template
 29 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 30 | *.manifest
 31 | *.spec
 32 | 
 33 | # Installer logs
 34 | pip-log.txt
 35 | pip-delete-this-directory.txt
 36 | 
 37 | # Unit test / coverage reports
 38 | htmlcov/
 39 | .tox/
 40 | .coverage
 41 | .coverage.*
 42 | .cache
 43 | nosetests.xml
 44 | coverage.xml
 45 | *,cover
 46 | .hypothesis/
 47 | 
 48 | # Translations
 49 | *.mo
 50 | *.pot
 51 | 
 52 | # Django stuff:
 53 | *.log
 54 | local_settings.py
 55 | 
 56 | # Flask stuff:
 57 | instance/
 58 | .webassets-cache
 59 | 
 60 | # Scrapy stuff:
 61 | .scrapy
 62 | 
 63 | # Sphinx documentation
 64 | docs/_build/
 65 | 
 66 | # PyBuilder
 67 | target/
 68 | 
 69 | # IPython Notebook
 70 | .ipynb_checkpoints
 71 | 
 72 | # pyenv
 73 | .python-version
 74 | 
 75 | # celery beat schedule file
 76 | celerybeat-schedule
 77 | 
 78 | # dotenv
 79 | .env
 80 | 
 81 | # virtualenv
 82 | venv/
 83 | ENV/
 84 | 
 85 | # Spyder project settings
 86 | .spyderproject
 87 | 
 88 | # Rope project settings
 89 | .ropeproject
 90 | .idea/
 91 | bin/
 92 | spider_log.txt
 93 | dump.rdb
 94 | .DS_Store
 95 | cover/
 96 | celerybeat.pid
 97 | oj.py
 98 | /webspider/log
 99 | /webspider/security_constants.py
100 | celerybeat-schedule
101 | cove
102 | nohup.out
103 | 


--------------------------------------------------------------------------------
/.landscape.yml:
--------------------------------------------------------------------------------
 1 | autodetect: yes
 2 | test-warnings: true
 3 | doc-warnings: true
 4 | strictness: veryhigh
 5 | max-line-length: 120
 6 | python-targets: 3
 7 | 
 8 | uses:
 9 |   - celery
10 | 
11 | ignore-paths:
12 |   - .git
13 |   - coverage
14 |   - env
15 |   - test
16 |   - webspider/web/templates
17 |   - webspider/web/static
18 | 
19 | pep8:
20 |   run: true
21 |   disable:
22 |     - W291
23 |     - E501
24 | 
25 | pyflakes:
26 |     run: true
27 | 
28 | inherits: [flake8]
29 | 
30 | requirements:
31 |   - requirements.txt
32 | 
33 | 


--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
 1 | language: python
 2 | 
 3 | sudo: false
 4 | 
 5 | python:
 6 |   - "3.6"
 7 | 
 8 | services:
 9 |   - mysql
10 |   - redis-server
11 | 
12 | before_install:
13 |   - mysql -e 'CREATE DATABASE spider CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci;'
14 | 
15 | install:
16 |   - make
17 | 
18 | script:
19 |   - make test
20 | 
21 | after_success:
22 |   - env/bin/codecov


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2017 
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
 1 | PYTHON:=$(shell which python3)
 2 | 
 3 | all: python
 4 | 
 5 | .PHONY: clean python test flake8
 6 | 
 7 | python: setup.py requirements.txt
 8 | 	pip install virtualenv
 9 | 	echo "\n Creating python virtual environment......\n"
10 | 	virtualenv -p $(PYTHON) env
11 | 	echo "\n Use python virtual environment to install required packages......\n"
12 | 	env/bin/pip install -e .
13 | 	mkdir -p webspider/log
14 | 	touch webspider/log/spider_log.txt
15 | 
16 | test: flake8
17 | 	env/bin/nosetests -vd
18 | 
19 | flake8:
20 | 	env/bin/flake8
21 | 
22 | clean:
23 | 	-rm -rf env cover *eggs *.egg-info *.egg webspider/log
24 | 	@find . -type f -name "*.py[co]" -delete
25 | 	@find . -type d -name "__pycache__" -delete
26 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | #
  2 | 
  3 | [![Build Status](https://travis-ci.org/JustForFunnnn/webspider.svg)](https://travis-ci.org/JustForFunnnn/webspider)
  4 | [![codecov](https://codecov.io/gh/JustForFunnnn/webspider/branch/master/graph/badge.svg)](https://codecov.io/gh/JustForFunnnn/webspider)
  5 | [![Code Health](https://landscape.io/github/JustForFunnnn/webspider/master/landscape.svg?style=flat)](https://landscape.io/github/JustForFunnnn/webspider/master)
  6 | [![License](https://img.shields.io/github/license/JustForFunnnn/webspider.svg)](https://github.com/JustForFunnnn/webspider/blob/master/LICENSE)
  7 | [![Python](https://img.shields.io/badge/python-3-ff69b4.svg)](https://github.com/JustForFunnnn/webspider)
  8 | 
  9 | | --       | --                                         |
 10 | | -------- | ------------------------------------------ |
 11 | | Version  | 1.0.1                                      |
 12 | | WebSite  | http://119.23.223.90:8000                  |
 13 | | Source   | https://github.com/JustForFunnnn/webspider |
 14 | | Keywords | `Python3`, `Tornado`, `Celery`, `Requests` |
 15 | 
 16 | ## Introduction
 17 | 
 18 | This project crawls job&company data from job-seeking websites, cleans the data, modelizes, converts, and stores it in the database. then use [Echarts](https://echarts.apache.org/en/index.html) and [Bootstrap](https://getbootstrap.com/) to build a front-end page to display the IT job statistics, to show the newest requirements and trends of the IT job market.
 19 | 
 20 | ## Demo
 21 | 
 22 | You can input the keyword you are interested in into the search box, such as "Python", then click the search button, and the statistics of this keyword will show.
 23 | 
 24 | * The first chart `Years of Working(工作年限要求`) is about the experience requirement of the `Python`, according to the data, the "3 ~ 5 years" is the most frequent requirement, then the following is `1 ~ 3 years`  ([Chart Source Code](https://github.com/JustForFunnnn/webspider/blob/8664fdd135d0ee4322169e484ba9a35bc46032bf/webspider/web/templates/work-year-chart-module.html))
 25 | 
 26 | * The second chart `Salary Range(薪水分布`) is about the salary of the `Python`, according to the data,  the "11k ~ 20k" is the most frequent salary provided, then the following is `21k ~ 35k`  ([Chart Source Code](https://github.com/JustForFunnnn/webspider/blob/8664fdd135d0ee4322169e484ba9a35bc46032bf/webspider/web/templates/salary-chart-module.html))
 27 | 
 28 | and we also got charts:
 29 | * [Education Requirement Data Chart](https://github.com/JustForFunnnn/webspider/blob/8664fdd135d0ee4322169e484ba9a35bc46032bf/webspider/web/templates/education-chart-module.html)
 30 | * [City Job Count Chart](https://github.com/JustForFunnnn/webspider/blob/8664fdd135d0ee4322169e484ba9a35bc46032bf/webspider/web/templates/city-jobs-count-chart-module.html)
 31 | * [Job Count Change Chart](https://github.com/JustForFunnnn/webspider/blob/8664fdd135d0ee4322169e484ba9a35bc46032bf/webspider/web/templates/per-day-jobs-count-chart-module.html)
 32 | * [Company Finance Stage Chart](https://github.com/JustForFunnnn/webspider/blob/8664fdd135d0ee4322169e484ba9a35bc46032bf/webspider/web/templates/finance-stage-chart-module.html)
 33 | 
 34 | Python Charts Example:
 35 | 
 36 | ![Alt text](job-chart.jpg)
 37 | 
 38 | ## Quick Start
 39 | > This tutorial is based on `Linux - Ubuntu`, for other systems, please find the corresponding command
 40 | 
 41 | * Clone the project
 42 | 
 43 | ```bash
 44 | git clone git@github.com:JustForFunnnn/webspider.git
 45 | ```
 46 | 
 47 | * Install `MySQL`, `Redis`, `Python3`
 48 | 
 49 | ```bash
 50 | # install Redis
 51 | apt-get install redis-server
 52 | 
 53 | # run Redis in background
 54 | nohup redis-server &
 55 | 
 56 | # install Python3
 57 | apt-get install python3
 58 | 
 59 | # install MySQL
 60 | apt-get install mysql-server
 61 | 
 62 | # start MySQL
 63 | sudo service mysql start
 64 | ```
 65 | 
 66 | * Config database and table
 67 | ```
 68 | # create database
 69 | CREATE DATABASE `spider` CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci;
 70 | ```
 71 | We still need to create the tables, copy the table definition SQL from `tests/schema.sql` and run it in MySQL
 72 | 
 73 | * Build project
 74 | ```bash
 75 | # after a successful build, some executable jobs will be generated under the path env/bin 
 76 | make
 77 | ```
 78 | 
 79 | * Run unit-test
 80 | ```bash
 81 | make test
 82 | ```
 83 | 
 84 | * Run code style check
 85 | ```bash
 86 | make flake8
 87 | ```
 88 | 
 89 | * Start web service
 90 | ```bash
 91 | env/bin/web
 92 | ```
 93 | 
 94 | * Stat crawler
 95 | ```bash
 96 | # run task scheduler/dispatcher
 97 | env/bin/celery_beat
 98 | # run celery worker for job data
 99 | env/bin/celery_lg_jobs_data_worker
100 | # run celery worker for job count
101 | env/bin/celery_lg_jobs_count_worker
102 | ```
103 | 
104 | * Other jobs
105 | ```bash
106 | # start crawl job count immediately
107 | env/bin/crawl_lg_jobs_count
108 | # start crawl job data immediately
109 | env/bin/crawl_lg_data
110 | # start celery monitoring
111 | env/bin/celery_flower
112 | ```
113 | 
114 | * Clean
115 | ```bash
116 | # clean the existing build result
117 | make clean
118 | ```
119 | 


--------------------------------------------------------------------------------
/job-chart.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JustForFunnnn/webspider/e41e94fb4f269ecab61cc19c76331c7667151cfb/job-chart.jpg


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
 1 | # All requirements put in this file
 2 | # You **MUST** specify the package version in this file
 3 | 
 4 | tornado == 4.5.3
 5 | gevent == 1.2.2
 6 | gunicorn == 19.7.1
 7 | lxml == 4.1.0
 8 | requests == 2.18.4
 9 | mysqlclient == 1.3.12
10 | sqlalchemy == 1.2.2
11 | redis == 2.10.6
12 | python-redis == 0.1.7
13 | retrying == 1.3.3
14 | celery == 4.0.2
15 | 
16 | flower == 0.9.2
17 | ipython == 6.2.1
18 | nose == 1.3.7
19 | coverage == 4.4.2
20 | flake8 == 3.5.0
21 | codecov == 2.0.15


--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
 1 | [flake8]
 2 | ignore = W291
 3 | max-line-length = 120
 4 | exclude =
 5 |     .git,
 6 |     eggs,
 7 |     env,
 8 |     tests
 9 | 
10 | [nosetests]
11 | logging-clear-handlers = 1
12 | with-coverage = 1
13 | cover-package = webspider
14 | cover-erase = 1
15 | logging-level = DEBUG
16 | cover-xml = 1
17 | cover-xml-file = coverage.xml
18 | 


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | # -*- coding: utf-8 -*-
 3 | import os
 4 | from setuptools import find_packages, setup
 5 | 
 6 | from webspider import __version__
 7 | 
 8 | # get the dependencies and installs
 9 | here = os.path.abspath(os.path.dirname(__file__))
10 | with open(os.path.join(here, 'requirements.txt')) as f:
11 |     all_requirements = f.read().split('\n')
12 | 
13 | setup(
14 |     name='webspider',
15 |     version=__version__,
16 |     license='MIT',
17 |     author='JustForFunnn',
18 |     author_email='',
19 |     description='web spider',
20 |     url='https://github.com/JustForFunnnn/webspider',
21 |     packages=find_packages(exclude=['tests']),
22 |     package_data={'webspider': ['README.md']},
23 |     zip_safe=False,
24 |     install_requires=all_requirements,
25 |     entry_points={
26 |         'console_scripts': [
27 |             'web = webspider.web.app:main',
28 |             'production_web = webspider.quickly_cmd:run_web_app_by_gunicorn',
29 |             'crawl_lg_data = webspider.tasks.actor.lg_data:crawl_lg_data_task',
30 |             'crawl_lg_jobs_count = webspider.tasks.actor.lg_jobs_count:crawl_lg_jobs_count_task',
31 |             # beat
32 |             'celery_beat = webspider.quickly_cmd:run_celery_beat',
33 |             'celery_flower = webspider.quickly_cmd.py:run_celery_flower',
34 |             # worker
35 |             'celery_default_worker = webspider.quickly_cmd:run_celery_default_worker',
36 |             'celery_lg_data_worker = webspider.quickly_cmd:run_celery_lg_data_worker',
37 |             'celery_lg_jobs_data_worker = webspider.quickly_cmd:run_celery_lg_jobs_data_worker',
38 |             'celery_lg_jobs_count_worker = webspider.quickly_cmd:run_celery_lg_jobs_count_worker',
39 |         ],
40 |     }
41 | )
42 | 


--------------------------------------------------------------------------------
/tests/__init__.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | # -*- coding: utf-8 -*-
 3 | import os
 4 | from unittest import TestCase
 5 | 
 6 | from webspider.utils.sql import get_session
 7 | from tests.util import execute_sql_file, get_current_database_name
 8 | 
 9 | here_dir = os.path.dirname(__file__)
10 | 
11 | 
12 | class BaseTestCase(TestCase):
13 |     session = get_session()
14 | 
15 |     def setUp(self):
16 |         test_db_name = 'test_spider'
17 |         # 清除测试数据库
18 |         self.session.execute("DROP DATABASE IF EXISTS {db_name};".format(db_name=test_db_name))
19 |         # 创建测试数据库
20 |         self.session.execute("CREATE DATABASE {db_name} CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci;".format(
21 |             db_name=test_db_name))
22 |         # 指定测试数据库 test_spider
23 |         self.session.execute("USE {db_name};".format(db_name=test_db_name))
24 | 
25 |         path = os.path.dirname(__file__)
26 |         # 创建表
27 |         execute_sql_file(
28 |             file_paths=[os.path.join(path, "schema.sql"), ],
29 |             db_session=self.session,
30 |             predictive_db_name=test_db_name
31 |         )
32 |         fixture_path = os.path.join(path, 'fixture')
33 |         # 装载表数据
34 |         fixture_file_paths = [os.path.join(fixture_path, file) for file in os.listdir(fixture_path)]
35 |         execute_sql_file(
36 |             file_paths=fixture_file_paths,
37 |             db_session=self.session,
38 |             predictive_db_name=test_db_name
39 |         )
40 |         assert get_current_database_name(self.session) == test_db_name
41 | 
42 |     def teardown(self):
43 |         # 测试结束 销毁测试数据库
44 |         self.session.execute('DROP DATABASE test_spider;')
45 | 


--------------------------------------------------------------------------------
/tests/base.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | # -*- coding: utf-8 -*-
 3 | import os
 4 | from unittest import TestCase
 5 | 
 6 | from webspider.utils.sql import get_session
 7 | from tests.util import create_test_db, drop_test_db
 8 | 
 9 | here_dir = os.path.dirname(__file__)
10 | 
11 | 
12 | class BaseTestCase(TestCase):
13 |     session = get_session()
14 | 
15 |     def setUp(self):
16 |         create_test_db(session=self.session)
17 | 
18 |     def tearDown(self):
19 |         # 测试结束 销毁测试数据库
20 |         drop_test_db(session=self.session)
21 | 


--------------------------------------------------------------------------------
/tests/fixture/city.sql:
--------------------------------------------------------------------------------
1 | INSERT INTO `test_spider`.`city`(`id`, `name`)
2 | VALUE (2, '北京'),
3 |       (3, '上海'),
4 |       (4, '广州');


--------------------------------------------------------------------------------
/tests/fixture/company.sql:
--------------------------------------------------------------------------------
1 | INSERT INTO `company` (`id`, `lg_company_id`, `city_id`, `shortname`, `fullname`, `finance_stage`, `size`, `address`, `features`, `process_rate`, `introduce`, `advantage`, `created_at`, `updated_at`)
2 | VALUES
3 | 	(1, 168219, 2, '贝壳金控', '贝壳金控控股集团有限公司', 1, 5, '2017年5月，贝壳正式独立运作，是国内首家聚焦于居住领域的消费金融服务平台','不知道', 100, '我是简介', '[\"\\u4e13\\u9879\\u5956\\u91d1\", \"\\u5e74\\u7ec8\\u5206\\u7ea2\", \"\\u5b9a\\u671f\\u4f53\\u68c0\", \"\\u7ee9\\u6548\\u5956\\u91d1\", \"\\u5348\\u9910\\u8865\\u52a9\", \"\\u4ea4\\u901a\\u8865\\u52a9\"]', '2018-01-28 15:26:13', '2018-01-28 15:35:19'),
4 | 	(2, 142800, 2, '猫眼电影', '北京猫眼文化传媒有限公司', 1, 5, '北京朝阳区望京东路4号恒电大厦B座8层', '一网打尽好电影', 100, '猫眼电影简介\n猫眼电影（网站经营者：北京猫眼文化传媒有限公司）是美团。。。', '[]', '2018-01-28 15:26:13', '2018-01-28 15:35:19'),
5 | 	(3, 107435, 2, '熊猫直播', '上海熊猫互娱文化有限公司北京分公司', 3, 5, '北京朝阳区望京soho塔3，A座18层', '王思聪任CEO的视频直播平台', 100, '熊猫直播成立于2015年7月，由王思聪先生亲任CEO，并聚集了国内众多一线视频主播资源。', '[\"\\u5e74\\u5e95\\u53cc\\u85aa\", \"\\u5e26\\u85aa\\u5e74\\u5047\", \"\\u5348\\u9910\\u8865\\u52a9\", \"\\u7ee9\\u6548\\u5956\\u91d1\", \"\\u80a1\\u7968\\u671f\\u6743\"]', '2018-01-28 15:26:13', '2018-01-28 15:35:19');
6 | 


--------------------------------------------------------------------------------
/tests/fixture/industry.sql:
--------------------------------------------------------------------------------
1 | INSERT INTO `test_spider`.`industry` (`id`, `name`, `created_at`, `updated_at`)
2 | VALUES
3 | 	(1000001, '开网吧', '2018-01-29 19:07:52', '2018-01-29 19:07:52'),
4 | 	(1000002, '开餐厅', '2018-01-29 19:07:52', '2018-01-29 19:07:52');
5 | 


--------------------------------------------------------------------------------
/tests/fixture/job.sql:
--------------------------------------------------------------------------------
1 | INSERT INTO `test_spider`.`job` (`id`, `lg_job_id`, `city_id`, `company_id`, `title`, `work_year`, `department`, `salary`, `education`, `nature`, `description`, `advantage`, `created_at`, `updated_at`)
2 | VALUES
3 | 	(1, 10001, 2, 1, '高级前端开发工程师', 5, '贝壳金控交易研发部-交易前端组招聘', '15k-30k', 3, 1, '职位介绍A', '15薪,工作居住证,六险一金,双休', '2018-01-29 19:11:33', '2018-01-30 17:22:30'),
4 | 	(2, 10002, 4, 2, '前端开发工程师', 6, '贝壳金控技术产品中心招聘', '20k-40k', 3, 1, '职位介绍B', '高薪,大牛,六险一金,成长空间大', '2018-01-29 19:11:33', '2018-01-30 17:22:30'),
5 | 	(3, 10003, 4, 3, 'DBA工程师', 5, '贝壳金控运维技术部招聘', '15k-30k', 3, 1, '职位介绍C', '大牛,高薪,成长空间大,团队氛围好', '2018-01-29 19:11:33', '2018-01-30 17:22:30');
6 | 


--------------------------------------------------------------------------------
/tests/fixture/job_keyword.sql:
--------------------------------------------------------------------------------
1 | INSERT INTO `job_keyword` (`id`, `job_id`, `keyword_id`, `created_at`, `updated_at`)
2 | VALUES
3 | 	(1, 1, 100, '2018-01-28 15:36:12', '2018-01-28 15:36:12'),
4 | 	(2, 1, 101, '2018-01-28 15:36:12', '2018-01-28 15:36:12'),
5 | 	(3, 2, 100, '2018-01-28 15:36:12', '2018-01-28 15:36:12'),
6 | 	(4, 2, 101, '2018-01-28 15:36:12', '2018-01-28 15:36:12'),
7 | 	(5, 2, 102, '2018-01-28 15:36:12', '2018-01-28 15:36:12'),
8 | 	(6, 3, 100, '2018-01-28 15:36:12', '2018-01-28 15:36:12');
9 | 


--------------------------------------------------------------------------------
/tests/fixture/jobs_count.sql:
--------------------------------------------------------------------------------
1 | INSERT INTO `jobs_count` (`id`, `date`, `keyword_id`, `all_city`, `beijing`, `guangzhou`, `shenzhen`,
2 |                           `shanghai`, `hangzhou`, `chengdu`, `created_at`, `updated_at`)
3 | VALUES
4 |   (1, 20180128, 100, 576, 198, 35, 93, 80, 41, 26, '2018-01-28 17:01:04', '2018-01-28 17:01:04'),
5 |   (2, 20180129, 100, 580, 200, 36, 100, 82, 44, 30, '2018-01-28 17:01:04', '2018-01-28 17:01:04');
6 | 


--------------------------------------------------------------------------------
/tests/fixture/keyword.sql:
--------------------------------------------------------------------------------
1 | INSERT INTO `test_spider`.`keyword` (`id`, `name`)
2 | VALUES
3 |   (100, 'python'),
4 |   (101, 'java'),
5 |   (102, 'qt'),
6 |   (103, '前端');
7 | 


--------------------------------------------------------------------------------
/tests/fixture/keyword_statistic.sql:
--------------------------------------------------------------------------------
 1 | INSERT INTO `keyword_statistic` (`id`, `keyword_id`, `educations`, `city_jobs_count`, `salary`, `financing_stage`, `work_years`, `created_at`, `updated_at`)
 2 | VALUES
 3 | 	(1, 100,
 4 | 	'{"不限": 1, "大专": 2, "本科": 3, "本科": 4, "硕士": 5, "博士": 6, "unknown": 7}',
 5 |   '{"北京": 8, "深圳": 9, "广州": 10}',
 6 |   '{"10k以下": 11, "11k-20k": 12, "21k-35k": 13, "36k-60k": 14, "61k以上": 15}',
 7 |   '{"未融资": 16, "天使轮": 17, "A轮": 18, "B轮": 19, "C轮": 20, "D轮及以上": 21, "上市公司": 22, "不需要融资": 23, "unknown": 24}',
 8 | 	'{"不限": 25, "应届毕业生": 26, "1年以下": 27, "1-3年": 28, "3-5年": 29, "5-10年": 30, "10年以上": 31, "unknown": 32}',
 9 | 	'2018-02-01 19:01:44', '2018-02-05 01:01:48');
10 | 


--------------------------------------------------------------------------------
/tests/schema.sql:
--------------------------------------------------------------------------------
  1 | -- CREATE DATABASE `spider` CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci;
  2 | 
  3 | CREATE TABLE IF NOT EXISTS `job` (
  4 |   `id`          INT UNSIGNED NOT NULL PRIMARY KEY AUTO_INCREMENT,
  5 |   `lg_job_id`INT UNSIGNED NOT NULL COMMENT '所使用的职位id',
  6 |   `city_id`     INT UNSIGNED NOT NULL COMMENT '城市 id',
  7 |   `company_id`  INT UNSIGNED NOT NULL COMMENT '公司 id',
  8 |   `title`       VARCHAR(64) NOT NULL COMMENT '职位标题',
  9 |   `work_year`   TINYINT NOT NULL DEFAULT 0 COMMENT '工作年限要求',
 10 |   `department`  VARCHAR(64) NOT NULL DEFAULT '' COMMENT '招聘部门',
 11 |   `salary`      VARCHAR(32) NOT NULL DEFAULT '' COMMENT '薪水',
 12 |   `education`   TINYINT NOT NULL DEFAULT 0 COMMENT '教育背景要求',
 13 |   `nature`      TINYINT NOT NULL DEFAULT 0 COMMENT '工作性质',
 14 |   `description` VARCHAR(2048) NOT NULL DEFAULT '' COMMENT '额外描述',
 15 |   `advantage`   VARCHAR(256) NOT NULL DEFAULT '' COMMENT '职位优势',
 16 |   `created_at`  TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP COMMENT '创建时间',
 17 |   `updated_at`  TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP COMMENT '最后更新时间',
 18 |   UNIQUE KEY (`lg_job_id`),
 19 |   KEY `idx_company_id` (`company_id`)
 20 | ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COMMENT='职位表';
 21 | 
 22 | 
 23 | CREATE TABLE IF NOT EXISTS `company` (
 24 |   `id`          INT UNSIGNED NOT NULL PRIMARY KEY AUTO_INCREMENT,
 25 |   `lg_company_id` INT UNSIGNED NOT NULL COMMENT '所使用的公司id',
 26 |   `city_id`     INT UNSIGNED NOT NULL COMMENT '所在城市 id',
 27 |   `shortname`   VARCHAR(64) NOT NULL COMMENT '公司名称',
 28 |   `fullname`    VARCHAR(128) NOT NULL COMMENT '公司全称',
 29 |   `finance_stage` TINYINT NOT NULL DEFAULT 0 COMMENT '融资阶段',
 30 |   `size`        TINYINT NOT NULL DEFAULT 0 COMMENT '公司规模',
 31 |   `address`     VARCHAR(128) NOT NULL DEFAULT '' COMMENT '公司地址',
 32 |   `features`    VARCHAR(128) NOT NULL DEFAULT '' COMMENT '公司特点',
 33 |   `process_rate` TINYINT NOT NULL DEFAULT 0  COMMENT '简历处理率',
 34 |   `introduce`   VARCHAR(2048) NOT NULL DEFAULT '' COMMENT '公司简介',
 35 |   `advantage`   VARCHAR(256) NOT NULL DEFAULT '' COMMENT '公司优势',
 36 |   `created_at`  TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP COMMENT '创建时间',
 37 |   `updated_at`  TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP COMMENT '最后更新时间',
 38 |   UNIQUE KEY (`lg_company_id`),
 39 |   KEY `idx_city_id` (`city_id`)
 40 | ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COMMENT='公司表';
 41 | 
 42 | 
 43 | CREATE TABLE IF NOT EXISTS `city` (
 44 |   `id`          INT UNSIGNED NOT NULL PRIMARY KEY AUTO_INCREMENT,
 45 |   `name`        VARCHAR(64) NOT NULL COMMENT '城市名',
 46 |   `created_at`  TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP COMMENT '创建时间',
 47 |   `updated_at`  TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP COMMENT '最后更新时间',
 48 |   UNIQUE KEY (`name`)
 49 | ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COMMENT='城市表';
 50 | 
 51 | 
 52 | CREATE TABLE IF NOT EXISTS `industry` (
 53 |   `id`          INT UNSIGNED NOT NULL PRIMARY KEY AUTO_INCREMENT,
 54 |   `name`        VARCHAR(64) NOT NULL COMMENT '行业名称',
 55 |   `created_at`  TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP COMMENT '创建时间',
 56 |   `updated_at`  TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP COMMENT '最后更新时间',
 57 |   UNIQUE KEY (`name`)
 58 | ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4  COMMENT='行业表';
 59 | 
 60 | 
 61 | CREATE TABLE IF NOT EXISTS `company_industry` (
 62 |   `id`          INT UNSIGNED NOT NULL PRIMARY KEY AUTO_INCREMENT,
 63 |   `company_id`  INT UNSIGNED NOT NULL COMMENT '公司 id',
 64 |   `industry_id` INT UNSIGNED NOT NULL COMMENT '行业 id',
 65 |   `created_at`  TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP COMMENT '创建时间',
 66 |   `updated_at`  TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP COMMENT '最后更新时间',
 67 |   UNIQUE KEY(`company_id`, `industry_id`)
 68 | ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COMMENT='公司行业表';
 69 | 
 70 | 
 71 | -- 预置行业类型
 72 | INSERT INTO `industry` (`id`, `name`)
 73 | VALUES
 74 | (24,'移动互联网'),
 75 | (25,'电子商务'),
 76 | (26,'社交网络'),
 77 | (27,'企业服务'),
 78 | (28,'O2O'),
 79 | (29,'教育'),
 80 | (31,'游戏'),
 81 | (32,'旅游'),
 82 | (33,'金融'),
 83 | (34,'医疗健康'),
 84 | (35,'生活服务'),
 85 | (38,'信息安全'),
 86 | (41,'数据服务'),
 87 | (43,'广告营销'),
 88 | (45,'文化娱乐'),
 89 | (47,'硬件'),
 90 | (48,'分类信息'),
 91 | (49,'招聘'),
 92 | (10594,'其他');
 93 | 
 94 | 
 95 | CREATE TABLE IF NOT EXISTS `keyword` (
 96 |   `id`          INT UNSIGNED NOT NULL PRIMARY KEY AUTO_INCREMENT,
 97 |   `name`        VARCHAR(64) NOT NULL COMMENT '关键词名称',
 98 |   `created_at`  TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP COMMENT '创建时间',
 99 |   `updated_at`  TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP COMMENT '最后更新时间',
100 |   UNIQUE KEY (`name`)
101 | ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COMMENT='关键词';
102 | 
103 | 
104 | CREATE TABLE IF NOT EXISTS `job_keyword` (
105 |   `id`          INT UNSIGNED NOT NULL PRIMARY KEY AUTO_INCREMENT,
106 |   `job_id`      INT NOT NULL COMMENT '工作 id',
107 |   `keyword_id`  INT NOT NULL COMMENT '关键词 id',
108 |   `created_at`  TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP COMMENT '创建时间',
109 |   `updated_at`  TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP COMMENT '最后更新时间',
110 |   UNIQUE KEY(`job_id`, `keyword_id`),
111 |   KEY `idx_keyword_id` (`keyword_id`)
112 | ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COMMENT='职位关键词';
113 | 
114 | 
115 | CREATE TABLE IF NOT EXISTS `jobs_count` (
116 |   `id`          INT UNSIGNED NOT NULL PRIMARY KEY AUTO_INCREMENT,
117 |   `date`        INT NOT NULL COMMENT '日期',
118 |   `keyword_id`  INT NOT NULL COMMENT '关键词 id',
119 |   `all_city`    INT NOT NULL DEFAULT 0 COMMENT '全国岗位数量',
120 |   `beijing`     INT NOT NULL DEFAULT 0 COMMENT '北京岗位数量',
121 |   `guangzhou`   INT NOT NULL DEFAULT 0 COMMENT '广州岗位数量',
122 |   `shenzhen`    INT NOT NULL DEFAULT 0 COMMENT '深圳岗位数量',
123 |   `shanghai`    INT NOT NULL DEFAULT 0 COMMENT '上海岗位数量',
124 |   `hangzhou`    INT NOT NULL DEFAULT 0 COMMENT '杭州岗位数量',
125 |   `chengdu`     INT NOT NULL DEFAULT 0 COMMENT '成都岗位数量',
126 |   `created_at`  TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP COMMENT '创建时间',
127 |   `updated_at`  TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP COMMENT '最后更新时间',
128 |   UNIQUE KEY(`date`, `keyword_id`),
129 |   KEY `idx_keyword_id` (`keyword_id`)
130 | ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COMMENT='职位每日数量统计';
131 | 
132 | 
133 | CREATE TABLE IF NOT EXISTS `keyword_statistic` (
134 |   `id`          INT UNSIGNED NOT NULL PRIMARY KEY AUTO_INCREMENT,
135 |   `keyword_id`  INT UNSIGNED NOT NULL COMMENT '关键词 id',
136 |   `educations`  VARCHAR(2048) NOT NULL DEFAULT '' COMMENT '教育背景要求情况',
137 |   `city_jobs_count`VARCHAR(2048) NOT NULL DEFAULT '' COMMENT '城市职位数量情况',
138 |   `salary`      VARCHAR(2048) NOT NULL DEFAULT '' COMMENT '薪水分布情况',
139 |   `financing_stage` VARCHAR(2048) NOT NULL DEFAULT '' COMMENT '公司融资阶段情况',
140 |   `work_years`  VARCHAR(2048) NOT NULL DEFAULT '' COMMENT '要求的工作年限情况',
141 |   `created_at`  TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP COMMENT '创建时间',
142 |   `updated_at`  TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP COMMENT '最后更新时间',
143 |   UNIQUE KEY(`keyword_id`)
144 | ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COMMENT='关键词分析表' COMMENT='关键词分析';
145 | 


--------------------------------------------------------------------------------
/tests/test_controllers/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JustForFunnnn/webspider/e41e94fb4f269ecab61cc19c76331c7667151cfb/tests/test_controllers/__init__.py


--------------------------------------------------------------------------------
/tests/test_controllers/test_city_ctl.py:
--------------------------------------------------------------------------------
 1 | # coding=utf-8
 2 | from tests import BaseTestCase
 3 | from webspider.controllers import city_ctl
 4 | from webspider.models import CityModel
 5 | 
 6 | 
 7 | class TestCityController(BaseTestCase):
 8 |     def test_get_city_id_by_name(self):
 9 |         city_id = city_ctl.get_city_id_by_name(name='北京')
10 |         self.assertEqual(city_id, 2)
11 | 
12 |         with self.assertRaises(ValueError):
13 |             city_ctl.get_city_id_by_name(name='通利福尼亚')
14 | 
15 |     def test_insert_city_if_not_exist(self):
16 |         city_id = city_ctl.insert_city_if_not_exist('湛江')
17 |         self.assertTrue(city_id > 0)
18 |         city = CityModel.get_by_pk(pk=city_id)
19 |         self.assertEqual(city.name, '湛江')
20 | 
21 |         self.assertIsNone(city_ctl.insert_city_if_not_exist('湛江'))
22 | 
23 |     def test_get_city_name_dict(self):
24 |         city_name_dict = city_ctl.get_city_name_dict()
25 |         self.assertDictEqual(city_name_dict, {'北京': 2, '上海': 3, '广州': 4})
26 | 


--------------------------------------------------------------------------------
/tests/test_controllers/test_industry_ctl.py:
--------------------------------------------------------------------------------
 1 | # coding=utf-8
 2 | from tests import BaseTestCase
 3 | from webspider.controllers import industry_ctl
 4 | from webspider.models import IndustryModel
 5 | 
 6 | 
 7 | class TestIndustryController(BaseTestCase):
 8 |     def test_get_industry_id_by_name(self):
 9 |         industry_id = industry_ctl.get_industry_id_by_name(name='开网吧')
10 |         self.assertEqual(industry_id, 1000001)
11 | 
12 |         with self.assertRaises(ValueError):
13 |             industry_ctl.get_industry_id_by_name(name='开飞机')
14 | 
15 |     def test_insert_industry_if_not_exist(self):
16 |         industry_name = '开飞机'
17 |         industry_id = industry_ctl.insert_industry_if_not_exist(industry_name)
18 |         self.assertTrue(industry_id > 0)
19 |         industry = IndustryModel.get_by_pk(pk=industry_id)
20 |         self.assertEqual(industry.name, industry_name)
21 | 
22 |         self.assertIsNone(industry_ctl.insert_industry_if_not_exist(industry_name))
23 | 


--------------------------------------------------------------------------------
/tests/test_controllers/test_job_ctl.py:
--------------------------------------------------------------------------------
 1 | # coding=utf-8
 2 | from unittest import TestCase
 3 | 
 4 | from webspider.controllers import job_ctl
 5 | 
 6 | 
 7 | class TestJobController(TestCase):
 8 | 
 9 |     def test_get_salary_section(self):
10 |         salary = '15k-25k'
11 |         left, right = job_ctl.get_salary_section(salary)
12 |         self.assertEqual(left, 15)
13 |         self.assertEqual(right, 25)
14 | 
15 |         salary = '15k以上'
16 |         left, right = job_ctl.get_salary_section(salary)
17 |         self.assertEqual(left, 15)
18 |         self.assertEqual(right, 20)
19 | 
20 |         salary = '15k以下'
21 |         left, right = job_ctl.get_salary_section(salary)
22 |         self.assertEqual(left, 10)
23 |         self.assertEqual(right, 15)
24 | 
25 |         with self.assertRaises(ValueError):
26 |             left, right = job_ctl.get_salary_section('15k30k')
27 | 


--------------------------------------------------------------------------------
/tests/test_controllers/test_job_keyword_ctl.py:
--------------------------------------------------------------------------------
 1 | # coding=utf-8
 2 | from tests import BaseTestCase
 3 | from webspider.controllers import job_keyword_ctl
 4 | 
 5 | 
 6 | class TestJobKeywordController(BaseTestCase):
 7 |     def test_get_most_frequently_keyword_ids(self):
 8 |         keyword_ids = job_keyword_ctl.get_most_frequently_keyword_ids()
 9 |         self.assertEqual(keyword_ids, [100, 101, 102])
10 | 
11 |         keyword_ids = job_keyword_ctl.get_most_frequently_keyword_ids(limit=2)
12 |         self.assertEqual(keyword_ids, [100, 101])
13 | 
14 |         keyword_ids = job_keyword_ctl.get_most_frequently_keyword_ids(offset=1)
15 |         self.assertEqual(keyword_ids, [101, 102])
16 | 
17 |         keyword_ids = job_keyword_ctl.get_most_frequently_keyword_ids(limit=1, offset=1)
18 |         self.assertEqual(keyword_ids, [101])
19 | 


--------------------------------------------------------------------------------
/tests/test_controllers/test_keyword_ctl.py:
--------------------------------------------------------------------------------
 1 | # coding=utf-8
 2 | from tests import BaseTestCase
 3 | from webspider.controllers import keyword_ctl
 4 | from webspider.models import KeywordModel
 5 | 
 6 | 
 7 | class TestKeywordController(BaseTestCase):
 8 |     def test_get_keyword_name_by_id(self):
 9 |         keyword_name = keyword_ctl.get_keyword_name_by_id(keyword_id=100)
10 |         self.assertEqual(keyword_name, 'python')
11 | 
12 |         with self.assertRaises(ValueError):
13 |             keyword_ctl.get_keyword_name_by_id(keyword_id=10001)
14 | 
15 |     def test_get_keyword_id_by_name(self):
16 |         keyword_id = keyword_ctl.get_keyword_id_by_name(name='python')
17 |         self.assertEqual(keyword_id, 100)
18 | 
19 |         with self.assertRaises(ValueError):
20 |             keyword_ctl.get_keyword_id_by_name(name='go')
21 | 
22 |     def test_insert_keyword_if_not_exist(self):
23 |         keyword_name = 'C--'
24 |         keyword_id = keyword_ctl.insert_keyword_if_not_exist(keyword_name)
25 |         self.assertTrue(keyword_id > 0)
26 |         keyword = KeywordModel.get_by_pk(pk=keyword_id)
27 |         self.assertEqual(keyword.name, keyword_name)
28 | 
29 |         self.assertIsNone(keyword_ctl.insert_keyword_if_not_exist(keyword_name))
30 | 


--------------------------------------------------------------------------------
/tests/test_controllers/test_keyword_statistic_ctl.py:
--------------------------------------------------------------------------------
 1 | # coding=utf-8
 2 | from tests import BaseTestCase
 3 | from webspider.controllers import keyword_statistic_ctl
 4 | from webspider.models import JobModel
 5 | from webspider.constants import EDUCATION_REQUEST_DICT, WORK_YEARS_REQUEST_DICT
 6 | 
 7 | 
 8 | class TestKeywordStatisticController(BaseTestCase):
 9 |     def test_get_salary_statistic(self):
10 |         test_jobs_model = [JobModel(salary='5k-9k'), JobModel(salary='10-15k'), JobModel(salary='15k-20k'),
11 |                            JobModel(salary='16-18k'), JobModel(salary='20k-30k'), JobModel(salary='30k-35k'),
12 |                            JobModel(salary='20k以上'), JobModel(salary='60k-100k'), JobModel(salary='40k-42k')]
13 |         salary_statistic = keyword_statistic_ctl.get_salary_statistic(test_jobs_model)
14 |         self.assertDictEqual(salary_statistic, {
15 |             '10k及以下': 2,
16 |             '11k-20k': 5,
17 |             '21k-35k': 3,
18 |             '36k-60k': 2,
19 |             '61k以上': 1,
20 |         })
21 | 
22 |     def test_get_finance_stage_statistic(self):
23 |         test_jobs_model = [JobModel(company_id=1), JobModel(company_id=2), JobModel(company_id=3)]
24 |         finance_stage_statistic = keyword_statistic_ctl.get_finance_stage_statistic(test_jobs_model)
25 |         self.assertDictEqual(finance_stage_statistic, {
26 |             '未融资': 2,
27 |             'A轮': 1,
28 |         })
29 | 
30 |     def test_get_educations_statistic(self):
31 |         test_jobs_model = [JobModel(education=EDUCATION_REQUEST_DICT['大专']),
32 |                            JobModel(education=EDUCATION_REQUEST_DICT['本科']),
33 |                            JobModel(education=EDUCATION_REQUEST_DICT['本科'])]
34 |         educations_statistic = keyword_statistic_ctl.get_educations_statistic(test_jobs_model)
35 |         self.assertDictEqual(educations_statistic, {
36 |             '本科': 2,
37 |             '大专': 1,
38 |         })
39 | 
40 |     def test_get_work_years_statistic(self):
41 |         test_jobs_model = [JobModel(work_year=WORK_YEARS_REQUEST_DICT['应届毕业生']),
42 |                            JobModel(work_year=WORK_YEARS_REQUEST_DICT['应届毕业生']),
43 |                            JobModel(work_year=WORK_YEARS_REQUEST_DICT['1-3年'])]
44 |         work_years_statistic = keyword_statistic_ctl.get_work_years_statistic(test_jobs_model)
45 |         self.assertDictEqual(work_years_statistic, {
46 |             '应届毕业生': 2,
47 |             '1-3年': 1,
48 |         })
49 | 
50 |     def test_get_city_jobs_count_statistic(self):
51 |         test_jobs_model = [JobModel(city_id=2), JobModel(city_id=2), JobModel(city_id=2), JobModel(city_id=2),
52 |                            JobModel(city_id=3), JobModel(city_id=3), JobModel(city_id=3),
53 |                            JobModel(city_id=4), JobModel(city_id=4)]
54 |         sorted_city_jobs_count_statistic = keyword_statistic_ctl.get_city_jobs_count_statistic(test_jobs_model)
55 |         self.assertDictEqual(sorted_city_jobs_count_statistic, {
56 |             '北京': 4,
57 |             '上海': 3,
58 |             '广州': 2,
59 |         })
60 | 
61 |         sorted_city_jobs_count_statistic = keyword_statistic_ctl.get_city_jobs_count_statistic(test_jobs_model, 2)
62 |         self.assertDictEqual(sorted_city_jobs_count_statistic, {
63 |             '北京': 4,
64 |             '上海': 3
65 |         })
66 | 


--------------------------------------------------------------------------------
/tests/test_models/test_job.py:
--------------------------------------------------------------------------------
  1 | # coding=utf-8
  2 | from datetime import datetime
  3 | 
  4 | from sqlalchemy import and_
  5 | 
  6 | from tests import BaseTestCase
  7 | from webspider.models import JobModel, CityModel
  8 | 
  9 | test_job_dict = dict(id=1,
 10 |                      lg_job_id=10001,
 11 |                      city_id=2,
 12 |                      company_id=1,
 13 |                      title='高级前端开发工程师',
 14 |                      work_year=5,
 15 |                      department='贝壳金控交易研发部-交易前端组招聘',
 16 |                      salary='15k-30k',
 17 |                      education=3,
 18 |                      nature=1,
 19 |                      description='职位介绍A',
 20 |                      advantage='15薪,工作居住证,六险一金,双休',
 21 |                      created_at=datetime.strptime('2018-01-29 19:11:33', '%Y-%m-%d %H:%M:%S'),
 22 |                      updated_at=datetime.strptime('2018-01-30 17:22:30', '%Y-%m-%d %H:%M:%S'))
 23 | 
 24 | 
 25 | class TestJobModel(BaseTestCase):
 26 |     def test_pk_name(self):
 27 |         self.assertEqual(JobModel.pk_name, 'id')
 28 | 
 29 |     def test_pk(self):
 30 |         self.assertEqual(JobModel.pk, JobModel.id)
 31 | 
 32 |     def test_model_instance_to_dict(self):
 33 |         job = JobModel.get_by_pk(pk=1).dict()
 34 |         self.assertTrue(isinstance(job, dict))
 35 |         self.assertDictEqual(job, test_job_dict)
 36 | 
 37 |     def test_get_by_pk(self):
 38 |         job = JobModel.get_by_pk(pk=1)
 39 |         self.assertDictEqual(job.dict(), test_job_dict)
 40 | 
 41 |     def test_count(self):
 42 |         jobs_count = JobModel.count()
 43 |         self.assertEqual(jobs_count, 3)
 44 | 
 45 |         jobs_count = JobModel.count(filter_by={'city_id': 4})
 46 |         self.assertEqual(jobs_count, 2)
 47 | 
 48 |         jobs_count = JobModel.count(filter=(and_(JobModel.city_id == 4, JobModel.company_id == 3)))
 49 |         self.assertEqual(jobs_count, 1)
 50 | 
 51 |         jobs_count = JobModel.count(filter=(JobModel.id == 1))
 52 |         self.assertEqual(jobs_count, 1)
 53 | 
 54 |     def test_is_exist(self):
 55 |         is_exist = JobModel.is_exist(filter=(JobModel.id == 1))
 56 |         self.assertEqual(is_exist, True)
 57 | 
 58 |     def test_add(self):
 59 |         to_add_data_dict = dict(lg_job_id=10004,
 60 |                                 city_id=3,
 61 |                                 company_id=1,
 62 |                                 title='Python 开发工程师',
 63 |                                 work_year=5,
 64 |                                 department='吖吖项目组',
 65 |                                 salary='15k-35k',
 66 |                                 education=2,
 67 |                                 nature=1,
 68 |                                 description='职位介绍D',
 69 |                                 advantage='16薪,工作居住证,六十八险一金,双休', )
 70 |         job_id = JobModel.add(**to_add_data_dict)
 71 |         self.assertTrue(job_id > 0)
 72 |         job = JobModel.get_by_pk(pk=job_id)
 73 |         self.assertDictContainsSubset(to_add_data_dict, job.dict())
 74 | 
 75 |     def test_get_one(self):
 76 |         job = JobModel.get_one(filter_by={'id': 1})
 77 |         self.assertDictEqual(job.dict(), test_job_dict)
 78 | 
 79 |         job = JobModel.get_one(filter=(JobModel.id == 1))
 80 |         self.assertDictEqual(job.dict(), test_job_dict)
 81 | 
 82 |     def test_list(self):
 83 |         # test list
 84 |         jobs = JobModel.list()
 85 |         self.assertEqual(len(jobs), 3)
 86 |         self.assertDictEqual(jobs[0].dict(), test_job_dict)
 87 | 
 88 |         # test list limit
 89 |         jobs = JobModel.list(limit=1)
 90 |         self.assertEqual(len(jobs), 1)
 91 | 
 92 |         # test list offset
 93 |         jobs = JobModel.list(offset=1)
 94 |         self.assertEqual(len(jobs), 2)
 95 | 
 96 |         # test list filter_by
 97 |         jobs = JobModel.list(filter_by={'id': 1})
 98 |         self.assertEqual(len(jobs), 1)
 99 |         self.assertEqual(jobs[0].dict(), test_job_dict)
100 | 
101 |     def test_update(self):
102 |         init_job_data_dict = JobModel.get_by_pk(pk=1).dict()
103 |         to_update_data_dict = dict(title=u'后端吃饭工程师',
104 |                                    work_year=1,
105 |                                    city_id=1,
106 |                                    company_id=1,
107 |                                    department='飞天面条神教招聘',
108 |                                    salary='20k-32k',
109 |                                    education=2,
110 |                                    description=u'日常工作:吃饭！')
111 | 
112 |         affect_rows = JobModel.update(filter_by={'id': 1}, values=to_update_data_dict)
113 |         self.assertEqual(affect_rows, 1)
114 | 
115 |         # 更新后预期的结果
116 |         init_job_data_dict.update(**to_update_data_dict)
117 |         predictive_job_data_dict = init_job_data_dict
118 |         init_updated_at = init_job_data_dict.pop('updated_at')
119 | 
120 |         new_job_data_dict = JobModel.get_by_pk(pk=1).dict()
121 |         self.assertDictContainsSubset(predictive_job_data_dict, new_job_data_dict)
122 |         self.assertGreater(new_job_data_dict.updated_at, init_updated_at)
123 | 
124 |         # 其他记录不受影响
125 |         self.assertEqual(JobModel.get_by_pk(pk=2).title, u'前端开发工程师')
126 | 
127 |         # 批量更改
128 |         affect_rows = JobModel.update(filter_by={'city_id': 4}, values={'title': '测试'})
129 |         self.assertEqual(affect_rows, 2)
130 |         jobs = JobModel.list(filter_by={'city_id': 4})
131 |         self.assertTrue(all([job.title == u'测试' for job in jobs]))
132 | 
133 |     def test_update_by_pk(self):
134 |         affect_rows = JobModel.update_by_pk(pk=1, values={'title': '你好啊啊'})
135 |         self.assertEqual(affect_rows, 1)
136 |         self.assertEqual(JobModel.get_by_pk(pk=1).title, u'你好啊啊')
137 | 
138 |     def test_execute_sql_string(self):
139 |         job_rows = JobModel.execute_sql_string(
140 |             'SELECT id, title FROM job WHERE id = :id', {'id': 1})
141 |         self.assertEqual(len(job_rows), 1)
142 |         self.assertEqual(job_rows[0][0], 1)
143 |         self.assertEqual(job_rows[0][1], u'高级前端开发工程师')
144 | 
145 |         job_rows = JobModel.execute_sql_string('SELECT id, title FROM job')
146 |         self.assertEqual(len(job_rows), 3)
147 |         self.assertEqual(job_rows[0][0], 1)
148 |         self.assertEqual(job_rows[0][1], u'高级前端开发工程师')
149 | 
150 |         affect_rows = JobModel.execute_sql_string(
151 |             "UPDATE job SET title = '测试' WHERE id = :id", {'id': 1})
152 |         self.assertEqual(affect_rows, 1)
153 |         job = JobModel.get_by_pk(pk=1)
154 |         self.assertEqual(job.title, u'测试')
155 | 
156 |     def test_batch_add(self):
157 |         # 插入了其他的类实例
158 |         init_jobs_count = JobModel.count()
159 |         model_instances = [CityModel(name='你好'),
160 |                            JobModel(title='招聘资深前端工程师', city_id=1, company_id=2, lg_job_id=100056),
161 |                            JobModel(title='招聘资深中端工程师', city_id=1, company_id=2, lg_job_id=100055), ]
162 | 
163 |         with self.assertRaises(ValueError):
164 |             JobModel.batch_add(model_instances)
165 | 
166 |         self.assertEqual(JobModel.count(), init_jobs_count)
167 | 
168 |         model_instances = [JobModel(title='招聘资深前端工程师', city_id=1, company_id=2, lg_job_id=100056),
169 |                            JobModel(title='招聘资深中端工程师', city_id=1, company_id=2, lg_job_id=100055), ]
170 | 
171 |         JobModel.batch_add(model_instances)
172 | 
173 |         self.assertEqual(JobModel.count(), init_jobs_count + 2)
174 | 


--------------------------------------------------------------------------------
/tests/test_utils/test_cache.py:
--------------------------------------------------------------------------------
  1 | # coding=utf-8
  2 | import time
  3 | from unittest import TestCase
  4 | 
  5 | from webspider.utils.cache import simple_cache, cache_clear, redis_instance
  6 | 
  7 | test_number = 0
  8 | 
  9 | 
 10 | @simple_cache()
 11 | def incr_then_return_test_number(keyword=None):
 12 |     global test_number
 13 |     test_number += 1
 14 |     return test_number
 15 | 
 16 | 
 17 | @simple_cache(ex=1)
 18 | def incr_then_return_test_number_with_ex(keyword=None):
 19 |     global test_number
 20 |     test_number += 1
 21 |     return test_number
 22 | 
 23 | 
 24 | class TestClass(object):
 25 |     def __init__(self, name):
 26 |         self.name = name
 27 | 
 28 | 
 29 | class TestUtilCache(TestCase):
 30 | 
 31 |     def setUp(self):
 32 |         keys = redis_instance.keys('*incr_then_return_test_number*')
 33 |         if keys:
 34 |             redis_instance.delete(*keys)
 35 | 
 36 |         keys = redis_instance.keys('*return_what_you_put*')
 37 |         if keys:
 38 |             redis_instance.delete(*redis_instance.keys('*return_what_you_put*'))
 39 | 
 40 |     def test_simple_cache(self):
 41 |         """测试缓存"""
 42 |         global test_number
 43 |         test_number = 0
 44 |         self.assertEqual(1, incr_then_return_test_number('test'))
 45 |         self.assertEqual(1, incr_then_return_test_number('test'))
 46 |         self.assertEqual(2, incr_then_return_test_number('test_1'))
 47 |         self.assertEqual(2, incr_then_return_test_number('test_1'))
 48 |         self.assertEqual(3, incr_then_return_test_number('test_2'))
 49 | 
 50 |         with self.assertRaises(ValueError):
 51 |             incr_then_return_test_number(keyword='test')
 52 | 
 53 |     def test_simple_cache_with_ex(self):
 54 |         """测试设置了过期时间的缓存"""
 55 |         global test_number
 56 |         test_number = 0
 57 |         self.assertEqual(1, incr_then_return_test_number_with_ex('test'))
 58 |         self.assertEqual(1, incr_then_return_test_number_with_ex('test'))
 59 |         time.sleep(1.1)
 60 |         self.assertEqual(2, incr_then_return_test_number_with_ex('test'))
 61 | 
 62 |     def test_cache_clear(self):
 63 |         """测试清除缓存"""
 64 |         global test_number
 65 |         test_number = 0
 66 |         self.assertEqual(1, incr_then_return_test_number('test'))
 67 |         self.assertEqual(2, incr_then_return_test_number('test_1'))
 68 |         # 清除全部函数缓存
 69 |         cache_clear(incr_then_return_test_number)
 70 |         self.assertEqual(3, incr_then_return_test_number('test'))
 71 |         self.assertEqual(4, incr_then_return_test_number('test_1'))
 72 | 
 73 |         # 清除部分函数缓存
 74 |         cache_clear(incr_then_return_test_number, 'test_1')
 75 |         self.assertEqual(3, incr_then_return_test_number('test'))
 76 |         self.assertEqual(5, incr_then_return_test_number('test_1'))
 77 | 
 78 |     def test_cache_class_instance(self):
 79 |         """测试缓存类实例"""
 80 | 
 81 |         @simple_cache()
 82 |         def return_what_you_input(whatever):
 83 |             return whatever
 84 | 
 85 |         instance = TestClass('测试类实例')
 86 |         # cache class
 87 |         instance = return_what_you_input(instance)
 88 |         # get result from redis
 89 |         cache_instance = return_what_you_input(instance)
 90 |         self.assertTrue(instance is not cache_instance)
 91 |         self.assertTrue(isinstance(cache_instance, TestClass))
 92 |         self.assertEqual(cache_instance.name, '测试类实例')
 93 | 
 94 |     def tearDown(self):
 95 |         keys = redis_instance.keys('*incr_then_return_test_number*')
 96 |         if keys:
 97 |             redis_instance.delete(*redis_instance.keys('*incr_then_return_test_number*'))
 98 | 
 99 |         keys = redis_instance.keys('*return_what_you_put*')
100 |         if keys:
101 |             redis_instance.delete(*redis_instance.keys('*return_what_you_put*'))
102 | 


--------------------------------------------------------------------------------
/tests/test_utils/test_classproperty.py:
--------------------------------------------------------------------------------
 1 | # coding=utf-8
 2 | from unittest import TestCase
 3 | 
 4 | from webspider.utils.classproperty import classproperty
 5 | 
 6 | 
 7 | class TestClass(object):
 8 |     _name = '阿河'
 9 | 
10 |     @classproperty
11 |     def name(cls):
12 |         return cls._name
13 | 
14 | 
15 | class TestUtilClassProperty(TestCase):
16 |     def test_read_class_property(self):
17 |         self.assertEqual(TestClass.name, '阿河')
18 | 


--------------------------------------------------------------------------------
/tests/test_utils/test_common.py:
--------------------------------------------------------------------------------
 1 | # coding=utf-8
 2 | from unittest import TestCase
 3 | 
 4 | from webspider.utils.common import get_key_from_dict_by_value, get_field_statistics
 5 | 
 6 | 
 7 | class TestUtilCommon(TestCase):
 8 |     def test_get_key_from_dict_by_value(self):
 9 |         dictionary = {
10 |             '全国': 1,
11 |             '北京': 2,
12 |             '广州': 3,
13 |         }
14 |         key = get_key_from_dict_by_value(1, dictionary)
15 |         self.assertEqual(key, '全国')
16 | 
17 |         # no key
18 |         with self.assertRaises(ValueError):
19 |             get_key_from_dict_by_value(4, dictionary)
20 | 
21 |         dictionary = {
22 |             '全国': 1,
23 |             '北京': 1,
24 |             '广州': 3,
25 |         }
26 |         key = get_key_from_dict_by_value(3, dictionary)
27 |         self.assertEqual(key, '广州')
28 |         # multi key
29 |         with self.assertRaises(AttributeError):
30 |             get_key_from_dict_by_value(1, dictionary)
31 | 
32 |     def test_get_field_statistics(self):
33 |         statistics = get_field_statistics([0, 0, 0, 1, 1], {'男': 0, '女': 1, '不明': 2})
34 |         self.assertDictEqual(statistics, {'男': 3, '女': 2})
35 | 


--------------------------------------------------------------------------------
/tests/test_utils/test_convert.py:
--------------------------------------------------------------------------------
 1 | # coding=utf-8
 2 | from unittest import TestCase
 3 | 
 4 | from webspider.constants import WORK_YEARS_REQUEST_DICT, JOB_NATURE_DICT, COMPANY_SIZE_DICT
 5 | from webspider.utils.convert import convert_dict_field_to_constants, convert_field_to_constants
 6 | 
 7 | 
 8 | class TestUtilConvert(TestCase):
 9 |     def test_convert_dict_field_to_constants(self):
10 |         init_dict = {
11 |             'work_year': '应届毕业生',
12 |             'size': '没有人',
13 |             'nature': '全职',
14 |             'name': '沙师弟',
15 |             'id': 3,
16 |             'value': None
17 |         }
18 |         convert_dict_field_to_constants(init_dict)
19 |         self.assertDictEqual(init_dict, {
20 |             'work_year': WORK_YEARS_REQUEST_DICT['应届毕业生'],
21 |             'size': COMPANY_SIZE_DICT['unknown'],
22 |             'nature': JOB_NATURE_DICT['全职'],
23 |             'name': '沙师弟',
24 |             'id': 3,
25 |             'value': None
26 |         })
27 | 
28 |     def test_convert_field_to_constants(self):
29 |         constant_value = convert_field_to_constants(field_name='work_year', field_value='应届毕业生')
30 |         self.assertEqual(constant_value, WORK_YEARS_REQUEST_DICT['应届毕业生'])
31 | 
32 |         constant_value = convert_field_to_constants(field_name='work_year', field_value='家里蹲')
33 |         self.assertEqual(constant_value, WORK_YEARS_REQUEST_DICT['unknown'])
34 | 
35 |         with self.assertRaises(ValueError):
36 |             convert_field_to_constants(field_name='dinner', field_value='牛肉饭')
37 | 


--------------------------------------------------------------------------------
/tests/test_utils/test_http_tools.py:
--------------------------------------------------------------------------------
 1 | # coding=utf-8
 2 | from unittest import TestCase, mock
 3 | 
 4 | from webspider.utils.http_tools import generate_http_request_headers, requests_get, requests_post
 5 | 
 6 | 
 7 | class TestUtilHttpTools(TestCase):
 8 |     def test_generate_http_request_headers(self):
 9 |         header = generate_http_request_headers()
10 |         self.assertTrue(isinstance(header, dict))
11 | 
12 |         header = generate_http_request_headers(referer='https://www.zhihu.com')
13 |         self.assertEqual(header['Referer'], 'https://www.zhihu.com')
14 | 
15 |     @mock.patch('requests.get')
16 |     def test_request_get(self, mock_get):
17 |         mock_get.return_value = '200'
18 |         response = requests_get(url='https://baidu.com', need_sleep=False)
19 |         self.assertEqual(response, '200')
20 | 
21 |     @mock.patch('requests.post')
22 |     def test_request_post(self, mock_post):
23 |         mock_post.return_value = '200'
24 |         response = requests_post(url='https://baidu.com', need_sleep=False)
25 |         self.assertEqual(response, '200')
26 | 


--------------------------------------------------------------------------------
/tests/test_utils/test_pagination.py:
--------------------------------------------------------------------------------
 1 | # coding=utf-8
 2 | from unittest import TestCase
 3 | 
 4 | from webspider.utils.pagination import Pagination
 5 | 
 6 | 
 7 | class TestUtilPagination(TestCase):
 8 |     def test_pagination(self):
 9 |         pagination = Pagination(page=2, total=20, per_page=6)
10 |         self.assertEqual(pagination.pages, 4)
11 |         self.assertEqual(pagination.prev_num, 1)
12 |         self.assertEqual(pagination.has_prev, True)
13 |         self.assertEqual(pagination.next_num, 3)
14 |         self.assertEqual(pagination.has_next, True)
15 |         self.assertEqual([page for page in pagination.iter_pages], [1, 2, 3, 4])
16 | 
17 |     def test_pagination_no_pages(self):
18 |         pagination = Pagination(page=2, total=20, per_page=0)
19 |         self.assertEqual(pagination.pages, 0)
20 | 
21 |     def test_pagination_no_pre(self):
22 |         pagination = Pagination(page=1, total=20, per_page=6)
23 |         self.assertEqual(pagination.has_prev, False)
24 |         self.assertEqual(pagination.prev_num, None)
25 |         self.assertEqual(pagination.has_next, True)
26 |         self.assertEqual(pagination.next_num, 2)
27 | 
28 |     def test_pagination_no_next(self):
29 |         pagination = Pagination(page=4, total=20, per_page=6)
30 |         self.assertEqual(pagination.has_prev, True)
31 |         self.assertEqual(pagination.prev_num, 3)
32 |         self.assertEqual(pagination.has_next, False)
33 |         self.assertEqual(pagination.next_num, None)
34 | 


--------------------------------------------------------------------------------
/tests/test_utils/test_text.py:
--------------------------------------------------------------------------------
 1 | # coding=utf-8
 2 | from unittest import TestCase
 3 | 
 4 | from webspider.utils.text import to_plaintext
 5 | 
 6 | 
 7 | class TestUtilText(TestCase):
 8 |     def test_to_plaintext(self):
 9 |         init_text = '<br/>abcd \n '
10 |         self.assertEqual(to_plaintext(content=init_text, strip=False), 'abcd  ')
11 | 
12 |         init_text = '<br/>abcd  \n  '
13 |         self.assertEqual(to_plaintext(content=init_text, strip=True), 'abcd')
14 | 
15 |         init_text = '<br/>abcd  \n  '
16 |         self.assertEqual(to_plaintext(content=init_text, pattern=u'a|b', strip=False), '<r/>cd  \n  ')
17 | 


--------------------------------------------------------------------------------
/tests/test_utils/test_time_tools.py:
--------------------------------------------------------------------------------
 1 | # coding=utf-8
 2 | from datetime import datetime
 3 | from unittest import TestCase
 4 | 
 5 | from webspider.utils.time_tools import (datetime_to_timestamp, timestamp_to_datetime, timestamp_to_datetime_str)
 6 | 
 7 | 
 8 | class TestUtilTimeTools(TestCase):
 9 |     def test_datetime_to_timestamp(self):
10 |         datetime_obj = datetime(year=2017, month=5, day=10)
11 |         timestamp = datetime_to_timestamp(datetime_obj)
12 |         self.assertEqual(int(datetime_obj.timestamp()), timestamp)
13 | 
14 |     def test_timestamp_to_datetime(self):
15 |         timestamp = int(datetime(year=2017, month=5, day=10).timestamp())
16 |         datetime_obj = timestamp_to_datetime(timestamp=timestamp)
17 |         self.assertEqual(datetime_obj.isoformat(), '2017-05-10T00:00:00')
18 | 
19 |     def test_timestamp_to_datetime_str(self):
20 |         timestamp = int(datetime(year=2017, month=5, day=10).timestamp())
21 |         datetime_str = timestamp_to_datetime_str(ts=timestamp)
22 |         self.assertEqual(datetime_str, '2017-05-10')
23 | 
24 |         timestamp = int(datetime(year=2018, month=2, day=1, hour=19, minute=46, second=57).timestamp())
25 |         datetime_str = timestamp_to_datetime_str(ts=timestamp, time_format='%Y/%m/%d %H:%M:%S')
26 |         self.assertEqual(datetime_str, '2018/02/01 19:46:57')
27 | 


--------------------------------------------------------------------------------
/tests/test_web/base.py:
--------------------------------------------------------------------------------
 1 | # !/usr/bin/env python
 2 | # -*- coding: utf-8 -*-
 3 | import logging
 4 | from urllib.parse import urlencode
 5 | 
 6 | from tornado.testing import AsyncHTTPTestCase
 7 | from tornado.escape import json_encode, json_decode
 8 | 
 9 | from webspider.utils.sql import get_session
10 | from webspider.web.app import make_web_app
11 | from tests.util import create_test_db, drop_test_db
12 | 
13 | logger = logging.getLogger(__file__)
14 | 
15 | 
16 | class BaseHandlerTestCase(AsyncHTTPTestCase):
17 |     session = get_session()
18 | 
19 |     def setUp(self):
20 |         create_test_db(self.session)
21 |         super(BaseHandlerTestCase, self).setUp()
22 | 
23 |     def tearDown(self):
24 |         drop_test_db(self.session)
25 |         super(BaseHandlerTestCase, self).tearDown()
26 | 
27 |     def get_app(self):
28 |         return make_web_app()
29 | 
30 |     def request(self, method, url, headers=None, data=None, json=None, form=None, **kwargs):
31 |         if not headers:
32 |             headers = {}
33 | 
34 |         if json is not None:
35 |             headers['Content-Type'] = 'application/json'
36 |             data = json_encode(json)
37 | 
38 |         elif form is not None:
39 |             headers['Content-Type'] = 'application/x-www-form-urlencoded'
40 |             data = urlencode(form)
41 | 
42 |         response = self.fetch(url, method=method, headers=headers, body=data, allow_nonstandard_methods=True,
43 |                               **kwargs)
44 | 
45 |         if response.code / 100 != 2:
46 |             logger.error(response.body)
47 | 
48 |         return response
49 | 
50 |     def get(self, url, **kwargs):
51 |         return self.request(url=url, method="GET", **kwargs)
52 | 
53 |     def post(self, url, **kwargs):
54 |         return self.request(url=url, method="POST", **kwargs)
55 | 
56 |     def put(self, url, **kwargs):
57 |         return self.request(url=url, method="PUT", **kwargs)
58 | 
59 |     def fetch_json(self, path, **kwargs):
60 |         response = self.request('GET', path, **kwargs)
61 |         if response.code / 100 != 2:
62 |             raise ValueError('fetch json expect http code 2xx, got {}'.format(response.code))
63 |         return json_decode(response.body)
64 | 


--------------------------------------------------------------------------------
/tests/test_web/test_formatter.py:
--------------------------------------------------------------------------------
  1 | # coding=utf-8
  2 | from unittest import TestCase
  3 | 
  4 | from webspider.exceptions import DowngradeException
  5 | from webspider.web.formatter.base import Field, Downgrade, Formatter
  6 | 
  7 | """
  8 |     准备测试数据
  9 | """
 10 | 
 11 | 
 12 | class TestFormatter(Formatter):
 13 |     FIELDS = [
 14 |         Field('name', converter=lambda name: 'Mr.' + name),
 15 |         Field('value', converter=lambda value: int(value), downgrade=Downgrade(0)),
 16 |         Field('count'),
 17 |     ]
 18 | 
 19 | 
 20 | class TestModel(object):
 21 |     def __init__(self, name=None, value=None, count=None):
 22 |         self.name = name
 23 |         self.value = value
 24 |         self.count = count
 25 | 
 26 | 
 27 | class TestModelB(object):
 28 |     pass
 29 | 
 30 | 
 31 | formatter_mappings = {
 32 |     TestModel: TestFormatter
 33 | }
 34 | 
 35 | """end"""
 36 | 
 37 | 
 38 | class TestFormatter(TestCase):
 39 | 
 40 |     def test_register_formatter(self):
 41 |         Formatter.register_formatter(formatter_mappings)
 42 |         self.assertDictContainsSubset(formatter_mappings, Formatter._FORMATTER_MAPS)
 43 | 
 44 |     def test_get_formatter(self):
 45 |         Formatter.register_formatter(formatter_mappings)
 46 | 
 47 |         formatter = Formatter.get_formatter(TestModel)
 48 |         self.assertTrue(formatter is formatter_mappings[TestModel])
 49 | 
 50 |         formatter = Formatter.get_formatter(TestModel())
 51 |         self.assertTrue(formatter is formatter_mappings[TestModel])
 52 | 
 53 |         formatter = Formatter.get_formatter(TestModelB)
 54 |         self.assertTrue(formatter is None)
 55 | 
 56 |     def test_downgrade(self):
 57 |         # 测试降级
 58 |         Formatter.register_formatter(formatter_mappings)
 59 |         test_model = TestModel(name='He', value='10a', count=100)
 60 |         format_result = Formatter.format(test_model)
 61 |         self.assertDictEqual(format_result, {
 62 |             'name': 'Mr.He',
 63 |             'value': 0,
 64 |             'count': 100
 65 |         })
 66 | 
 67 |     def test_field(self):
 68 |         with self.assertRaises(DowngradeException):
 69 |             Field(name='hi', downgrade=0)
 70 | 
 71 |     def test_format(self):
 72 |         Formatter.register_formatter(formatter_mappings)
 73 | 
 74 |         test_model = TestModel(name='He', value='10', count=100)
 75 |         format_result = Formatter.format(test_model)
 76 |         self.assertDictEqual(format_result, {
 77 |             'name': 'Mr.He',
 78 |             'value': 10,
 79 |             'count': 100
 80 |         })
 81 | 
 82 |         # 测试 list format
 83 |         test_models = [TestModel(name='He', value='10', count=100),
 84 |                        TestModel(name='Wei', value='20', count=1)]
 85 |         format_result = Formatter.format(test_models)
 86 |         self.assertDictEqual(format_result[0], {
 87 |             'name': 'Mr.He',
 88 |             'value': 10,
 89 |             'count': 100
 90 |         })
 91 |         self.assertDictEqual(format_result[1], {
 92 |             'name': 'Mr.Wei',
 93 |             'value': 20,
 94 |             'count': 1
 95 |         })
 96 | 
 97 |         # 测试嵌套 format
 98 |         test_models = TestModel(name='He', value='10', count=TestModel(name='child', value='20', count=1))
 99 |         format_result = Formatter.format(test_models)
100 |         self.assertDictEqual(format_result, {
101 |             'name': 'Mr.He',
102 |             'value': 10,
103 |             'count': {
104 |                 'name': 'Mr.child',
105 |                 'value': 20,
106 |                 'count': 1,
107 |             }
108 |         })
109 | 


--------------------------------------------------------------------------------
/tests/test_web/test_keyword_statistic.py:
--------------------------------------------------------------------------------
 1 | # coding=utf-8
 2 | from datetime import datetime
 3 | 
 4 | from tornado.escape import json_decode
 5 | 
 6 | from tests.test_web.base import BaseHandlerTestCase
 7 | from webspider.utils.time_tools import datetime_to_timestamp
 8 | 
 9 | predictive_keyword_statistic_dict = {
10 |     'educations': {'不限': 1, '大专': 2, '本科': 4, '硕士': 5, '博士': 6, 'unknown': 7},
11 |     'city_jobs_count': {'北京': 8, '深圳': 9, '广州': 10},
12 |     'salary': {'10k以下': 11, '11k-20k': 12, '21k-35k': 13, '36k-60k': 14, '61k以上': 15},
13 |     'financing_stage': {'未融资': 16, '天使轮': 17, 'A轮': 18, 'B轮': 19, 'C轮': 20,
14 |                         'D轮及以上': 21, '上市公司': 22, '不需要融资': 23, 'unknown': 24},
15 |     'work_years': {'不限': 25, '应届毕业生': 26, '1年以下': 27, '1-3年': 28, '3-5年': 29,
16 |                    '5-10年': 30, '10年以上': 31, 'unknown': 32},
17 |     'per_day_jobs_count': [
18 |         {
19 |             'date': 20180128, 'all_city': 576, 'beijing': 198, 'guangzhou': 35, 'shenzhen': 93, 'shanghai': 80,
20 |             'hangzhou': 41, 'chengdu': 26,
21 |             'created_at': datetime_to_timestamp(datetime.strptime('2018-01-28 17:01:04', '%Y-%m-%d %H:%M:%S')),
22 |             'updated_at': datetime_to_timestamp(datetime.strptime('2018-01-28 17:01:04', '%Y-%m-%d %H:%M:%S'))
23 |         },
24 |         {
25 |             'date': 20180129, 'all_city': 580, 'beijing': 200, 'guangzhou': 36, 'shenzhen': 100, 'shanghai': 82,
26 |             'hangzhou': 44, 'chengdu': 30,
27 |             'created_at': datetime_to_timestamp(datetime.strptime('2018-01-28 17:01:04', '%Y-%m-%d %H:%M:%S')),
28 |             'updated_at': datetime_to_timestamp(datetime.strptime('2018-01-28 17:01:04', '%Y-%m-%d %H:%M:%S'))
29 |         }],
30 |     'created_at': datetime_to_timestamp(datetime.strptime('2018-02-01 19:01:44', '%Y-%m-%d %H:%M:%S')),
31 |     'updated_at': datetime_to_timestamp(datetime.strptime('2018-02-05 01:01:48', '%Y-%m-%d %H:%M:%S')),
32 | }
33 | 
34 | 
35 | class TestKeywordStatisticsApiHandler(BaseHandlerTestCase):
36 | 
37 |     def test_get(self):
38 |         response = self.fetch_json('/api/statistics?keyword_name=python')
39 |         self.assertDictEqual(predictive_keyword_statistic_dict, response)
40 | 
41 |     def test_get_when_error(self):
42 |         response = self.get('/api/statistics')
43 |         self.assertEqual(response.code, 404)
44 |         predictive_response_content = {
45 |             u"error": {
46 |                 u"message": u"请输入关键词",
47 |                 u"code": 4041,
48 |                 u"name": u"ResourceNotFoundWebException",
49 |                 u'data': '',
50 |                 u'debug_message': '',
51 |             }
52 |         }
53 |         self.assertDictEqual(predictive_response_content, json_decode(response.body))
54 | 
55 |         response = self.get('/api/statistics?keyword_name=种田')
56 |         self.assertEqual(response.code, 404)
57 |         predictive_response_content = {
58 |             u"error": {
59 |                 u"message": u"找不到该关键词",
60 |                 u"code": 4041,
61 |                 u"name": u"ResourceNotFoundWebException",
62 |                 u'data': '',
63 |                 u'debug_message': '',
64 |             }
65 |         }
66 |         self.assertDictEqual(predictive_response_content, json_decode(response.body))
67 | 
68 |         response = self.get('/api/statistics?keyword_name=java')
69 |         self.assertEqual(response.code, 404)
70 |         predictive_response_content = {
71 |             u"error": {
72 |                 u"message": u"暂无该关键词的统计结果",
73 |                 u"code": 4041,
74 |                 u"name": u"ResourceNotFoundWebException",
75 |                 u'data': '',
76 |                 u'debug_message': '',
77 |             }
78 |         }
79 |         self.assertDictEqual(predictive_response_content, json_decode(response.body))
80 | 
81 | 
82 | class TestKeywordStatisticsPageHandler(BaseHandlerTestCase):
83 | 
84 |     def test_get(self):
85 |         response = self.get('/statistics?keyword_name=python')
86 |         self.assertEqual(response.code, 200)
87 | 
88 |     def test_get_when_error(self):
89 |         response = self.get('/api/statistics')
90 |         self.assertEqual(response.code, 404)
91 | 
92 |         response = self.get('/api/statistics?keyword_name=种田')
93 |         self.assertEqual(response.code, 404)
94 | 
95 |         response = self.get('/api/statistics?keyword_name=java')
96 |         self.assertEqual(response.code, 404)
97 | 


--------------------------------------------------------------------------------
/tests/util.py:
--------------------------------------------------------------------------------
 1 | # coding=utf-8
 2 | import os
 3 | 
 4 | from sqlalchemy import text
 5 | 
 6 | 
 7 | def execute_sql_file(file_paths, db_session, predictive_db_name=''):
 8 |     if predictive_db_name:
 9 |         assert get_current_database_name(db_session) == predictive_db_name
10 |     for file_path in file_paths:
11 |         sql_file = open(file_path, 'r')
12 | 
13 |         sql_command = ''
14 | 
15 |         for line in sql_file:
16 |             if not line.startswith('--'):
17 |                 sql_command += line.strip('\n')
18 | 
19 |                 if sql_command.endswith(';'):
20 |                     db_session.execute(text(sql_command))
21 |                     db_session.flush()
22 |                     sql_command = ''
23 | 
24 | 
25 | def get_current_database_name(db_session):
26 |     return db_session.execute('select database();').scalar()
27 | 
28 | 
29 | def create_test_db(session, db_name='test_spider'):
30 |     """转载数据库"""
31 |     # 清除测试数据库
32 |     drop_test_db(session)
33 |     # 创建测试数据库
34 |     session.execute("CREATE DATABASE {db_name} CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci;".format(
35 |         db_name=db_name))
36 |     # 指定测试数据库 test_spider
37 |     session.execute("USE {db_name};".format(db_name=db_name))
38 | 
39 |     path = os.path.dirname(__file__)
40 |     # 创建表
41 |     execute_sql_file(
42 |         file_paths=[os.path.join(path, "schema.sql"), ],
43 |         db_session=session,
44 |         predictive_db_name=db_name
45 |     )
46 |     fixture_path = os.path.join(path, 'fixture')
47 |     # 装载表数据
48 |     fixture_file_paths = [os.path.join(fixture_path, file) for file in os.listdir(fixture_path)]
49 |     execute_sql_file(
50 |         file_paths=fixture_file_paths,
51 |         db_session=session,
52 |         predictive_db_name=db_name
53 |     )
54 |     assert get_current_database_name(session) == 'test_spider'
55 | 
56 | 
57 | def drop_test_db(session, db_name='test_spider'):
58 |     # 清除测试数据库
59 |     session.execute("DROP DATABASE IF EXISTS {db_name};".format(db_name=db_name))
60 | 


--------------------------------------------------------------------------------
/webspider/__init__.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | __version__ = '0.0.2'
3 | 


--------------------------------------------------------------------------------
/webspider/constants.py:
--------------------------------------------------------------------------------
  1 | # coding=utf-8
  2 | # flake8: noqa
  3 | import os
  4 | 
  5 | """
  6 |     工作类型
  7 | """
  8 | 
  9 | 
 10 | class LGJobType(object):
 11 |     all = '全部'
 12 |     technology = '技术'
 13 |     product = '产品'
 14 |     design = '设计'
 15 |     operation = '运营'
 16 |     sell_and_market = '市场与销售'
 17 |     function = '职能'
 18 | 
 19 | 
 20 | """
 21 |     公司融资阶段
 22 | """
 23 | FINANCE_STAGE_DICT = {
 24 |     'unknown': 0,
 25 |     '未融资': 1,
 26 |     '天使轮': 2,
 27 |     'A轮': 3,
 28 |     'B轮': 4,
 29 |     'C轮': 5,
 30 |     'D轮及以上': 6,
 31 |     '上市公司': 7,
 32 |     '不需要融资': 8,
 33 | }
 34 | 
 35 | """
 36 |     工作性质
 37 | """
 38 | JOB_NATURE_DICT = {
 39 |     'unknown': 0,
 40 |     '全职': 1,
 41 |     '兼职': 2,
 42 |     '实习': 3,
 43 | }
 44 | 
 45 | """
 46 |      工作年限要求
 47 | """
 48 | WORK_YEARS_REQUEST_DICT = {
 49 |     'unknown': 0,
 50 |     '不限': 1,
 51 |     '应届毕业生': 2,
 52 |     '1年以下': 3,
 53 |     '1-3年': 4,
 54 |     '3-5年': 5,
 55 |     '5-10年': 6,
 56 |     '10年以上': 7,
 57 | }
 58 | 
 59 | """
 60 |      学历要求
 61 | """
 62 | EDUCATION_REQUEST_DICT = {
 63 |     'unknown': 0,
 64 |     '不限': 1,
 65 |     '大专': 2,
 66 |     '本科': 3,
 67 |     '硕士': 4,
 68 |     '博士': 5,
 69 | }
 70 | 
 71 | """
 72 |     公司规模
 73 | """
 74 | COMPANY_SIZE_DICT = {
 75 |     'unknown': 0,
 76 |     '少于15人': 1,
 77 |     '15-50人': 2,
 78 |     '50-150人': 3,
 79 |     '150-500人': 4,
 80 |     '500-2000人': 5,
 81 |     '2000人以上': 6,
 82 | }
 83 | 
 84 | """
 85 |     其他常量
 86 | """
 87 | 
 88 | DEBUG = (os.environ.get('ENV', 'dev') == 'dev')
 89 | 
 90 | SECONDS_OF_DAY = 60 * 60 * 24
 91 | 
 92 | REQUEST_TIMEOUT = 4
 93 | 
 94 | # 爬虫最小睡眠时间
 95 | MIN_SLEEP_SECS = 3
 96 | 
 97 | # 爬虫最大睡眠时间
 98 | MAX_SLEEP_SECS = 5
 99 | 
100 | """
101 |     REDIS KEY 相关
102 | """
103 | 
104 | CRAWLED_COMPANY_JOBS_REDIS_KEY = 'crawled_company_jobs_{lg_company_id}'
105 | 
106 | """
107 |     字段长度限制
108 | """
109 | COMPANY_INTRODUCE_MAX_LEN = 2048
110 | COMPANY_ADVANTAGE_MAX_LEN = 256
111 | JOB_DESCRIPTION_MAX_LEN = 2048
112 | JOB_ADVANTAGE_MAX_LEN = 256
113 | 
114 | """
115 |     retry 相关
116 | """
117 | # 用来设定最大的尝试次数，超过该次数就停止重试
118 | RETRY_TIMES = 3
119 | # 函数最久持续时间
120 | STOP_MAX_DELAY = 1000 * 30
121 | # 设置在两次retrying之间的停留时间
122 | WAIT_FIXED = 1000 * 2
123 | 
124 | """
125 |     HTTP 相关 (基于减少 lg 的网站负载考虑, 屏蔽以下常量的实际内容)
126 | """
127 | HTTP_HEADER = {}
128 | 
129 | USER_AGENT_LIST = ['for_test']
130 | 
131 | """
132 |     相关网页 (基于减少 lg 的网站负载考虑, 屏蔽以下常量的实际内容)
133 | """
134 | 
135 | JOB_JSON_URL = ''
136 | 
137 | JOB_DETAIL_URL = ''
138 | 
139 | COMPANY_DETAIL_URL = ''
140 | 
141 | ALL_CITY_URL = ''
142 | 
143 | COMPANIES_URL = ''
144 | 
145 | COMPANY_JOBS_URL = ''
146 | 
147 | # COMPANIES_URL sort field
148 | SORTED_BY_JOBS_COUNT = 1
149 | 
150 | # 生产环境 和 个人开发环境加载真实常量的值
151 | if os.environ.get('ENV', '') in ('production', 'dev'):
152 |     from webspider.security_constants import *
153 | 


--------------------------------------------------------------------------------
/webspider/controllers/__init__.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | 


--------------------------------------------------------------------------------
/webspider/controllers/city_ctl.py:
--------------------------------------------------------------------------------
 1 | # coding=utf-8
 2 | 
 3 | from sqlalchemy.exc import IntegrityError
 4 | 
 5 | from webspider.models.city import CityModel
 6 | 
 7 | 
 8 | def get_city_id_by_name(name):
 9 |     city = CityModel.get_one(filter_by={'name': name})
10 |     if not city:
11 |         raise ValueError('Get None when city name is {}'.format(name))
12 |     return city.id
13 | 
14 | 
15 | def insert_city_if_not_exist(name):
16 |     if CityModel.is_exist(filter_by={'name': name}):
17 |         return
18 |     try:
19 |         city_id = CityModel.add(name=name)
20 |         return city_id
21 |     except IntegrityError:
22 |         pass
23 | 
24 | 
25 | def get_city_name_dict():
26 |     """
27 |     :return: dict{city_name: city_id, ....} eg: {'北京': 2, '上海':3, ......}
28 |     """
29 |     cities = CityModel.list()
30 |     return {city.name: city.id for city in cities}
31 | 


--------------------------------------------------------------------------------
/webspider/controllers/industry_ctl.py:
--------------------------------------------------------------------------------
 1 | # coding=utf-8
 2 | from sqlalchemy.exc import IntegrityError
 3 | 
 4 | from webspider.models.industry import IndustryModel
 5 | 
 6 | 
 7 | def insert_industry_if_not_exist(name):
 8 |     if IndustryModel.is_exist(filter_by={'name': name}):
 9 |         return
10 |     try:
11 |         industry_id = IndustryModel.add(name=name)
12 |         return industry_id
13 |     except IntegrityError:
14 |         pass
15 | 
16 | 
17 | def get_industry_id_by_name(name):
18 |     industry = IndustryModel.get_one(filter_by={'name': name})
19 |     if not industry:
20 |         raise ValueError('Get None when industry name is {}'.format(name))
21 |     return industry.id
22 | 


--------------------------------------------------------------------------------
/webspider/controllers/job_ctl.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | import re
 3 | 
 4 | 
 5 | def get_salary_section(string):
 6 |     """
 7 |     e.g:
 8 |     15k-25k  ->  (15, 25)
 9 |     15k以上  ->  (15, 20)
10 |     15k以下  ->  (10, 15)
11 |     :param string: 15k-25k
12 |     :return: 15,25
13 |     """
14 |     pattern = r'K|k|以上|以下'
15 |     replace_char = ''
16 | 
17 |     if string.find('-') != -1:
18 |         string = re.sub(pattern=pattern, repl=replace_char, string=string)
19 |         start, end = string.split('-')
20 |     elif string.endswith('以下'):
21 |         string = re.sub(pattern=pattern, repl=replace_char, string=string)
22 |         start, end = int(string) - 5 if int(string) - 5 >= 0 else 1, string
23 |     elif string.endswith('以上'):
24 |         string = re.sub(pattern=pattern, repl=replace_char, string=string)
25 |         start, end = string, int(string) + 5
26 |     else:
27 |         raise ValueError('error salary' + string)
28 | 
29 |     return int(start), int(end)
30 | 


--------------------------------------------------------------------------------
/webspider/controllers/job_keyword_ctl.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | from sqlalchemy import func
 3 | 
 4 | from webspider.models.job_keyword import JobKeywordModel
 5 | 
 6 | 
 7 | def get_most_frequently_keyword_ids(limit=None, offset=None):
 8 |     """
 9 |     获得出现最为频繁的关键词 id
10 |     :param limit:
11 |     :param offset:
12 |     :return: 关键词 id 集合
13 |     :rtype: List[int]
14 |     """
15 |     result = JobKeywordModel.list(columns=JobKeywordModel.keyword_id, group_by=JobKeywordModel.keyword_id,
16 |                                   order_by=func.count(JobKeywordModel.id).desc(), limit=limit, offset=offset)
17 |     return [item[0] for item in result]
18 | 


--------------------------------------------------------------------------------
/webspider/controllers/keyword_ctl.py:
--------------------------------------------------------------------------------
 1 | # coding=utf-8
 2 | 
 3 | from sqlalchemy.exc import IntegrityError
 4 | 
 5 | from webspider.models.keyword import KeywordModel
 6 | 
 7 | 
 8 | def insert_keyword_if_not_exist(name):
 9 |     if KeywordModel.is_exist(filter_by={'name': name}):
10 |         return
11 |     try:
12 |         keyword_id = KeywordModel.add(name=name)
13 |         return keyword_id
14 |     except IntegrityError:
15 |         pass
16 | 
17 | 
18 | def get_keyword_name_by_id(keyword_id):
19 |     keyword = KeywordModel.get_by_pk(keyword_id)
20 |     if not keyword:
21 |         raise ValueError('Get None when keyword id is {}'.format(keyword_id))
22 |     return keyword.name
23 | 
24 | 
25 | def get_keyword_id_by_name(name):
26 |     keyword = KeywordModel.get_one(filter_by={'name': name})
27 |     if not keyword:
28 |         raise ValueError('Get None when keyword id is {}'.format(name))
29 |     return keyword.id
30 | 


--------------------------------------------------------------------------------
/webspider/controllers/keyword_statistic_ctl.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | from collections import Counter
 3 | 
 4 | from webspider import utils
 5 | from webspider import constants
 6 | from webspider.models import CompanyModel
 7 | from webspider.controllers import city_ctl, job_ctl
 8 | 
 9 | 
10 | def get_salary_statistic(jobs):
11 |     """
12 |     获取薪水统计情况
13 | 
14 |     :param jobs: webspider.models.JobModel instances list
15 |     :return: collections.Counter
16 |     """
17 |     salary_statistic = Counter()
18 |     for job in jobs:
19 |         start_salary, end_salary = job_ctl.get_salary_section(job.salary)
20 |         if start_salary <= 10:
21 |             salary_statistic['10k及以下'] += 1
22 |         if start_salary <= 20 and end_salary >= 11:
23 |             salary_statistic['11k-20k'] += 1
24 |         if start_salary <= 35 and end_salary >= 21:
25 |             salary_statistic['21k-35k'] += 1
26 |         if start_salary <= 60 and end_salary >= 36:
27 |             salary_statistic['36k-60k'] += 1
28 |         if end_salary >= 61:
29 |             salary_statistic['61k以上'] += 1
30 |     return salary_statistic
31 | 
32 | 
33 | def get_finance_stage_statistic(jobs):
34 |     """
35 |     获取 jobs 的公司的统治情况统计
36 | 
37 |     :param jobs: webspider.models.JobModel instances list
38 |     :return: collections.Counter
39 |     """
40 |     company_ids = [job.company_id for job in jobs]
41 |     companies = CompanyModel.list(filter=CompanyModel.id.in_(company_ids))
42 | 
43 |     finance_stage_statistic = utils.common.get_field_statistics(values=[company.finance_stage for company in companies],
44 |                                                                 constants_dict=constants.FINANCE_STAGE_DICT)
45 |     return finance_stage_statistic
46 | 
47 | 
48 | def get_educations_statistic(jobs):
49 |     """
50 |     获取教育背景要求统计
51 | 
52 |     :param jobs: webspider.models.JobModel instances list
53 |     :return: collections.Counter
54 |     """
55 |     return utils.common.get_field_statistics(values=[job.education for job in jobs],
56 |                                              constants_dict=constants.EDUCATION_REQUEST_DICT)
57 | 
58 | 
59 | def get_work_years_statistic(jobs):
60 |     """
61 |     获取工作年限要求统计
62 | 
63 |     :param jobs: webspider.models.JobModel instances list
64 |     :return: collections.Counter
65 |     """
66 |     return utils.common.get_field_statistics(values=[job.work_year for job in jobs],
67 |                                              constants_dict=constants.WORK_YEARS_REQUEST_DICT)
68 | 
69 | 
70 | def get_city_jobs_count_statistic(jobs, limit=10):
71 |     """
72 |     获取各城市职位统计
73 |     :param jobs: webspider.models.JobModel instances list
74 |     :param limit: 指定获取职位数量前几位的城市
75 |     :return: collections.Counter
76 |     """
77 |     city_name_dict = city_ctl.get_city_name_dict()
78 |     city_job_count = utils.common.get_field_statistics(values=[job.city_id for job in jobs],
79 |                                                        constants_dict=city_name_dict)
80 |     city_job_count = sorted(city_job_count.items(), key=lambda x: x[1], reverse=True)
81 |     if limit:
82 |         city_job_count = city_job_count[:limit]
83 |     return Counter({item[0]: item[1] for item in city_job_count})
84 | 


--------------------------------------------------------------------------------
/webspider/crawlers/__init__.py:
--------------------------------------------------------------------------------
 1 | # coding=utf-8
 2 | from webspider.crawlers.lg_cites import get_cites_from_lg
 3 | from webspider.crawlers.lg_companies import (get_companies_pagination_from_lg, get_companies_from_lg,
 4 |                                              get_company_detail_from_lg, clean_lg_company_data, )
 5 | from webspider.crawlers.lg_jobs import (get_jobs_pagination_from_lg, get_jobs_from_lg,
 6 |                                         get_job_detail_from_lg, clean_lg_job_data, )
 7 | from webspider.crawlers.lg_jobs_count import get_jobs_count_from_lg
 8 | 
 9 | __all__ = ['get_cites_from_lg', 'get_companies_pagination_from_lg', 'get_companies_from_lg',
10 |            'get_company_detail_from_lg', 'clean_lg_company_data', 'get_jobs_pagination_from_lg',
11 |            'get_jobs_from_lg', 'get_job_detail_from_lg', 'clean_lg_job_data', 'get_jobs_count_from_lg']
12 | 


--------------------------------------------------------------------------------
/webspider/crawlers/lagou_cites.py:
--------------------------------------------------------------------------------
 1 | # coding=utf-8
 2 | import re
 3 | import logging
 4 | 
 5 | import requests
 6 | from lxml import etree
 7 | from tornado.util import ObjectDict
 8 | 
 9 | from webspider import constants
10 | 
11 | logger = logging.getLogger(__name__)
12 | 
13 | 
14 | def get_cites_from_lg():
15 |     """
16 |     爬取城市数据
17 | 
18 |     返回的 dict 组成:
19 |     id:
20 |         type: int
21 |         meaning: 城市 id
22 |         eg: 1
23 |     name:
24 |         type: str
25 |         meaning: 城市名
26 |         eg: 北京
27 | 
28 |     :return: 城市数据集合
29 |     :rtype: List[tornado.util.ObjectDict]
30 |     """
31 |     logger.info(u'begin crawl cities info......')
32 | 
33 |     response_html = etree.HTML(requests.get(constants.ALL_CITY_URL).text)
34 |     cities_html_list = response_html.xpath("//ul[@class='city_list']/li/a")
35 | 
36 |     cities_dicts = []
37 |     for city_html in cities_html_list:
38 |         city_name = city_html.xpath('./text()')[0]
39 |         city_id = re.findall(pattern=r'/(\d+)-\d+-\d+', string=city_html.xpath('./@href')[0])[0]
40 |         cities_dicts.append(ObjectDict(id=city_id, name=city_name))
41 | 
42 |     logger.info(u'crawl cities info finished! cites quantity is {cities_count}'.format(
43 |         cities_count=len(cities_dicts)))
44 |     return cities_dicts
45 | 


--------------------------------------------------------------------------------
/webspider/crawlers/lagou_companies.py:
--------------------------------------------------------------------------------
  1 | # coding=utf-8
  2 | import re
  3 | import json
  4 | import logging
  5 | 
  6 | from lxml import etree
  7 | from tornado.util import ObjectDict
  8 | 
  9 | from webspider import utils
 10 | from webspider import constants
 11 | 
 12 | logger = logging.getLogger(__name__)
 13 | 
 14 | 
 15 | def get_companies_pagination_from_lg(city_id=0, finance_stage_id=0, industry_id=0, page_no=1):
 16 |     """
 17 |     爬取公司分页数据
 18 | 
 19 |     :param city_id: 城市 id
 20 |     :param finance_stage_id: 融资阶段 id
 21 |     :param industry_id: 行业 id
 22 |     :param page_no: 页码
 23 |     :return: 公司分页数据
 24 |     :rtype: utils.pagination.Pagination
 25 |     """
 26 |     url = constants.COMPANIES_URL.format(city_id=city_id,
 27 |                                          finance_stage_id=finance_stage_id,
 28 |                                          industry_id=industry_id)
 29 | 
 30 |     params = {'pn': page_no, 'sortField': constants.SORTED_BY_JOBS_COUNT}
 31 |     response_json = utils.http_tools.requests_get(url=url, params=params).json()
 32 |     pagination = utils.pagination.Pagination(per_page=int(response_json['pageSize']),
 33 |                                              total=int(response_json['totalCount']))
 34 | 
 35 |     return pagination
 36 | 
 37 | 
 38 | def get_companies_from_lg(city_id=0, finance_stage_id=0, industry_id=0, page_no=1):
 39 |     """
 40 |     爬取公司数据
 41 | 
 42 |     返回的 dict 组成:
 43 |     lg_company_id:
 44 |         type: int
 45 |         meaning: 接口使用的公司 id
 46 |         eg: 1
 47 |     fullname:
 48 |         type: str
 49 |         meaning: 公司全称
 50 |         eg:  智者四海北京科技有限公司
 51 |     city_name:
 52 |         type: str
 53 |         meaning: 城市名
 54 |         eg: 北京
 55 |     shortname:
 56 |         type: str
 57 |         meaning: 公司简称
 58 |         eg: 知乎
 59 |     fullname:
 60 |         type: str
 61 |         meaning: 公司全称
 62 |         eg:  智者四海北京科技有限公司
 63 |     finance_stage:
 64 |         type: str
 65 |         meaning: 融资阶段
 66 |         eg:  D轮
 67 |     features:
 68 |         type: str
 69 |         meaning: 公司slogan, 一句话简介
 70 |         eg:  发现更大的世界
 71 |     process_rate:
 72 |         type:  int
 73 |         meaning:  简历处理率
 74 |         eg:  94
 75 |     industries:
 76 |         type: str
 77 |         meaning: 所处行业
 78 |         eg: '互联网，社交' or '互联网'
 79 |     advantage:
 80 |         type: List[str]
 81 |         meaning: 公司优势
 82 |         eg: ['双休', '五险一金', ......]
 83 |     address:
 84 |         type: str
 85 |         meaning: 公司地址
 86 |         eg: 北京市海淀区学院路768创意园
 87 |     size:
 88 |         type: str
 89 |         meaning: 公司规模
 90 |         eg: 2000人以上
 91 |     introduce:
 92 |         type: List[str]
 93 |         meaning: 公司介绍
 94 |         eg: ['我们的愿景:', 'blablabla', '我们处于一个知识 balala...']
 95 | 
 96 |     :param city_id: 城市 id
 97 |     :param finance_stage_id: 融资阶段 id
 98 |     :param industry_id: 行业 id
 99 |     :param page_no: 页码
100 |     :return: 公司数据集合
101 |     :rtype: List[tornado.util.ObjectDict]
102 |     """
103 |     url = constants.COMPANIES_URL.format(city_id=city_id,
104 |                                          finance_stage_id=finance_stage_id,
105 |                                          industry_id=industry_id)
106 |     params = {'pn': page_no, 'sortField': constants.SORTED_BY_JOBS_COUNT}
107 |     companies = utils.http_tools.requests_get(url=url, params=params).json()['result']
108 | 
109 |     companies_dicts = []
110 |     for company in companies:
111 |         lg_company_id = int(company.get('companyId'))
112 | 
113 |         company_detail = get_company_detail_from_lg(lg_company_id=lg_company_id)
114 |         companies_dicts.append(ObjectDict(
115 |             lg_company_id=lg_company_id,
116 |             city_name=company.get('city'),
117 |             shortname=company.get('companyShortName'),
118 |             fullname=company.get('companyFullName'),
119 |             finance_stage=company.get('financeStage'),
120 |             features=company.get('companyFeatures'),
121 |             process_rate=company.get('processRate'),
122 |             industries=company.get('industryField'),
123 |             # company detail
124 |             advantage=company_detail.get('advantage'),
125 |             address=company_detail.get('address'),
126 |             size=company_detail.get('size'),
127 |             introduce=company_detail.get('introduce')
128 |         ))
129 |     return companies_dicts
130 | 
131 | 
132 | def get_company_detail_from_lg(lg_company_id):
133 |     """
134 |     爬取公司详情页的数据
135 | 
136 |     返回的 dict 组成:
137 |     advantage:
138 |         type: List[str]
139 |         meaning: 公司优势
140 |         eg: ['双休', '五险一金', ......]
141 |     address:
142 |         type: str
143 |         meaning: 公司地址
144 |         eg: 北京市海淀区学院路768创意园
145 |     size:
146 |         type: str
147 |         meaning: 公司规模
148 |         eg: 2000人以上
149 |     introduce:
150 |         type: List[str]
151 |         meaning: 公司介绍
152 |         eg: ['我们的愿景:', 'blablabla', '我们处于一个知识 balala...']
153 | 
154 |     :param lg_company_id: 接口使用的公司 id
155 |     :return: 公司详情页数据
156 |     :rtype: tornado.util.ObjectDict
157 |     """
158 |     response = utils.http_tools.requests_get(
159 |         url=constants.COMPANY_DETAIL_URL.format(lg_company_id=lg_company_id))
160 |     company_detail_html = etree.HTML(response.text)
161 | 
162 |     advantage = company_detail_html.xpath('//div[@id="tags_container"]//li/text()')
163 |     sizes = company_detail_html.xpath('//div[@id="basic_container"]//li[3]/span/text()')
164 |     address = company_detail_html.xpath('//p[@class="mlist_li_desc"]/text()')
165 |     introduces = company_detail_html.xpath('//span[@class="company_content"]//text()')
166 | 
167 |     if not sizes:
168 |         logger.error(
169 |             'can not get size by lg_company_id = {}, html code is \n{}'.format(lg_company_id, response.text))
170 | 
171 |     return ObjectDict(
172 |         advantage=advantage,
173 |         address=address[0] if address else '',
174 |         size=sizes[0] if sizes else '',
175 |         introduce=introduces,
176 |     )
177 | 
178 | 
179 | def clean_lg_company_data(company_dict):
180 |     """
181 |     清洗爬取到的公司信息
182 | 
183 |     :param company_dict: tornado.util.ObjectDict
184 |     """
185 |     if 'size' in company_dict:
186 |         company_dict.size = company_dict.size.strip()
187 |     if 'finance_stage' in company_dict:
188 |         company_dict.finance_stage = company_dict.finance_stage.strip()
189 |     if 'features' in company_dict:
190 |         company_dict.features = utils.text.to_plaintext(company_dict.features)
191 |     if 'address' in company_dict:
192 |         company_dict.address = utils.text.to_plaintext(company_dict.address)
193 |     if 'introduce' in company_dict:
194 |         company_dict.introduce = ''.join(company_dict.introduce) if company_dict.introduce else ''
195 |         company_dict.introduce = company_dict.introduce[:constants.COMPANY_INTRODUCE_MAX_LEN]
196 |     if 'advantage' in company_dict:
197 |         company_dict.advantage = list(map(utils.text.to_plaintext, company_dict.advantage))
198 |         company_dict.advantage = json.dumps(company_dict.advantage)[
199 |             :constants.COMPANY_ADVANTAGE_MAX_LEN]
200 |     if 'industries' in company_dict:
201 |         company_dict.industries = set(re.split(r",|，|、|\s", company_dict.industries))
202 | 


--------------------------------------------------------------------------------
/webspider/crawlers/lagou_jobs.py:
--------------------------------------------------------------------------------
  1 | # coding=utf-8
  2 | import logging
  3 | 
  4 | from lxml import etree
  5 | from tornado.util import ObjectDict
  6 | 
  7 | from webspider import utils
  8 | from webspider import constants
  9 | 
 10 | logger = logging.getLogger(__name__)
 11 | 
 12 | 
 13 | def get_jobs_pagination_from_lg(lg_company_id, job_type, page_no=1, is_school_job=False):
 14 |     """
 15 |     爬取职位分页数据
 16 | 
 17 |     :param lg_company_id: 接口使用的公司 id
 18 |     :param job_type: 职位类型
 19 |     :param page_no: 页码
 20 |     :param is_school_job: 是否爬取校招职位
 21 |     :return:
 22 |     """
 23 |     params = {
 24 |         'companyId': lg_company_id,
 25 |         'positionFirstType': job_type,
 26 |         'schoolJob': is_school_job,
 27 |         'pageNo': page_no,
 28 |         'pageSize': 10,
 29 |     }
 30 |     response_json = utils.http_tools.requests_get(
 31 |         url=constants.COMPANY_JOBS_URL, params=params).json()
 32 |     pagination = utils.pagination.Pagination(per_page=int(response_json['content']['data']['page']['pageSize']),
 33 |                                              total=int(response_json['content']['data']['page']['totalCount']))
 34 | 
 35 |     return pagination
 36 | 
 37 | 
 38 | def get_jobs_from_lg(lg_company_id, job_type, page_no=1, is_school_job=False):
 39 |     """
 40 |     爬取职位数据
 41 | 
 42 |     返回的 dict 组成:
 43 |     lg_job_id:
 44 |         type: int
 45 |         meaning: 接口使用的职位 id
 46 |         eg: 1
 47 |     city_name:
 48 |         type: str
 49 |         meaning: 城市名
 50 |         eg: 北京
 51 |     title:
 52 |         type: str
 53 |         meaning: 职位标题
 54 |         eg: 招聘后端工程师
 55 |     salary:
 56 |         type: str
 57 |         meaning: 薪酬范围
 58 |         eg:  '10k~20k'
 59 |     education:
 60 |         type: str
 61 |         meaning: 教育背景要求
 62 |         eg: 本科或以上
 63 |     nature:
 64 |         type: str
 65 |         meaning: 职位性质
 66 |         eg: 全职
 67 |     work_year:
 68 |         type: str
 69 |         meaning: 工作年限要求
 70 |         eg:  1~3年
 71 |     advantage:
 72 |         type: str
 73 |         meaning: 职位优势
 74 |         eg:  大平台，五险一金
 75 |     department:
 76 |         type: str
 77 |         meaning: 招聘部门
 78 |         eg:  商业部
 79 |     keywords:
 80 |         type: List[str]
 81 |         meaning: 职位关键词
 82 |         eg: ['后端', 'Web', 'Python']
 83 |     description:
 84 |         type: List[str]
 85 |         meaning: 职位介绍
 86 |         eg: ['职位要求:', 'blablabla', '.......']
 87 | 
 88 |     :param lg_company_id: 接口使用的公司 id
 89 |     :param job_type: 职位类型
 90 |     :param page_no: 页码
 91 |     :param is_school_job: 是否爬取校招职位
 92 |     :param skip_exist: 是否跳过数据库已经存在的职位数据
 93 |     :return: 职位数据集合
 94 |     :rtype: List[tornado.util.ObjectDict]
 95 |     """
 96 |     params = {
 97 |         'companyId': lg_company_id,
 98 |         'positionFirstType': job_type,
 99 |         'schoolJob': is_school_job,
100 |         'pageNo': page_no,
101 |         'pageSize': 10,
102 |     }
103 |     response_json = utils.http_tools.requests_get(
104 |         url=constants.COMPANY_JOBS_URL, params=params).json()
105 |     jobs = response_json['content']['data']['page']['result']
106 | 
107 |     jobs_dicts = []
108 |     for job in jobs:
109 |         lg_job_id = job['positionId']
110 |         job_detail = get_job_detail_from_lg(lg_job_id=lg_job_id)
111 |         jobs_dicts.append(ObjectDict(
112 |             lg_job_id=lg_job_id,
113 |             city_name=job.get('city'),
114 |             title=job.get('positionName'),
115 |             salary=job.get('salary'),
116 |             education=job.get('education'),
117 |             nature=job.get('jobNature'),
118 |             work_year=job.get('workYear'),
119 |             advantage=job.get('positionAdvantage', ''),
120 |             # job detail
121 |             department=job_detail.get('department'),
122 |             keywords=job_detail.get('keywords'),
123 |             description=job_detail.get('description'),
124 |         ))
125 |     return jobs_dicts
126 | 
127 | 
128 | def get_job_detail_from_lg(lg_job_id):
129 |     """
130 |     爬取职位详情页的数据
131 | 
132 |     返回的 dict 组成:
133 |     department:
134 |         type: str
135 |         meaning: 招聘部门
136 |         eg:  商业部
137 |     keywords:
138 |         type: List[str]
139 |         meaning: 职位关键词
140 |         eg: ['后端', 'Web', 'Python']
141 |     description:
142 |         type: List[str]
143 |         meaning: 职位介绍
144 |         eg: ['职位要求:', 'blablabla', '.......']
145 | 
146 |     :param lg_job_id: 接口使用的职位 id
147 |     :return: 职位详情页数据
148 |     :rtype: tornado.util.ObjectDict
149 |     """
150 |     response = utils.http_tools.requests_get(
151 |         url=constants.JOB_DETAIL_URL.format(lg_job_id=lg_job_id))
152 |     job_detail_html = etree.HTML(response.text)
153 | 
154 |     department = job_detail_html.xpath('//div[@class="job-name"]/div[@class="company"]/text()')
155 |     description = job_detail_html.xpath('//dd[@class="job_bt"]/div//text()')
156 |     keywords = job_detail_html.xpath('//dd[@class="job_request"]//li[@class="labels"]/text()')
157 | 
158 |     if not department:
159 |         logger.error('can not get department by lg_job_id = {}, html is \n {}'.format(
160 |             lg_job_id, response.text))
161 | 
162 |     return ObjectDict(
163 |         department=department[0] if department else '',
164 |         description=description,
165 |         keywords=keywords,
166 |     )
167 | 
168 | 
169 | def clean_lg_job_data(job_dict):
170 |     """
171 |     清洗爬取到的职位信息
172 | 
173 |     :param job_dict: tornado.util.ObjectDict
174 |     """
175 |     if 'keywords' in job_dict:
176 |         job_dict.keywords = set(map(lambda keyword: keyword.strip().lower(), job_dict.keywords))
177 |     if 'description' in job_dict:
178 |         job_dict.description = ''.join(job_dict.description) if job_dict.description else ''
179 |         job_dict.description = job_dict.description[:constants.JOB_DESCRIPTION_MAX_LEN]
180 |     if 'advantage' in job_dict:
181 |         job_dict.advantage = job_dict.advantage[:constants.JOB_ADVANTAGE_MAX_LEN]
182 | 


--------------------------------------------------------------------------------
/webspider/crawlers/lagou_jobs_count.py:
--------------------------------------------------------------------------------
 1 | # coding=utf-8
 2 | from webspider.constants import JOB_JSON_URL
 3 | from webspider.utils.http_tools import requests_post, generate_http_request_headers
 4 | 
 5 | 
 6 | def get_jobs_count_from_lg(city_name, keyword_name):
 7 |     """
 8 |     爬取职位数量
 9 | 
10 |     :param city_name: 城市名
11 |     :param keyword_name: 关键词名
12 |     :return: 城市下的关于关键词的职位数量,如北京的 python 职位数量
13 |     :rtype: int
14 |     """
15 |     query_string = {'needAddtionalResult': False}
16 |     if city_name != '全国':
17 |         query_string['city'] = city_name
18 |     form_data = {
19 |         'first': False,
20 |         'pn': 1,
21 |         'kd': keyword_name
22 |     }
23 |     headers = generate_http_request_headers(
24 |         referer='https://www.lg.com/jobs/list_java?labelWords=&fromSearch=true')
25 |     response_json = requests_post(url=JOB_JSON_URL, params=query_string,
26 |                                   data=form_data, headers=headers).json()
27 |     return int(response_json['content']['positionResult']['totalCount'])
28 | 


--------------------------------------------------------------------------------
/webspider/exceptions.py:
--------------------------------------------------------------------------------
 1 | # !/usr/bin/env python
 2 | # -*- coding: utf-8 -*-
 3 | __all__ = ['BaseException', 'ResourceNotFoundWebException', 'DowngradeException']
 4 | 
 5 | 
 6 | class BaseException(Exception):
 7 |     ERROR_CODE = None
 8 |     STATUS_CODE = 200
 9 | 
10 |     def __init__(self, message, data=None, debug_message=None):
11 |         if self.ERROR_CODE is None:
12 |             raise NotImplementedError()
13 |         self._message = message
14 |         self._data = dict(data) if data else None
15 |         self._debug_message = debug_message
16 | 
17 |     @property
18 |     def code(self):
19 |         return self.ERROR_CODE
20 | 
21 |     @property
22 |     def message(self):
23 |         return self._message
24 | 
25 |     @property
26 |     def data(self):
27 |         return self._data
28 | 
29 |     @property
30 |     def debug_message(self):
31 |         return self._debug_message
32 | 
33 |     def __str__(self):
34 |         return "Exception: code={code}, message={message}, data={data}, debug_message={debug_message}".format(
35 |             code=self.code, message=self.message, data=self.data, debug_message=self.debug_message)
36 | 
37 |     def __repr__(self):
38 |         return self.__str__()
39 | 
40 | 
41 | class ResourceNotFoundWebException(BaseException):
42 |     """
43 |     Corresponding to HTTP code 404
44 |     """
45 |     ERROR_CODE = 4041
46 |     STATUS_CODE = 404
47 | 
48 |     def __init__(self, message=u'资源不存在', data=None, debug_message=None):
49 |         super(ResourceNotFoundWebException, self).__init__(message, data, debug_message)
50 | 
51 | 
52 | class DowngradeException(BaseException):
53 |     ERROR_CODE = 101
54 | 
55 |     def __init__(self, message=u'降级异常', data=None, debug_message=None):
56 |         super(DowngradeException, self).__init__(message, data, debug_message)
57 | 


--------------------------------------------------------------------------------
/webspider/models/__init__.py:
--------------------------------------------------------------------------------
 1 | # coding=utf-8
 2 | from webspider.models.city import CityModel
 3 | from webspider.models.job import JobModel
 4 | from webspider.models.jobs_count import JobsCountModel
 5 | from webspider.models.company import CompanyModel
 6 | from webspider.models.company_industry import CompanyIndustryModel
 7 | from webspider.models.industry import IndustryModel
 8 | from webspider.models.job_keyword import JobKeywordModel
 9 | from webspider.models.keyword import KeywordModel
10 | from webspider.models.keyword_statistic import KeywordStatisticModel
11 | 
12 | __all__ = ['CityModel', 'JobModel', 'JobsCountModel', 'CompanyModel', 'CompanyIndustryModel', 'IndustryModel',
13 |            'JobKeywordModel', 'KeywordModel', 'KeywordStatisticModel']
14 | 


--------------------------------------------------------------------------------
/webspider/models/base.py:
--------------------------------------------------------------------------------
  1 | # coding: utf-8
  2 | 
  3 | import logging
  4 | 
  5 | from sqlalchemy import MetaData, inspect, func, text
  6 | from sqlalchemy.ext.declarative import declarative_base
  7 | from tornado.util import ObjectDict
  8 | 
  9 | from webspider.utils import sql
 10 | from webspider.utils.classproperty import classproperty
 11 | 
 12 | __all__ = ['BaseModel']
 13 | 
 14 | logger = logging.getLogger(__name__)
 15 | 
 16 | _Base = declarative_base()
 17 | 
 18 | 
 19 | class BaseModel(_Base):
 20 |     __abstract__ = True
 21 |     __table_args__ = {
 22 |         'mysql_engine': 'InnoDB',
 23 |         'mysql_charset': 'utf8mb4',
 24 |         'extend_existing': True,
 25 |     }
 26 | 
 27 |     metadata = MetaData(bind=sql.db_engine, reflect=True)
 28 | 
 29 |     @classproperty
 30 |     def session(cls):
 31 |         return sql.get_session()
 32 | 
 33 |     @classproperty
 34 |     def pk_name(cls):
 35 |         """主键名"""
 36 |         return inspect(cls).primary_key[0].name
 37 | 
 38 |     @classproperty
 39 |     def pk(cls):
 40 |         """表主键"""
 41 |         return getattr(cls, cls.pk_name)
 42 | 
 43 |     def dict(self):
 44 |         """sqlalchemy object -> dict"""
 45 |         columns = self.__table__.columns.keys()
 46 |         return ObjectDict((column, getattr(self, column)) for column in columns)
 47 | 
 48 |     @classmethod
 49 |     def count(cls, filter=None, filter_by=None):
 50 |         """
 51 |         获取数据库中记录的数目
 52 |         :param filter: apply the given filtering criterion to a copy of this Query,
 53 |         using SQL expressions.
 54 |         :param filter_by: apply the given filtering criterion to a copy of this Query,
 55 |         using keyword expressions as a dict.
 56 |         :return:
 57 |         """
 58 |         query = cls.session.query(func.count(cls.pk))
 59 | 
 60 |         if filter is not None:
 61 |             query = query.filter(filter)
 62 |         if filter_by is not None:
 63 |             query = query.filter_by(**filter_by)
 64 | 
 65 |         return query.scalar()
 66 | 
 67 |     @classmethod
 68 |     def add(cls, **values):
 69 |         """添加记录"""
 70 |         obj = cls(**values)
 71 |         cls.session.add(obj)
 72 |         cls.session.flush()
 73 |         return getattr(obj, obj.pk_name)
 74 | 
 75 |     @classmethod
 76 |     def get_by_pk(cls, pk):
 77 |         """通过主键值获取记录"""
 78 |         query = cls.session.query(cls).filter(cls.pk == pk)
 79 |         return query.scalar()
 80 | 
 81 |     @classmethod
 82 |     def get_one(cls, filter=None, filter_by=None):
 83 |         """
 84 |         获取记录
 85 |         :param filter: apply the given filtering criterion to a copy of this Query,
 86 |         using SQL expressions.
 87 |         :param filter_by: apply the given filtering criterion to a copy of this Query,
 88 |         using keyword expressions as a dict.
 89 |         :return:
 90 |         """
 91 |         query = cls.session.query(cls)
 92 | 
 93 |         if filter is not None:
 94 |             query = query.filter(filter)
 95 |         if filter_by is not None:
 96 |             query = query.filter_by(**filter_by)
 97 | 
 98 |         return query.first()
 99 | 
100 |     @classmethod
101 |     def list(cls, columns=None, filter=None, filter_by=None, order_by=None, group_by=None, offset=None, limit=None):
102 |         """
103 |         批量获取记录
104 |         :param columns: the columns you want to query, SQL expression, column, or mapped entity expected
105 |         :param filter: apply the given filtering criterion to a copy of this Query,
106 |         using SQL expressions.
107 |         :param filter_by: apply the given filtering criterion to a copy of this Query,
108 |         using keyword expressions as a dict.
109 |         :param order_by: apply one or more ORDER BY criterion to the query and return
110 |         the newly resulting ``Query``
111 |         :param group_by: apply one or more GROUP BY criterion to the query and return
112 |         the newly resulting :class:`.Query`
113 |         :param offset: Apply an ``OFFSET`` to the query and return the newly resulting
114 |         ``Query``.
115 |         :param limit: Apply a ``LIMIT`` to the query and return the newly resulting
116 |         ``Query``.
117 |         :return:
118 |         """
119 |         query = cls.session.query(cls)
120 |         if columns:
121 |             query = cls.session.query(columns)
122 |         if filter is not None:
123 |             query = query.filter(filter)
124 |         if filter_by is not None:
125 |             query = query.filter_by(**filter_by)
126 |         if group_by is not None:
127 |             query = query.group_by(group_by)
128 |         if order_by is not None:
129 |             query = query.order_by(order_by)
130 |         if offset is not None:
131 |             query = query.offset(offset)
132 |         if limit is not None:
133 |             query = query.limit(limit)
134 | 
135 |         result = query.all()
136 | 
137 |         return result
138 | 
139 |     @classmethod
140 |     def is_exist(cls, filter=None, filter_by=None):
141 |         """
142 |         判断某个记录是否存在
143 |         :param filter: apply the given filtering criterion to a copy of this Query,
144 |         using SQL expressions.
145 |         :param filter_by: apply the given filtering criterion to a copy of this Query,
146 |         using keyword expressions as a dict.
147 |         :return: boolean
148 |         """
149 | 
150 |         return cls.count(filter=filter, filter_by=filter_by) != 0
151 | 
152 |     @classmethod
153 |     def update(cls, filter=None, filter_by=None, values=None):
154 |         """更新数据
155 |         :param filter: apply the given filtering criterion to a copy of this Query,
156 |         using SQL expressions.
157 |         :param filter_by: apply the given filtering criterion to a copy of this Query,
158 |         using keyword expressions as a dict.
159 |         :param values: values to update
160 |         :return: type: int, affected rows
161 |         """
162 |         query = cls.session.query(cls)
163 | 
164 |         if filter is not None:
165 |             query = query.filter(filter)
166 | 
167 |         if filter_by is not None:
168 |             query = query.filter_by(**filter_by)
169 | 
170 |         affect_rows = query.update(values)
171 |         return affect_rows
172 | 
173 |     @classmethod
174 |     def update_by_pk(cls, pk, values):
175 |         """主键更新数据
176 | 
177 |         :param pk: 主键值
178 |         :param values: dict 要更新的值，key=value 形式
179 |         :return: 返回变更的行数
180 |         """
181 |         return cls.update(filter=(cls.pk == pk), values=values)
182 | 
183 |     @classmethod
184 |     def execute_sql_string(cls, sql_string, parameters_dict=None):
185 |         """
186 |         直接执行 sql 语句
187 |         eg:
188 |             sql_string = 'select * from temp where id = :numbers' and parameters_dict = {'numbers': 1}
189 |             >> select * from temp where id = 1
190 |         :param sql_string: the sql string you want to execute
191 |         :param parameters_dict: parameters
192 |         :return: if query returns_rows return rows(List(tuple)) else return affect_rows(int)
193 |         """
194 |         query = cls.session.execute(text(sql_string), parameters_dict)
195 |         if query.returns_rows:
196 |             return query.fetchall()
197 |         else:
198 |             return query.rowcount
199 | 
200 |     @classmethod
201 |     def batch_add(cls, instances):
202 |         """批量添加记录"""
203 |         if not all([isinstance(instance, cls) for instance in instances]):
204 |             raise ValueError('all instances must be {table_name} model instance'.format(table_name=cls.__tablename__))
205 |         cls.session.bulk_save_objects(instances)
206 | 


--------------------------------------------------------------------------------
/webspider/models/city.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | from datetime import datetime
 3 | 
 4 | from sqlalchemy import Column
 5 | from sqlalchemy.dialects.mysql import INTEGER, VARCHAR, TIMESTAMP
 6 | 
 7 | from webspider.models.base import BaseModel
 8 | 
 9 | 
10 | class CityModel(BaseModel):
11 |     __tablename__ = 'city'
12 | 
13 |     id = Column(INTEGER, primary_key=True, nullable=False, autoincrement=True)
14 |     name = Column(VARCHAR(64), nullable=False, doc=u'城市名')
15 |     created_at = Column(TIMESTAMP, nullable=False, default=datetime.now, doc=u'创建时间')
16 |     updated_at = Column(TIMESTAMP, nullable=False, default=datetime.now, onupdate=datetime.now, doc=u'最后更新时间')
17 | 


--------------------------------------------------------------------------------
/webspider/models/company.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | from datetime import datetime
 3 | 
 4 | from sqlalchemy import Column
 5 | from sqlalchemy.dialects.mysql import INTEGER, VARCHAR, TIMESTAMP, TINYINT
 6 | 
 7 | from webspider import constants
 8 | from webspider.models.base import BaseModel
 9 | 
10 | 
11 | class CompanyModel(BaseModel):
12 |     __tablename__ = 'company'
13 | 
14 |     id = Column(INTEGER, nullable=False, primary_key=True, autoincrement=True)
15 |     lg_company_id = Column(INTEGER, nullable=False, doc='所使用的公司id')
16 |     city_id = Column(INTEGER, nullable=False, doc=u'所在城市 id')
17 |     shortname = Column(VARCHAR(64), nullable=False, doc=u'公司名称')
18 |     fullname = Column(VARCHAR(128), nullable=False, doc=u'公司全称')
19 |     finance_stage = Column(TINYINT, nullable=False, doc=u'融资阶段')
20 |     size = Column(TINYINT, nullable=False, doc=u'公司规模')
21 |     address = Column(VARCHAR(128), nullable=False, doc=u'公司地址')
22 |     features = Column(VARCHAR(128), nullable=False, doc=u'公司特点')
23 |     process_rate = Column(TINYINT, nullable=False, doc=u'简历处理率')
24 |     introduce = Column(VARCHAR(constants.COMPANY_INTRODUCE_MAX_LEN), nullable=False, doc=u'公司简介')
25 |     advantage = Column(VARCHAR(constants.COMPANY_ADVANTAGE_MAX_LEN), nullable=False, doc=u'公司优势')
26 |     created_at = Column(TIMESTAMP, nullable=False, default=datetime.now, doc=u'创建时间')
27 |     updated_at = Column(TIMESTAMP, nullable=False, default=datetime.now,
28 |                         onupdate=datetime.now, doc=u'最后更新时间')
29 | 


--------------------------------------------------------------------------------
/webspider/models/company_industry.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | from datetime import datetime
 3 | 
 4 | from sqlalchemy import Column
 5 | from sqlalchemy.dialects.mysql import INTEGER, TIMESTAMP
 6 | 
 7 | from webspider.models.base import BaseModel
 8 | 
 9 | 
10 | class CompanyIndustryModel(BaseModel):
11 |     __tablename__ = 'company_industry'
12 | 
13 |     id = Column(INTEGER, nullable=False, primary_key=True, autoincrement=True)
14 |     company_id = Column(INTEGER, nullable=False, doc=u'公司 id')
15 |     industry_id = Column(INTEGER, nullable=False, doc=u'行业 id')
16 |     created_at = Column(TIMESTAMP, nullable=False, default=datetime.now, doc=u'创建时间')
17 |     updated_at = Column(TIMESTAMP, nullable=False, default=datetime.now, onupdate=datetime.now, doc=u'最后更新时间')
18 | 


--------------------------------------------------------------------------------
/webspider/models/industry.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | from datetime import datetime
 3 | 
 4 | from sqlalchemy import Column
 5 | from sqlalchemy.dialects.mysql import INTEGER, VARCHAR, TIMESTAMP
 6 | 
 7 | from webspider.models.base import BaseModel
 8 | 
 9 | 
10 | class IndustryModel(BaseModel):
11 |     __tablename__ = 'industry'
12 | 
13 |     id = Column(INTEGER, nullable=False, primary_key=True, autoincrement=True)
14 |     name = Column(VARCHAR(64), nullable=False, doc=u'行业名称')
15 |     created_at = Column(TIMESTAMP, nullable=False, default=datetime.now, doc=u'创建时间')
16 |     updated_at = Column(TIMESTAMP, nullable=False, default=datetime.now, onupdate=datetime.now, doc=u'最后更新时间')
17 | 


--------------------------------------------------------------------------------
/webspider/models/job.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | from datetime import datetime
 3 | 
 4 | from sqlalchemy import Column
 5 | from sqlalchemy.dialects.mysql import INTEGER, VARCHAR, TINYINT, TIMESTAMP
 6 | 
 7 | from webspider import constants
 8 | from webspider.models.base import BaseModel
 9 | 
10 | 
11 | class JobModel(BaseModel):
12 |     __tablename__ = 'job'
13 | 
14 |     id = Column(INTEGER, nullable=False, primary_key=True, autoincrement=True)
15 |     lg_job_id = Column(INTEGER, nullable=False, doc=u'接口使用的 job id')
16 |     city_id = Column(INTEGER, nullable=False, doc=u'城市 id')
17 |     company_id = Column(INTEGER, nullable=False, doc=u'公司 id')
18 |     title = Column(VARCHAR(64), nullable=False, default='', doc=u'职位标题')
19 |     work_year = Column(TINYINT, nullable=False, doc=u'工作年限要求')
20 |     department = Column(VARCHAR(64), nullable=False, doc=u'招聘部门')
21 |     salary = Column(VARCHAR(32), nullable=False, doc=u'薪水')
22 |     education = Column(TINYINT, nullable=False, doc=u'教育背景要求')
23 |     nature = Column(TINYINT, nullable=False, doc=u'工作性质')
24 |     description = Column(VARCHAR(constants.JOB_DESCRIPTION_MAX_LEN), nullable=False, doc=u'额外描述')
25 |     advantage = Column(VARCHAR(constants.JOB_ADVANTAGE_MAX_LEN), nullable=False, doc=u'职位优势')
26 |     created_at = Column(TIMESTAMP, nullable=False, default=datetime.now, doc=u'职位创建时间')
27 |     updated_at = Column(TIMESTAMP, nullable=False, default=datetime.now,
28 |                         onupdate=datetime.now, doc=u'职位创建时间')
29 | 


--------------------------------------------------------------------------------
/webspider/models/job_keyword.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | from datetime import datetime
 3 | 
 4 | from sqlalchemy import Column
 5 | from sqlalchemy.dialects.mysql import INTEGER, TIMESTAMP
 6 | 
 7 | from webspider.models.base import BaseModel
 8 | 
 9 | 
10 | class JobKeywordModel(BaseModel):
11 |     __tablename__ = 'job_keyword'
12 | 
13 |     id = Column(INTEGER, nullable=False, primary_key=True, autoincrement=True)
14 |     job_id = Column(INTEGER, nullable=False, doc=u'职位 id')
15 |     keyword_id = Column(INTEGER, nullable=False, doc=u'关键词 id')
16 |     created_at = Column(TIMESTAMP, nullable=False, default=datetime.now, doc=u'创建时间')
17 |     updated_at = Column(TIMESTAMP, nullable=False, default=datetime.now, onupdate=datetime.now, doc=u'创建时间')
18 | 


--------------------------------------------------------------------------------
/webspider/models/jobs_count.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | from datetime import datetime
 3 | 
 4 | from sqlalchemy import Column
 5 | from sqlalchemy.dialects.mysql import INTEGER, TIMESTAMP
 6 | 
 7 | from webspider.models.base import BaseModel
 8 | 
 9 | 
10 | class JobsCountModel(BaseModel):
11 |     __tablename__ = 'jobs_count'
12 | 
13 |     id = Column(INTEGER, nullable=False, primary_key=True, autoincrement=True)
14 |     date = Column(INTEGER, nullable=False, doc=u'日期')
15 |     keyword_id = Column(INTEGER, nullable=False, doc=u'关键词 id')
16 |     all_city = Column(INTEGER, nullable=False, default=0, doc=u'全国岗位数量')
17 |     beijing = Column(INTEGER, nullable=False, default=0, doc=u'北京岗位数量')
18 |     guangzhou = Column(INTEGER, nullable=False, default=0, doc=u'广州岗位数量')
19 |     shenzhen = Column(INTEGER, nullable=False, default=0, doc=u'深圳岗位数量')
20 |     shanghai = Column(INTEGER, nullable=False, default=0, doc=u'上海岗位数量')
21 |     hangzhou = Column(INTEGER, nullable=False, default=0, doc=u'杭州岗位数量')
22 |     chengdu = Column(INTEGER, nullable=False, default=0, doc=u'成都岗位数量')
23 |     created_at = Column(TIMESTAMP, nullable=False, default=datetime.now, doc=u'创建时间')
24 |     updated_at = Column(TIMESTAMP, nullable=False, default=datetime.now, onupdate=datetime.now, doc=u'最后更新时间')
25 | 


--------------------------------------------------------------------------------
/webspider/models/keyword.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | from datetime import datetime
 3 | 
 4 | from sqlalchemy import Column
 5 | from sqlalchemy.dialects.mysql import INTEGER, VARCHAR, TIMESTAMP
 6 | 
 7 | from webspider.models.base import BaseModel
 8 | 
 9 | 
10 | class KeywordModel(BaseModel):
11 |     __tablename__ = 'keyword'
12 | 
13 |     id = Column(INTEGER, nullable=False, primary_key=True, autoincrement=True)
14 |     name = Column(VARCHAR(64), nullable=False, doc=u'关键词名称')
15 |     created_at = Column(TIMESTAMP, nullable=False, default=datetime.now, doc=u'创建时间')
16 |     updated_at = Column(TIMESTAMP, nullable=False, default=datetime.now, onupdate=datetime.now, doc=u'创建时间')
17 | 


--------------------------------------------------------------------------------
/webspider/models/keyword_statistic.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | from datetime import datetime
 3 | 
 4 | from sqlalchemy import Column
 5 | from sqlalchemy.dialects.mysql import INTEGER, VARCHAR, TIMESTAMP
 6 | 
 7 | from webspider.models.base import BaseModel
 8 | from webspider.models.jobs_count import JobsCountModel
 9 | 
10 | 
11 | class KeywordStatisticModel(BaseModel):
12 |     __tablename__ = 'keyword_statistic'
13 | 
14 |     id = Column(INTEGER, nullable=False, primary_key=True, autoincrement=True)
15 |     keyword_id = Column(INTEGER, nullable=False, doc=u'关键词 id')
16 |     educations = Column(VARCHAR(2048), nullable=False, doc=u'教育背景要求统计')
17 |     city_jobs_count = Column(VARCHAR(2048), nullable=False, doc=u'城市职位数量统计')
18 |     salary = Column(VARCHAR(2048), nullable=False, doc=u'薪水分布统计')
19 |     financing_stage = Column(VARCHAR(2048), nullable=False, doc=u'招聘公司的融资统计')
20 |     work_years = Column(VARCHAR(2048), nullable=False, doc=u'职位薪水统计')
21 |     created_at = Column(TIMESTAMP, nullable=False, default=datetime.now, doc=u'创建时间')
22 |     updated_at = Column(TIMESTAMP, nullable=False, default=datetime.now, onupdate=datetime.now, doc=u'创建时间')
23 | 
24 |     @property
25 |     def per_day_jobs_count(self):
26 |         return JobsCountModel.list(filter_by={'keyword_id': self.keyword_id}, order_by=JobsCountModel.date.asc())
27 | 


--------------------------------------------------------------------------------
/webspider/quickly_cmd.py:
--------------------------------------------------------------------------------
 1 | # coding=utf-8
 2 | # flake8: noqa
 3 | import os
 4 | import logging
 5 | 
 6 | from tornado.options import options, define
 7 | 
 8 | from webspider import constants
 9 | 
10 | logger = logging.getLogger(__name__)
11 | 
12 | 
13 | def run_web_app_by_gunicorn():
14 |     define(name='port', default=8000, type=int, help='run on the given port')
15 |     logger.info(
16 |         '\n================ spider web server(require gunicorn and gevent) has started ================ ')
17 |     logger.info('\n                       server start at port -> {}, debug mode = {} '.format(options.port,
18 |                                                                                                constants.DEBUG))
19 |     os.system(
20 |         "env/bin/gunicorn 'webspider.web_app:make_wsgi_app()' -b 0.0.0.0:{port} -w 1 -k gevent".format(
21 |             port=options.port
22 |         )
23 |     )
24 | 
25 | 
26 | def run_celery_default_worker():
27 |     os.system(
28 |         u'env/bin/celery worker -A webspider.tasks.celery_app -Q default -n default_worker --loglevel=debug')
29 | 
30 | 
31 | def run_celery_lg_data_worker():
32 |     os.system(
33 |         u'env/bin/celery worker -A webspider.tasks.celery_app -Q lg_data -n lg_data_worker --loglevel=debug')
34 | 
35 | 
36 | def run_celery_lg_jobs_data_worker():
37 |     os.system(
38 |         u'env/bin/celery worker -A webspider.tasks.celery_app -Q lg_jobs_data -n lg_jobs_data_worker --loglevel=debug')
39 | 
40 | 
41 | def run_celery_lg_jobs_count_worker():
42 |     os.system(
43 |         u'env/bin/celery worker -A webspider.tasks.celery_app -Q lg_jobs_count -n lg_jobs_count_worker --loglevel=debug ')
44 | 
45 | 
46 | def run_celery_beat():
47 |     os.system(u'env/bin/celery -A webspider.tasks.celery_app beat --loglevel=debug')
48 | 
49 | 
50 | def run_celery_flower():
51 |     os.system(u'env/bin/celery flower --broker=redis://localhost:6379/0 --broker_api=redis://localhost:6379/0')
52 | 


--------------------------------------------------------------------------------
/webspider/setting.py:
--------------------------------------------------------------------------------
 1 | # coding=utf-8
 2 | import os
 3 | 
 4 | BASE_DIR = os.path.dirname(os.path.abspath(__file__))
 5 | 
 6 | # smtp
 7 | SMTP_HOST = os.environ.get('SMTP_HOST')
 8 | SMTP_PORT = os.environ.get('SMTP_PORT')
 9 | 
10 | # email
11 | MAIL_USER_NAME = os.environ.get('MAIL_USER_NAME')
12 | MAIL_USER_PASSWORD = os.environ.get('MAIL_USER_PASSWORD')
13 | FROM_EMAIL_ADDRESS = os.environ.get('FROM_EMAIL_ADDRESS')
14 | TO_EMAIL_ADDRESS = os.environ.get('TO_EMAIL_ADDRESS')
15 | 
16 | # MYSQL
17 | MYSQL_USERNAME = os.environ.get('MYSQL_USERNAME', 'root')
18 | MYSQL_PASSWORD = os.environ.get('MYSQL_PASSWORD', '')
19 | DB_HOST = os.environ.get('DB_HOST', 'localhost')
20 | DB_PORT = os.environ.get('DB_PORT', '3306')
21 | DB_NAME = os.environ.get('DB_NAME', 'spider')
22 | DB_CONNECT_STRING_FORMAT = 'mysql+mysqldb://{username}:{password}@{db_host}:{db_port}/{db_name}?charset=utf8mb4'
23 | 
24 | # REDIS
25 | REDIS_HOST = os.environ.get('DB_HOST', 'localhost')
26 | REDIS_PORT = os.environ.get('DB_PORT', '6379')
27 | 
28 | # MYSQL 配置
29 | MYSQL_CONF = {
30 |     'connect_string': DB_CONNECT_STRING_FORMAT.format(
31 |         username=MYSQL_USERNAME,
32 |         password=MYSQL_PASSWORD,
33 |         db_host=DB_HOST,
34 |         db_port=DB_PORT,
35 |         db_name=DB_NAME
36 |     ),
37 |     'host': DB_HOST,
38 |     'port': DB_PORT,
39 |     'username': MYSQL_USERNAME,
40 |     'password': MYSQL_PASSWORD,
41 | }
42 | 
43 | SMTP_CONF = {
44 |     'host': SMTP_HOST,
45 |     'port': SMTP_PORT,
46 |     'from_email': FROM_EMAIL_ADDRESS,
47 |     'to_email': TO_EMAIL_ADDRESS,
48 | }
49 | 
50 | MAIL_CONF = {
51 |     'username': MAIL_USER_NAME,
52 |     'password': MAIL_USER_PASSWORD,
53 | }
54 | 
55 | REDIS_CONF = {
56 |     'host': REDIS_HOST,
57 |     'port': REDIS_PORT
58 | }
59 | 


--------------------------------------------------------------------------------
/webspider/tasks/__init__.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | 


--------------------------------------------------------------------------------
/webspider/tasks/actor/__init__.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | 


--------------------------------------------------------------------------------
/webspider/tasks/actor/keyword_statistic.py:
--------------------------------------------------------------------------------
 1 | # coding=utf-8
 2 | import logging
 3 | import json
 4 | 
 5 | from webspider.tasks.celery_app import celery_app
 6 | from webspider.controllers import keyword_statistic_ctl
 7 | from webspider.models import (KeywordModel, JobModel, JobKeywordModel, KeywordStatisticModel)
 8 | 
 9 | logger = logging.getLogger(__name__)
10 | 
11 | 
12 | @celery_app.task()
13 | def update_keywords_statistic_task():
14 |     """更新关键词统计任务"""
15 |     keywords = KeywordModel.list()
16 |     for keyword in keywords:
17 |         update_single_keyword_statistic_task.delay(keyword.id)
18 | 
19 | 
20 | @celery_app.task()
21 | def update_single_keyword_statistic_task(keyword_id):
22 |     """更新关键词统计任务"""
23 | 
24 |     job_keywords = JobKeywordModel.list(filter_by={'keyword_id': keyword_id})
25 |     jobs = JobModel.list(filter=(JobModel.id.in_([job_keyword.job_id for job_keyword in job_keywords])))
26 |     if not jobs:
27 |         return
28 | 
29 |     educations_statistic = keyword_statistic_ctl.get_educations_statistic(jobs=jobs)
30 |     finance_stage_statistic = keyword_statistic_ctl.get_finance_stage_statistic(jobs=jobs)
31 |     city_jobs_count_statistic = keyword_statistic_ctl.get_city_jobs_count_statistic(jobs=jobs)
32 |     salary_statistic = keyword_statistic_ctl.get_salary_statistic(jobs=jobs)
33 |     work_years_statistic = keyword_statistic_ctl.get_work_years_statistic(jobs=jobs)
34 | 
35 |     statistic_values = dict(
36 |         keyword_id=keyword_id,
37 |         educations=json.dumps(educations_statistic),
38 |         city_jobs_count=json.dumps(city_jobs_count_statistic),
39 |         salary=json.dumps(salary_statistic),
40 |         financing_stage=json.dumps(finance_stage_statistic),
41 |         work_years=json.dumps(work_years_statistic)
42 |     )
43 | 
44 |     if KeywordStatisticModel.is_exist(filter_by={'keyword_id': keyword_id}):
45 |         KeywordStatisticModel.update(filter_by={'keyword_id': keyword_id}, values=statistic_values)
46 |     else:
47 |         KeywordStatisticModel.add(**statistic_values)
48 | 


--------------------------------------------------------------------------------
/webspider/tasks/actor/lagou_data.py:
--------------------------------------------------------------------------------
  1 | # coding=utf-8
  2 | import logging
  3 | 
  4 | from webspider import utils
  5 | from webspider import crawlers
  6 | from webspider import constants
  7 | from webspider.utils.cache import redis_instance
  8 | from webspider.tasks.celery_app import celery_app
  9 | from webspider.controllers import industry_ctl, keyword_ctl, city_ctl
 10 | from webspider.models import (CityModel, CompanyModel,
 11 |                               CompanyIndustryModel, JobModel, JobKeywordModel)
 12 | 
 13 | logger = logging.getLogger(__name__)
 14 | 
 15 | 
 16 | @celery_app.task()
 17 | def crawl_lg_data_task():
 18 |     """爬取数据任务"""
 19 | 
 20 |     # 清除抓取记录
 21 |     keys = redis_instance.keys('crawled_company_jobs*')
 22 |     if keys:
 23 |         redis_instance.delete(*keys)
 24 | 
 25 |     crawl_lg_city_data_task.delay()
 26 |     # 目前只抓取这几个城市 全国:0, 北京:2 上海:3 杭州:6 深圳:215 广州:213 成都:252
 27 |     lg_city_ids = [2, 3, 6, 215, 213, 252]
 28 |     lg_finance_stage_ids = [1, 2, 3, 4, 5, 6, 7, 8]
 29 |     lg_industry_ids = [24, 25, 33, 27, 29, 45, 31, 28,
 30 |                        47, 34, 35, 43, 32, 41, 26, 48, 38, 49, 10594]
 31 |     # 爬取公司数据
 32 |     for industry_id in lg_industry_ids:
 33 |         for city_id in lg_city_ids:
 34 |             for finance_stage_id in lg_finance_stage_ids:
 35 |                 crawl_lg_company_data_task.delay(city_id=city_id, finance_stage_id=finance_stage_id,
 36 |                                                  industry_id=industry_id)
 37 | 
 38 | 
 39 | @celery_app.task()
 40 | def crawl_lg_city_data_task():
 41 |     """爬取城市数据任务"""
 42 |     city_dicts = crawlers.get_cites_from_lg()
 43 |     for city_dict in city_dicts:
 44 |         if CityModel.is_exist(filter_by={'id': city_dict.id}):
 45 |             CityModel.update_by_pk(pk=city_dict.id, values=city_dict)
 46 |         else:
 47 |             CityModel.add(**city_dict)
 48 | 
 49 | 
 50 | @celery_app.task()
 51 | def crawl_lg_company_data_task(city_id, finance_stage_id, industry_id):
 52 |     """爬取公司数据任务"""
 53 |     companies_pagination = crawlers.get_companies_pagination_from_lg(city_id=city_id,
 54 |                                                                      finance_stage_id=finance_stage_id,
 55 |                                                                      industry_id=industry_id)
 56 |     for page_no in companies_pagination.iter_pages:
 57 |         company_dicts = crawlers.get_companies_from_lg(city_id=city_id,
 58 |                                                        finance_stage_id=finance_stage_id,
 59 |                                                        industry_id=industry_id,
 60 |                                                        page_no=page_no)
 61 |         if not company_dicts:
 62 |             break
 63 |         for company_dict in company_dicts:
 64 |             crawlers.clean_lg_company_data(company_dict)
 65 |             utils.convert.convert_dict_field_to_constants(company_dict)
 66 | 
 67 |             industries = company_dict.pop('industries')
 68 |             city_name = company_dict.pop('city_name')
 69 | 
 70 |             city_ctl.insert_city_if_not_exist(city_name)
 71 |             company_dict['city_id'] = city_ctl.get_city_id_by_name(city_name)
 72 | 
 73 |             company = CompanyModel.get_one(
 74 |                 filter_by={'lg_company_id': company_dict.lg_company_id})
 75 |             if company:
 76 |                 CompanyModel.update_by_pk(pk=company.id, values=company_dict)
 77 |             else:
 78 |                 company_id = CompanyModel.add(**company_dict)
 79 | 
 80 |                 for industry in industries:
 81 |                     industry_ctl.insert_industry_if_not_exist(name=industry)
 82 |                     industry_id = industry_ctl.get_industry_id_by_name(name=industry)
 83 |                     CompanyIndustryModel.add(industry_id=industry_id, company_id=company_id)
 84 | 
 85 |             crawl_lg_job_data_task.delay(company_dict.lg_company_id)
 86 | 
 87 | 
 88 | @celery_app.task()
 89 | def crawl_lg_job_data_task(lg_company_id):
 90 |     """爬取职位数据任务"""
 91 |     # 过滤本轮已经爬取过职位的公司
 92 |     if not redis_instance.setnx(constants.CRAWLED_COMPANY_JOBS_REDIS_KEY.format(lg_company_id=lg_company_id), 1):
 93 |         return
 94 |     jobs_pagination = crawlers.get_jobs_pagination_from_lg(lg_company_id=lg_company_id,
 95 |                                                            job_type=constants.LGJobType.technology)
 96 |     for page_no in jobs_pagination.iter_pages:
 97 |         job_dicts = crawlers.get_jobs_from_lg(lg_company_id=lg_company_id,
 98 |                                               job_type=constants.LGJobType.technology,
 99 |                                               page_no=page_no)
100 |         if not job_dicts:
101 |             break
102 |         for job_dict in job_dicts:
103 |             crawlers.clean_lg_job_data(job_dict)
104 |             utils.convert.convert_dict_field_to_constants(job_dict)
105 | 
106 |             keywords = job_dict.pop('keywords')
107 |             city_name = job_dict.pop('city_name')
108 | 
109 |             city_ctl.insert_city_if_not_exist(city_name)
110 |             job_dict['city_id'] = city_ctl.get_city_id_by_name(city_name)
111 |             company = CompanyModel.get_one(filter_by={'lg_company_id': lg_company_id})
112 |             job_dict['company_id'] = company.id
113 | 
114 |             job = JobModel.get_one(filter_by={'lg_job_id': job_dict.lg_job_id})
115 |             if job:
116 |                 JobModel.update_by_pk(pk=job.id, values=job_dict)
117 |             else:
118 |                 job_id = JobModel.add(**job_dict)
119 | 
120 |                 for keyword in keywords:
121 |                     keyword_ctl.insert_keyword_if_not_exist(name=keyword)
122 |                     keyword_id = keyword_ctl.get_keyword_id_by_name(name=keyword)
123 |                     JobKeywordModel.add(keyword_id=keyword_id, job_id=job_id)
124 | 


--------------------------------------------------------------------------------
/webspider/tasks/actor/lagou_jobs_count.py:
--------------------------------------------------------------------------------
 1 | # coding=utf-8
 2 | import logging
 3 | from datetime import datetime
 4 | 
 5 | from webspider import crawlers
 6 | from webspider.tasks.celery_app import celery_app
 7 | from webspider.controllers import keyword_ctl, job_keyword_ctl
 8 | from webspider.models import JobsCountModel
 9 | 
10 | logger = logging.getLogger(__name__)
11 | 
12 | 
13 | @celery_app.task()
14 | def crawl_lg_jobs_count_task():
15 |     keyword_ids = job_keyword_ctl.get_most_frequently_keyword_ids(limit=1000)
16 |     for keyword_id in keyword_ids:
17 |         crawl_lg_keyword_jobs_count_task.delay(keyword_id)
18 | 
19 | 
20 | @celery_app.task()
21 | def crawl_lg_keyword_jobs_count_task(keyword_id):
22 |     cities_name_map = {
23 |         'all_city': u'全国',
24 |         'beijing': u'北京',
25 |         'shanghai': u'上海',
26 |         'guangzhou': u'广州',
27 |         'shenzhen': u'深圳',
28 |         'hangzhou': u'杭州',
29 |         'chengdu': u'成都',
30 |     }
31 |     keyword_name = keyword_ctl.get_keyword_name_by_id(keyword_id)
32 |     jobs_count_dict = dict(keyword_id=keyword_id)
33 |     for city_name_key, city_name in cities_name_map.items():
34 |         jobs_count_dict[city_name_key] = crawlers.get_jobs_count_from_lg(city_name=city_name,
35 |                                                                          keyword_name=keyword_name)
36 |     jobs_count_dict['date'] = int(datetime.today().strftime('%Y%m%d'))
37 | 
38 |     JobsCountModel.add(**jobs_count_dict)
39 | 


--------------------------------------------------------------------------------
/webspider/tasks/celery_app.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | 
3 | from celery import Celery
4 | 
5 | celery_app = Celery('tasks')
6 | celery_app.config_from_object('webspider.tasks.celery_config')
7 | 


--------------------------------------------------------------------------------
/webspider/tasks/celery_config.py:
--------------------------------------------------------------------------------
 1 | # coding=utf-8
 2 | from kombu import Queue
 3 | from kombu import Exchange
 4 | 
 5 | from celery.schedules import crontab
 6 | 
 7 | BROKER_URL = 'redis://127.0.0.1:6379'  # 指定 Broker
 8 | 
 9 | CELERY_RESULT_BACKEND = 'redis://127.0.0.1:6379/0'  # 指定 Backend
10 | 
11 | CELERY_CREATE_MISSING_QUEUES = True  # 某个程序中出现的队列，在broker中不存在，则立刻创建它
12 | 
13 | CELERY_TIMEZONE = 'Asia/Shanghai'  # 指定时区，默认是 UTC
14 | 
15 | CELERYD_CONCURRENCY = 2  # 并发worker数
16 | 
17 | CELERY_ENABLE_UTC = False
18 | 
19 | CELERYD_FORCE_EXECV = True  # 强制退出
20 | 
21 | CELERY_TASK_SERIALIZER = 'json'  # 任务序列化和反序列化
22 | 
23 | CELERY_RESULT_SERIALIZER = 'json'  # 读取任务结果一般性能要求不高，所以使用了可读性更好的JSON
24 | 
25 | CELERY_IGNORE_RESULT = True  # 忽略任务结果
26 | 
27 | # CELERY_TASK_RESULT_EXPIRES = 60 * 60 * 1  # 任务结果过期时间
28 | 
29 | CELERY_IMPORTS = (  # 指定导入的任务模块
30 |     'webspider.tasks.actor.lg_data',
31 |     'webspider.tasks.actor.lg_jobs_count',
32 |     'webspider.tasks.actor.keyword_statistic',
33 | )
34 | 
35 | CELERY_TASK_PUBLISH_RETRY = False  # 重试
36 | 
37 | CELERYBEAT_SCHEDULE = {
38 |     'crawl_lg_jobs_count_task': {
39 |         'task': 'webspider.tasks.actor.lg_jobs_count.crawl_lg_jobs_count_task',
40 |         'schedule': crontab(hour='01', minute='01', day_of_week='2, 5'),
41 |     },
42 |     'crawl_lg_data_task': {
43 |         'task': 'webspider.tasks.actor.lg_data.crawl_lg_data_task',
44 |         'schedule': crontab(hour='01', minute='01', day_of_month='1'),
45 |     },
46 |     'update_keyword_statistic': {
47 |         'task': 'webspider.tasks.actor.keyword_statistic.update_keywords_statistic_task',
48 |         'schedule': crontab(hour='01', minute='01', day_of_week='1, 4'),
49 |     },
50 | }
51 | 
52 | default_exchange = Exchange('default', type='direct')
53 | lg_exchange = Exchange('lg', type='direct')
54 | 
55 | CELERY_QUEUES = (
56 |     Queue(name='default', exchange=default_exchange, routing_key='default'),
57 |     Queue(name='lg_data', exchange=lg_exchange, routing_key='for_lg_data'),
58 |     Queue(name='lg_jobs_data', exchange=lg_exchange, routing_key='for_lg_jobs_data'),
59 |     Queue(name='lg_jobs_count', exchange=lg_exchange, routing_key='for_lg_jobs_count'),
60 | )
61 | 
62 | CELERY_ROUTES = {
63 |     'webspider.tasks.actor.lg_data.crawl_lg_job_data_task': {'exchange': 'lg',
64 |                                                              'routing_key': 'for_lg_jobs_data'},
65 |     'webspider.tasks.actor.lg_jobs_count.*': {'exchange': 'lg', 'routing_key': 'for_lg_jobs_count'},
66 |     'webspider.tasks.actor.lg_data.*': {'exchange': 'lg', 'routing_key': 'for_lg_data'},
67 |     'webspider.tasks.actor.keyword_statistic.*': {'exchange': 'default', 'routing_key': 'default'}
68 | }
69 | 


--------------------------------------------------------------------------------
/webspider/utils/__init__.py:
--------------------------------------------------------------------------------
 1 | # coding=utf-8
 2 | from webspider.utils import cache
 3 | from webspider.utils import classproperty
 4 | from webspider.utils import common
 5 | from webspider.utils import convert
 6 | from webspider.utils import http_tools
 7 | from webspider.utils import log
 8 | from webspider.utils import pagination
 9 | from webspider.utils import sql
10 | from webspider.utils import text
11 | from webspider.utils import time_tools
12 | 
13 | __all__ = ['cache', 'classproperty', 'common', 'convert', 'http_tools', 'log', 'pagination', 'sql', 'text',
14 |            'time_tools']
15 | 


--------------------------------------------------------------------------------
/webspider/utils/cache.py:
--------------------------------------------------------------------------------
 1 | # coding=utf-8
 2 | import logging
 3 | import pickle
 4 | from functools import wraps
 5 | 
 6 | import redis
 7 | 
 8 | from webspider import setting
 9 | 
10 | redis_pool = redis.ConnectionPool(host=setting.REDIS_CONF['host'],
11 |                                   port=setting.REDIS_CONF['port'])
12 | redis_instance = redis.Redis(connection_pool=redis_pool)
13 | 
14 | 
15 | def simple_cache(ex=None):
16 |     """利用 redis 进行缓存，暂不支持 kwargs 类型的参数传入方式"""
17 | 
18 |     def decorator(func):
19 |         @wraps(func)
20 |         def wrapper(*args, **kwargs):
21 |             if kwargs:
22 |                 raise ValueError(
23 |                     "args key generator does not accept kwargs arguments")
24 |             redis_key = func.__name__ + '(' + ','.join(map(str, args)) + ')'
25 |             result = redis_instance.get(redis_key)
26 |             if result:
27 |                 logging.debug('cache: get func result from redis key - {}'.format(redis_key))
28 |                 result = pickle.loads(result)
29 |             else:
30 |                 logging.debug('cache: get func result from func key - {}'.format(redis_key))
31 |                 result = func(*args)
32 |                 redis_instance.set(name=redis_key, value=pickle.dumps(result), ex=ex)
33 |             return result
34 | 
35 |         return wrapper
36 | 
37 |     return decorator
38 | 
39 | 
40 | def cache_clear(func, *args):
41 |     """失效缓存"""
42 |     redis_key = func.__name__
43 |     if args:
44 |         redis_key += ('(' + ','.join(map(str, args)) + ')')
45 |     logging.info('remove cache redis-key: {}'.format(redis_key))
46 |     keys = redis_instance.keys('*' + redis_key + '*')
47 |     if keys:
48 |         remove_count = redis_instance.delete(*keys)
49 |         logging.info('cache clear count {}'.format(remove_count))
50 |         return remove_count
51 | 


--------------------------------------------------------------------------------
/webspider/utils/classproperty.py:
--------------------------------------------------------------------------------
 1 | # coding: utf-8
 2 | 
 3 | 
 4 | class ClassPropertyDescriptor(object):
 5 |     """类属性"""
 6 |     def __init__(self, fget, fset=None):
 7 |         self.fget = fget
 8 |         self.fset = fset
 9 | 
10 |     def __get__(self, obj, obj_type=None):
11 |         if obj_type is None:
12 |             obj_type = type(obj)
13 |         return self.fget.__get__(obj, obj_type)()
14 | 
15 |     def __set__(self, obj, value):
16 |         raise AttributeError("can't set attribute")
17 | 
18 | 
19 | def classproperty(func):
20 |     if not isinstance(func, (classmethod, staticmethod)):
21 |         func = classmethod(func)
22 | 
23 |     return ClassPropertyDescriptor(func)
24 | 


--------------------------------------------------------------------------------
/webspider/utils/common.py:
--------------------------------------------------------------------------------
 1 | # coding=utf-8
 2 | import logging
 3 | from collections import Counter
 4 | 
 5 | logger = logging.getLogger(__name__)
 6 | 
 7 | 
 8 | def get_key_from_dict_by_value(value, dictionary):
 9 |     keys = [_key for (_key, _value) in dictionary.items() if _value == value]
10 |     if not keys:
11 |         raise ValueError(u'can not get key from dict by value {}'.format(value))
12 |     if len(keys) > 1:
13 |         raise AttributeError(u'get multi keys from dict by value {}'.format(value))
14 |     return keys[0]
15 | 
16 | 
17 | def get_field_statistics(values, constants_dict):
18 |     """
19 |     获得某批数据的统计情况
20 |     eg:
21 |         >>>get_field_statistics([0, 0, 0, 1, 1], {'男': 0, '女': 1})
22 |         >>>{'男':3, '女':2}
23 | 
24 |     :param values: list[int], field values list
25 |     :param constants_dict: Dict
26 |     :return: collections.Counter
27 |     """
28 |     statistics_counter = Counter()
29 |     for value in values:
30 |         field_name = get_key_from_dict_by_value(value=value, dictionary=constants_dict)
31 |         statistics_counter[field_name] += 1
32 |     return statistics_counter
33 | 


--------------------------------------------------------------------------------
/webspider/utils/convert.py:
--------------------------------------------------------------------------------
 1 | # coding=utf-8
 2 | import logging
 3 | 
 4 | from webspider import constants
 5 | 
 6 | logger = logging.getLogger(__name__)
 7 | 
 8 | CONSTANTS_MAP = {
 9 |     'finance_stage': constants.FINANCE_STAGE_DICT,
10 |     'nature': constants.JOB_NATURE_DICT,
11 |     'work_year': constants.WORK_YEARS_REQUEST_DICT,
12 |     'education': constants.EDUCATION_REQUEST_DICT,
13 |     'size': constants.COMPANY_SIZE_DICT,
14 | }
15 | 
16 | 
17 | def convert_dict_field_to_constants(to_converted_dict, constants_map=CONSTANTS_MAP):
18 |     """
19 |     把dict的字段转换为相应常量
20 |     :param to_converted_dict: 需要转换的字典
21 |     :param constants_map: 字段常量对应关系
22 |     :return: 转换后的字段
23 |     """
24 |     for field_name, field_value in to_converted_dict.items():
25 |         if field_name in constants_map:
26 |             to_converted_dict[field_name] = convert_field_to_constants(field_name, field_value, constants_map)
27 | 
28 | 
29 | def convert_field_to_constants(field_name, field_value, constants_map=CONSTANTS_MAP):
30 |     """
31 |     把字段转化为相应的常量, 如果无法转换，返回 -1
32 | 
33 |     eg:
34 |         convert_field_to_constants(field_name='size', field_value='2000人以上', constants_map={'size': {'2000人以上': 1}})
35 |         return: 1
36 |     :param field_name: 字段名
37 |     :param field_value: 字段值
38 |     :param constants_map: 字段常量对应关系
39 |     :rtype: int
40 |     """
41 |     if field_name not in constants_map:
42 |         raise ValueError(u'can not find the field in constants_map, field name is {}'.find(field_name))
43 | 
44 |     field_constant_map = constants_map[field_name]
45 | 
46 |     if field_value in field_constant_map:
47 |         return field_constant_map[field_value]
48 |     else:
49 |         logger.error('error {field_name}, value is {field_value}'.format(field_name=field_name,
50 |                                                                          field_value=field_value))
51 |         return field_constant_map['unknown'] if 'unknown' in field_constant_map else -1
52 | 


--------------------------------------------------------------------------------
/webspider/utils/http_tools.py:
--------------------------------------------------------------------------------
 1 | # coding=utf-8
 2 | import time
 3 | import random
 4 | 
 5 | import requests
 6 | from retrying import retry
 7 | 
 8 | from webspider import constants
 9 | 
10 | 
11 | def generate_http_request_headers(referer=None):
12 |     """构造 HTTP 请求头"""
13 |     header = constants.HTTP_HEADER
14 |     header['User-Agent'] = random.choice(constants.USER_AGENT_LIST)
15 |     if referer:
16 |         header['Referer'] = referer
17 |     return header
18 | 
19 | 
20 | @retry(stop_max_attempt_number=constants.RETRY_TIMES, stop_max_delay=constants.STOP_MAX_DELAY,
21 |        wait_fixed=constants.WAIT_FIXED)
22 | def requests_get(url, params=None, headers=None, allow_redirects=False, timeout=constants.REQUEST_TIMEOUT,
23 |                  need_sleep=True, **kwargs):
24 |     if need_sleep:
25 |         time.sleep(random.randint(constants.MIN_SLEEP_SECS, constants.MAX_SLEEP_SECS))
26 |     if not headers:
27 |         headers = generate_http_request_headers()
28 |     return requests.get(url=url, params=params, headers=headers, allow_redirects=allow_redirects,
29 |                         timeout=timeout, **kwargs)
30 | 
31 | 
32 | @retry(stop_max_attempt_number=constants.RETRY_TIMES, stop_max_delay=constants.STOP_MAX_DELAY,
33 |        wait_fixed=constants.WAIT_FIXED)
34 | def requests_post(url, data=None, params=None, headers=None, allow_redirects=False, timeout=constants.REQUEST_TIMEOUT,
35 |                   need_sleep=True, **kwargs):
36 |     if need_sleep:
37 |         time.sleep(random.randint(constants.MIN_SLEEP_SECS, constants.MAX_SLEEP_SECS))
38 |     if not headers:
39 |         headers = generate_http_request_headers()
40 |     return requests.post(url=url, data=data, params=params, headers=headers, allow_redirects=allow_redirects,
41 |                          timeout=timeout, **kwargs)
42 | 


--------------------------------------------------------------------------------
/webspider/utils/log.py:
--------------------------------------------------------------------------------
 1 | # coding: utf-8
 2 | import os
 3 | import logging.config
 4 | 
 5 | from webspider import setting
 6 | 
 7 | LOG_FILE_PATH = os.path.join(setting.BASE_DIR, 'log', 'spider_log.txt')
 8 | 
 9 | LOGGING_CONFIG = {
10 |     'version': 1,
11 |     'disable_existing_loggers': True,
12 | 
13 |     'formatters': {
14 |         'default': {
15 |             'format': '%(asctime)s- %(module)s:%(lineno)d [%(levelname)1.1s] %(name)s: %(message)s',
16 |             'datefmt': '%Y/%m/%d %H:%M:%S'
17 |         },
18 |     },
19 | 
20 |     'handlers': {
21 |         'console': {
22 |             'level': 'DEBUG',
23 |             'formatter': 'default',
24 |             'class': 'logging.StreamHandler'
25 |         },
26 |         'smtp': {
27 |             'level': 'ERROR',
28 |             'class': 'logging.handlers.SMTPHandler',
29 |             'formatter': 'default',
30 |             'mailhost': (setting.SMTP_CONF['host'], setting.SMTP_CONF['port']),
31 |             'fromaddr': setting.SMTP_CONF['from_email'],
32 |             'toaddrs': [setting.SMTP_CONF['to_email'], ],
33 |             'subject': '爬虫系统出现异常',
34 |             'credentials': (setting.MAIL_CONF['username'], setting.MAIL_CONF['password'])
35 |         },
36 |         'file': {
37 |             'level': 'ERROR',
38 |             'formatter': 'default',
39 |             'class': 'logging.handlers.RotatingFileHandler',
40 |             'filename': LOG_FILE_PATH,
41 |             'encoding': 'utf8'
42 |         },
43 |     },
44 | 
45 |     'loggers': {
46 |         '': {
47 |             'handlers': ['console', 'file'],
48 |             'level': 'DEBUG',
49 |             'propagate': False,
50 |         },
51 |         'webspider': {
52 |             'handlers': ['console', 'file'],
53 |             'level': 'DEBUG',
54 |             'propagate': False,
55 |         },
56 |         'tornado': {
57 |             'handlers': ['console', 'file'],
58 |             'level': 'DEBUG',
59 |             'propagate': False,
60 |         },
61 |         'tornado.access': {
62 |             'handlers': ['console', 'file'],
63 |             'level': 'INFO',
64 |             'propagate': False,
65 |         },
66 |         'tornado.application': {
67 |             'handlers': ['console', 'file'],
68 |             'level': 'INFO',
69 |             'propagate': False,
70 |         },
71 |         'tornado.general': {
72 |             'handlers': ['console', 'file'],
73 |             'propagate': False,
74 |             'level': 'INFO',
75 |         },
76 |         'sqlalchemy.engine': {
77 |             'handlers': ['console', 'file'],
78 |             'level': 'INFO',
79 |             'propagate': False,
80 |         },
81 |         'gunicorn': {
82 |             'handlers': ['console', 'file'],
83 |             'level': 'INFO',
84 |             'propagate': False,
85 |         },
86 |         'celery': {
87 |             'handlers': ['console', 'file'],
88 |             'level': 'DEBUG',
89 |             'propagate': False,
90 |         },
91 |     },
92 | }
93 | 
94 | 
95 | def config_logging():
96 |     """配置日志"""
97 |     logging.config.dictConfig(LOGGING_CONFIG)
98 | 


--------------------------------------------------------------------------------
/webspider/utils/pagination.py:
--------------------------------------------------------------------------------
 1 | # coding=utf-8
 2 | from math import ceil
 3 | 
 4 | 
 5 | class Pagination(object):
 6 |     """分页"""
 7 |     def __init__(self, page=1, per_page=10, total=0):
 8 |         self.page = page
 9 |         self.per_page = per_page
10 |         self.total = total
11 | 
12 |     @property
13 |     def pages(self):
14 |         if self.per_page == 0:
15 |             pages = 0
16 |         else:
17 |             pages = int(ceil(self.total / float(self.per_page)))
18 |         return pages
19 | 
20 |     @property
21 |     def prev_num(self):
22 |         if not self.has_prev:
23 |             return None
24 |         return self.page - 1
25 | 
26 |     @property
27 |     def has_prev(self):
28 |         return self.page > 1
29 | 
30 |     @property
31 |     def has_next(self):
32 |         return self.page < self.pages
33 | 
34 |     @property
35 |     def next_num(self):
36 |         if not self.has_next:
37 |             return None
38 |         return self.page + 1
39 | 
40 |     @property
41 |     def iter_pages(self):
42 |         for num in range(1, self.pages + 1):
43 |             yield num
44 | 


--------------------------------------------------------------------------------
/webspider/utils/sql.py:
--------------------------------------------------------------------------------
 1 | # coding: utf-8
 2 | import logging
 3 | 
 4 | from sqlalchemy import create_engine
 5 | from sqlalchemy.orm import sessionmaker, scoped_session
 6 | 
 7 | from webspider import setting
 8 | from webspider import constants
 9 | 
10 | __all__ = ['get_session', 'remove_sessions', 'db_engine']
11 | 
12 | logger = logging.getLogger(__name__)
13 | 
14 | db_engine = create_engine(
15 |     setting.MYSQL_CONF['connect_string'],
16 |     echo=constants.DEBUG, max_overflow=48,
17 |     pool_timeout=0, pool_recycle=3600,
18 |     logging_name='sql')
19 | 
20 | _session = scoped_session(sessionmaker(bind=db_engine, autocommit=True, autoflush=True))
21 | 
22 | 
23 | def get_session():
24 |     return _session
25 | 
26 | 
27 | def remove_sessions():
28 |     _session.remove()
29 | 


--------------------------------------------------------------------------------
/webspider/utils/text.py:
--------------------------------------------------------------------------------
 1 | # coding=utf-8
 2 | import re
 3 | 
 4 | 
 5 | def to_plaintext(content, pattern=r'<br/?>|\n', strip=True):
 6 |     """
 7 |     根据 pattern 过滤文本
 8 |     :param content: 需要过滤的文本
 9 |     :param pattern: 需要过滤内容的正则表达式
10 |     :param strip: 是否去掉首尾空格
11 |     :return:
12 |     """
13 |     plaintext = re.sub(pattern=pattern, repl='', string=content)
14 |     if strip:
15 |         plaintext = plaintext.strip()
16 |     return plaintext
17 | 


--------------------------------------------------------------------------------
/webspider/utils/time_tools.py:
--------------------------------------------------------------------------------
 1 | # coding: utf-8
 2 | import time
 3 | import datetime as datetime
 4 | 
 5 | 
 6 | def datetime_to_timestamp(datetime_obj):
 7 |     return int(time.mktime(datetime_obj.timetuple()))
 8 | 
 9 | 
10 | def timestamp_to_datetime(timestamp):
11 |     return datetime.datetime.fromtimestamp(timestamp)
12 | 
13 | 
14 | def timestamp_to_datetime_str(ts, time_format=None):
15 |     """
16 |     时间戳转化为日期字符串(1476547200->'2016-10-16')
17 |     :param ts: 时间戳
18 |     :param time_format: '日期格式'
19 |     :return: 日期字符串
20 |     """
21 |     if time_format is None or time_format == '':
22 |         time_format = '%Y-%m-%d'
23 |     ts = time.localtime(float(ts))
24 |     return time.strftime(time_format, ts)
25 | 


--------------------------------------------------------------------------------
/webspider/web/__init__.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | 


--------------------------------------------------------------------------------
/webspider/web/app.py:
--------------------------------------------------------------------------------
 1 | # !/usr/bin/env python
 2 | # -*- coding: utf-8 -*-
 3 | # flake8: noqa
 4 | 
 5 | import os
 6 | import logging.config
 7 | 
 8 | import tornado
 9 | import tornado.web
10 | import tornado.ioloop
11 | import tornado.httpserver
12 | from tornado.options import options, define, parse_command_line
13 | from tornado.wsgi import WSGIAdapter
14 | 
15 | from webspider import constants
16 | from webspider.web.urls import url_handlers
17 | from webspider.utils.log import config_logging
18 | 
19 | config_logging()
20 | logger = logging.getLogger(__name__)
21 | 
22 | 
23 | def make_wsgi_app():
24 |     web_app = make_web_app()
25 |     return tornado.wsgi.WSGIAdapter(web_app)
26 | 
27 | 
28 | def make_web_app():
29 |     settings = {
30 |         'debug': constants.DEBUG,
31 |         'template_path': os.path.join(
32 |             os.path.dirname(__file__), "templates"
33 |         ),
34 |         'static_path': os.path.join(
35 |             os.path.dirname(__file__), 'static'
36 |         )
37 |     }
38 | 
39 |     app = tornado.web.Application(url_handlers, **settings)
40 |     return app
41 | 
42 | 
43 | def main():
44 |     define(name='port', default=8000, type=int, help='run on the given port')
45 |     parse_command_line()
46 |     logger.info('====== web server starting at http://0.0.0.0:{} ======'.format(options.port))
47 |     if constants.DEBUG:
48 |         logger.info('debug mode is enabled!!!')
49 | 
50 |     app = make_web_app()
51 |     http_server = tornado.httpserver.HTTPServer(app)
52 |     http_server.listen(options.port)
53 |     http_server.start()
54 | 
55 |     tornado.ioloop.IOLoop.instance().start()
56 | 
57 | 
58 | if __name__ == '__main__':
59 |     main()
60 | 


--------------------------------------------------------------------------------
/webspider/web/formatter/__init__.py:
--------------------------------------------------------------------------------
 1 | # coding=utf-8
 2 | from webspider.models import KeywordStatisticModel, JobsCountModel
 3 | from webspider.web.formatter.jobs_count import JobsCountFormatter
 4 | from webspider.web.formatter.keyword_statistic import KeywordStatisticFormatter
 5 | 
 6 | from webspider.web.formatter.base import Formatter
 7 | 
 8 | formatter_mappings = {
 9 |     JobsCountModel: JobsCountFormatter,
10 |     KeywordStatisticModel: KeywordStatisticFormatter,
11 | }
12 | 
13 | Formatter.register_formatter(formatter_mappings)
14 | 


--------------------------------------------------------------------------------
/webspider/web/formatter/base.py:
--------------------------------------------------------------------------------
 1 | # coding=utf-8
 2 | from tornado.util import ObjectDict
 3 | 
 4 | from webspider.exceptions import DowngradeException
 5 | 
 6 | 
 7 | class Downgrade(object):
 8 |     """降级"""
 9 |     def __init__(self, value):
10 |         self.value = value
11 | 
12 | 
13 | class Field(object):
14 |     """Formatter字段"""
15 |     def __init__(self, name, converter=None, downgrade=None):
16 |         self.name = name
17 |         self.converter = converter
18 |         if downgrade is not None and not isinstance(downgrade, Downgrade):
19 |             raise DowngradeException(u'downgrade must be Downgrade instance')
20 |         self.downgrade = downgrade
21 | 
22 | 
23 | class Formatter(object):
24 |     """Formatter 根据设定的 FORMATTER_MAPS 自动渲染"""
25 |     _FORMATTER_MAPS = {}
26 |     FIELDS = {}
27 | 
28 |     @classmethod
29 |     def register_formatter(cls, mapping):
30 |         cls._FORMATTER_MAPS.update(mapping)
31 | 
32 |     @classmethod
33 |     def format(cls, data):
34 |         if isinstance(data, list):
35 |             return [cls.format(item) for item in data]
36 |         else:
37 |             formatter = cls.get_formatter(data)
38 |             if not formatter:
39 |                 raise ValueError(u'Can not find the formatter by model {}'.format(type(data)))
40 | 
41 |             format_result = ObjectDict()
42 |             for field in formatter.FIELDS:
43 |                 if not isinstance(field, Field):
44 |                     raise ValueError('formatter field must be Field instance')
45 |                 try:
46 |                     value = getattr(data, field.name)
47 |                     # 可再次渲染
48 |                     if isinstance(value, list) or cls.get_formatter(value):
49 |                         value = cls.format(value)
50 |                     if field.converter:
51 |                         value = field.converter(value)
52 |                 except Exception:
53 |                     # Field 设置了降级
54 |                     if field.downgrade:
55 |                         value = field.downgrade.value
56 |                     else:
57 |                         raise
58 |                 format_result[field.name] = value
59 | 
60 |             return format_result
61 | 
62 |     @classmethod
63 |     def get_formatter(cls, data):
64 |         if data in cls._FORMATTER_MAPS:
65 |             return cls._FORMATTER_MAPS[data]
66 |         for model, formatter in cls._FORMATTER_MAPS.items():
67 |             if type(data) is model:
68 |                 return formatter
69 | 


--------------------------------------------------------------------------------
/webspider/web/formatter/jobs_count.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | from webspider import utils
 3 | from webspider.web.formatter.base import Field, Formatter
 4 | 
 5 | 
 6 | class JobsCountFormatter(Formatter):
 7 |     FIELDS = [
 8 |         Field('date'),
 9 |         Field('all_city'),
10 |         Field('beijing'),
11 |         Field('guangzhou'),
12 |         Field('shenzhen'),
13 |         Field('shanghai'),
14 |         Field('hangzhou'),
15 |         Field('chengdu'),
16 |         Field('created_at', converter=utils.time_tools.datetime_to_timestamp),
17 |         Field('updated_at', converter=utils.time_tools.datetime_to_timestamp),
18 |     ]
19 | 


--------------------------------------------------------------------------------
/webspider/web/formatter/keyword_statistic.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | import json
 3 | 
 4 | from webspider import utils
 5 | from webspider.web.formatter.base import Field, Downgrade, Formatter
 6 | 
 7 | 
 8 | class KeywordStatisticFormatter(Formatter):
 9 |     FIELDS = [
10 |         Field('educations', converter=json.loads, downgrade=Downgrade({})),
11 |         Field('city_jobs_count', converter=json.loads, downgrade=Downgrade({})),
12 |         Field('salary', converter=json.loads, downgrade=Downgrade({})),
13 |         Field('financing_stage', converter=json.loads, downgrade=Downgrade({})),
14 |         Field('work_years', converter=json.loads, downgrade=Downgrade({})),
15 |         Field('per_day_jobs_count'),
16 |         Field('created_at', converter=utils.time_tools.datetime_to_timestamp),
17 |         Field('updated_at', converter=utils.time_tools.datetime_to_timestamp),
18 |     ]
19 | 


--------------------------------------------------------------------------------
/webspider/web/handlers/__init__.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | from webspider.web.handlers.keyword_statistics import KeywordStatisticsApiHandler, KeywordStatisticsPageHandler
3 | 
4 | __all__ = [
5 |     'KeywordStatisticsApiHandler',
6 |     'KeywordStatisticsPageHandler'
7 | ]
8 | 


--------------------------------------------------------------------------------
/webspider/web/handlers/base.py:
--------------------------------------------------------------------------------
 1 | # coding: utf-8
 2 | from tornado.escape import json_encode
 3 | from tornado.web import RequestHandler
 4 | 
 5 | from webspider import constants
 6 | from webspider.exceptions import BaseException, ResourceNotFoundWebException
 7 | from webspider.web.formatter import Formatter
 8 | from webspider.utils.sql import remove_sessions
 9 | 
10 | 
11 | class BaseApiHandler(RequestHandler):
12 |     def write_error(self, status_code, **kwargs):
13 |         exception = kwargs['exc_info'][1]
14 | 
15 |         # TODO 后端改成纯 API 后，删除其逻辑
16 |         # 生产环境下, 且请求非 API 接口, 渲染错误页面
17 |         if not constants.DEBUG and isinstance(self, BasePageHandler):
18 |             self._handler_production_page_error(exception)
19 |             return
20 | 
21 |         if isinstance(exception, BaseException):
22 |             self.render_exception(exception)
23 |         else:
24 |             RequestHandler.write_error(self, status_code=status_code, **kwargs)
25 | 
26 |     def auto_render(self, data):
27 |         formatted_dict = Formatter.format(data)
28 |         self.render_json(formatted_dict)
29 | 
30 |     def _handler_production_page_error(self, exception):
31 |         """处理生产环境下页面的错误"""
32 |         if isinstance(exception, ResourceNotFoundWebException):
33 |             self.render('404.html')
34 |         else:
35 |             self.render('500.html')
36 | 
37 |     def render_exception(self, exception):
38 |         self.set_status(
39 |             status_code=exception.STATUS_CODE,
40 |             reason=exception.message
41 |         )
42 |         error_dict = {
43 |             'error': {
44 |                 'code': exception.code,
45 |                 'name': exception.__class__.__name__,
46 |                 'message': exception.message,
47 |                 'data': exception.data if exception.data else '',
48 |                 'debug_message': exception.debug_message if exception.data else ''
49 |             }
50 |         }
51 |         self.render_json(error_dict)
52 | 
53 |     def render_json(self, data):
54 |         self.set_header('Content-Type', 'application/json')
55 |         self.finish(json_encode(data))
56 | 
57 |     def on_finish(self):
58 |         remove_sessions()
59 | 
60 | 
61 | # TODO page to api
62 | class BasePageHandler(BaseApiHandler):
63 |     """前后端代码混合型的页面 Handler"""
64 |     pass
65 | 


--------------------------------------------------------------------------------
/webspider/web/handlers/keyword_statistics.py:
--------------------------------------------------------------------------------
 1 | # coding: utf-8
 2 | import json
 3 | 
 4 | from webspider.web.handlers.base import BasePageHandler, BaseApiHandler
 5 | from webspider.exceptions import ResourceNotFoundWebException
 6 | from webspider.models import KeywordModel, KeywordStatisticModel
 7 | 
 8 | 
 9 | class KeywordStatisticsApiHandler(BaseApiHandler):
10 |     def get(self):
11 |         keyword_name = self.get_argument('keyword_name', '')
12 |         if not keyword_name:
13 |             raise ResourceNotFoundWebException(u'请输入关键词')
14 | 
15 |         keyword = KeywordModel.get_one(filter_by={'name': keyword_name})
16 |         if not keyword:
17 |             raise ResourceNotFoundWebException(u'找不到该关键词')
18 | 
19 |         keyword_statistic = KeywordStatisticModel.get_one(filter_by={'keyword_id': keyword.id})
20 |         if not keyword_statistic:
21 |             raise ResourceNotFoundWebException(u'暂无该关键词的统计结果')
22 | 
23 |         self.auto_render(keyword_statistic)
24 | 
25 | 
26 | class KeywordStatisticsPageHandler(BasePageHandler):
27 |     def get(self):
28 |         keyword_name = self.get_argument('keyword_name', '')
29 |         if not keyword_name:
30 |             raise ResourceNotFoundWebException(u'请输入关键词')
31 | 
32 |         keyword = KeywordModel.get_one(filter_by={'name': keyword_name})
33 |         if not keyword:
34 |             raise ResourceNotFoundWebException(u'找不到该关键词')
35 | 
36 |         keyword_statistic = KeywordStatisticModel.get_one(filter_by={'keyword_id': keyword.id})
37 |         if not keyword_statistic:
38 |             raise ResourceNotFoundWebException(u'暂无该关键词的统计结果')
39 | 
40 |         self.render(
41 |             "statistics.html",
42 |             keyword_name=keyword_name,
43 |             educations_statistic=json.loads(keyword_statistic.educations),
44 |             city_jobs_count_statistic=json.loads(keyword_statistic.city_jobs_count),
45 |             salary_statistic=json.loads(keyword_statistic.salary),
46 |             finance_stage_statistic=json.loads(keyword_statistic.financing_stage),
47 |             work_years_statistic=json.loads(keyword_statistic.work_years),
48 |             per_day_jobs_count_statistic=keyword_statistic.per_day_jobs_count
49 |         )
50 | 


--------------------------------------------------------------------------------
/webspider/web/static/__init__.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | 


--------------------------------------------------------------------------------
/webspider/web/static/bootstrap/css/bootstrap-theme.min.css:
--------------------------------------------------------------------------------
1 | /*!
2 |  * Bootstrap v3.3.7 (http://getbootstrap.com)
3 |  * Copyright 2011-2016 Twitter, Inc.
4 |  * Licensed under MIT (https://github.com/twbs/bootstrap/blob/master/LICENSE)
5 |  */.btn-danger,.btn-default,.btn-info,.btn-primary,.btn-success,.btn-warning{text-shadow:0 -1px 0 rgba(0,0,0,.2);-webkit-box-shadow:inset 0 1px 0 rgba(255,255,255,.15),0 1px 1px rgba(0,0,0,.075);box-shadow:inset 0 1px 0 rgba(255,255,255,.15),0 1px 1px rgba(0,0,0,.075)}.btn-danger.active,.btn-danger:active,.btn-default.active,.btn-default:active,.btn-info.active,.btn-info:active,.btn-primary.active,.btn-primary:active,.btn-success.active,.btn-success:active,.btn-warning.active,.btn-warning:active{-webkit-box-shadow:inset 0 3px 5px rgba(0,0,0,.125);box-shadow:inset 0 3px 5px rgba(0,0,0,.125)}.btn-danger.disabled,.btn-danger[disabled],.btn-default.disabled,.btn-default[disabled],.btn-info.disabled,.btn-info[disabled],.btn-primary.disabled,.btn-primary[disabled],.btn-success.disabled,.btn-success[disabled],.btn-warning.disabled,.btn-warning[disabled],fieldset[disabled] .btn-danger,fieldset[disabled] .btn-default,fieldset[disabled] .btn-info,fieldset[disabled] .btn-primary,fieldset[disabled] .btn-success,fieldset[disabled] .btn-warning{-webkit-box-shadow:none;box-shadow:none}.btn-danger .badge,.btn-default .badge,.btn-info .badge,.btn-primary .badge,.btn-success .badge,.btn-warning .badge{text-shadow:none}.btn.active,.btn:active{background-image:none}.btn-default{text-shadow:0 1px 0 #fff;background-image:-webkit-linear-gradient(top,#fff 0,#e0e0e0 100%);background-image:-o-linear-gradient(top,#fff 0,#e0e0e0 100%);background-image:-webkit-gradient(linear,left top,left bottom,from(#fff),to(#e0e0e0));background-image:linear-gradient(to bottom,#fff 0,#e0e0e0 100%);filter:progid:DXImageTransform.Microsoft.gradient(startColorstr='#ffffffff', endColorstr='#ffe0e0e0', GradientType=0);filter:progid:DXImageTransform.Microsoft.gradient(enabled=false);background-repeat:repeat-x;border-color:#dbdbdb;border-color:#ccc}.btn-default:focus,.btn-default:hover{background-color:#e0e0e0;background-position:0 -15px}.btn-default.active,.btn-default:active{background-color:#e0e0e0;border-color:#dbdbdb}.btn-default.disabled,.btn-default.disabled.active,.btn-default.disabled.focus,.btn-default.disabled:active,.btn-default.disabled:focus,.btn-default.disabled:hover,.btn-default[disabled],.btn-default[disabled].active,.btn-default[disabled].focus,.btn-default[disabled]:active,.btn-default[disabled]:focus,.btn-default[disabled]:hover,fieldset[disabled] .btn-default,fieldset[disabled] .btn-default.active,fieldset[disabled] .btn-default.focus,fieldset[disabled] .btn-default:active,fieldset[disabled] .btn-default:focus,fieldset[disabled] .btn-default:hover{background-color:#e0e0e0;background-image:none}.btn-primary{background-image:-webkit-linear-gradient(top,#337ab7 0,#265a88 100%);background-image:-o-linear-gradient(top,#337ab7 0,#265a88 100%);background-image:-webkit-gradient(linear,left top,left bottom,from(#337ab7),to(#265a88));background-image:linear-gradient(to bottom,#337ab7 0,#265a88 100%);filter:progid:DXImageTransform.Microsoft.gradient(startColorstr='#ff337ab7', endColorstr='#ff265a88', GradientType=0);filter:progid:DXImageTransform.Microsoft.gradient(enabled=false);background-repeat:repeat-x;border-color:#245580}.btn-primary:focus,.btn-primary:hover{background-color:#265a88;background-position:0 -15px}.btn-primary.active,.btn-primary:active{background-color:#265a88;border-color:#245580}.btn-primary.disabled,.btn-primary.disabled.active,.btn-primary.disabled.focus,.btn-primary.disabled:active,.btn-primary.disabled:focus,.btn-primary.disabled:hover,.btn-primary[disabled],.btn-primary[disabled].active,.btn-primary[disabled].focus,.btn-primary[disabled]:active,.btn-primary[disabled]:focus,.btn-primary[disabled]:hover,fieldset[disabled] .btn-primary,fieldset[disabled] .btn-primary.active,fieldset[disabled] .btn-primary.focus,fieldset[disabled] .btn-primary:active,fieldset[disabled] .btn-primary:focus,fieldset[disabled] .btn-primary:hover{background-color:#265a88;background-image:none}.btn-success{background-image:-webkit-linear-gradient(top,#5cb85c 0,#419641 100%);background-image:-o-linear-gradient(top,#5cb85c 0,#419641 100%);background-image:-webkit-gradient(linear,left top,left bottom,from(#5cb85c),to(#419641));background-image:linear-gradient(to bottom,#5cb85c 0,#419641 100%);filter:progid:DXImageTransform.Microsoft.gradient(startColorstr='#ff5cb85c', endColorstr='#ff419641', GradientType=0);filter:progid:DXImageTransform.Microsoft.gradient(enabled=false);background-repeat:repeat-x;border-color:#3e8f3e}.btn-success:focus,.btn-success:hover{background-color:#419641;background-position:0 -15px}.btn-success.active,.btn-success:active{background-color:#419641;border-color:#3e8f3e}.btn-success.disabled,.btn-success.disabled.active,.btn-success.disabled.focus,.btn-success.disabled:active,.btn-success.disabled:focus,.btn-success.disabled:hover,.btn-success[disabled],.btn-success[disabled].active,.btn-success[disabled].focus,.btn-success[disabled]:active,.btn-success[disabled]:focus,.btn-success[disabled]:hover,fieldset[disabled] .btn-success,fieldset[disabled] .btn-success.active,fieldset[disabled] .btn-success.focus,fieldset[disabled] .btn-success:active,fieldset[disabled] .btn-success:focus,fieldset[disabled] .btn-success:hover{background-color:#419641;background-image:none}.btn-info{background-image:-webkit-linear-gradient(top,#5bc0de 0,#2aabd2 100%);background-image:-o-linear-gradient(top,#5bc0de 0,#2aabd2 100%);background-image:-webkit-gradient(linear,left top,left bottom,from(#5bc0de),to(#2aabd2));background-image:linear-gradient(to bottom,#5bc0de 0,#2aabd2 100%);filter:progid:DXImageTransform.Microsoft.gradient(startColorstr='#ff5bc0de', endColorstr='#ff2aabd2', GradientType=0);filter:progid:DXImageTransform.Microsoft.gradient(enabled=false);background-repeat:repeat-x;border-color:#28a4c9}.btn-info:focus,.btn-info:hover{background-color:#2aabd2;background-position:0 -15px}.btn-info.active,.btn-info:active{background-color:#2aabd2;border-color:#28a4c9}.btn-info.disabled,.btn-info.disabled.active,.btn-info.disabled.focus,.btn-info.disabled:active,.btn-info.disabled:focus,.btn-info.disabled:hover,.btn-info[disabled],.btn-info[disabled].active,.btn-info[disabled].focus,.btn-info[disabled]:active,.btn-info[disabled]:focus,.btn-info[disabled]:hover,fieldset[disabled] .btn-info,fieldset[disabled] .btn-info.active,fieldset[disabled] .btn-info.focus,fieldset[disabled] .btn-info:active,fieldset[disabled] .btn-info:focus,fieldset[disabled] .btn-info:hover{background-color:#2aabd2;background-image:none}.btn-warning{background-image:-webkit-linear-gradient(top,#f0ad4e 0,#eb9316 100%);background-image:-o-linear-gradient(top,#f0ad4e 0,#eb9316 100%);background-image:-webkit-gradient(linear,left top,left bottom,from(#f0ad4e),to(#eb9316));background-image:linear-gradient(to bottom,#f0ad4e 0,#eb9316 100%);filter:progid:DXImageTransform.Microsoft.gradient(startColorstr='#fff0ad4e', endColorstr='#ffeb9316', GradientType=0);filter:progid:DXImageTransform.Microsoft.gradient(enabled=false);background-repeat:repeat-x;border-color:#e38d13}.btn-warning:focus,.btn-warning:hover{background-color:#eb9316;background-position:0 -15px}.btn-warning.active,.btn-warning:active{background-color:#eb9316;border-color:#e38d13}.btn-warning.disabled,.btn-warning.disabled.active,.btn-warning.disabled.focus,.btn-warning.disabled:active,.btn-warning.disabled:focus,.btn-warning.disabled:hover,.btn-warning[disabled],.btn-warning[disabled].active,.btn-warning[disabled].focus,.btn-warning[disabled]:active,.btn-warning[disabled]:focus,.btn-warning[disabled]:hover,fieldset[disabled] .btn-warning,fieldset[disabled] .btn-warning.active,fieldset[disabled] .btn-warning.focus,fieldset[disabled] .btn-warning:active,fieldset[disabled] .btn-warning:focus,fieldset[disabled] .btn-warning:hover{background-color:#eb9316;background-image:none}.btn-danger{background-image:-webkit-linear-gradient(top,#d9534f 0,#c12e2a 100%);background-image:-o-linear-gradient(top,#d9534f 0,#c12e2a 100%);background-image:-webkit-gradient(linear,left top,left bottom,from(#d9534f),to(#c12e2a));background-image:linear-gradient(to bottom,#d9534f 0,#c12e2a 100%);filter:progid:DXImageTransform.Microsoft.gradient(startColorstr='#ffd9534f', endColorstr='#ffc12e2a', GradientType=0);filter:progid:DXImageTransform.Microsoft.gradient(enabled=false);background-repeat:repeat-x;border-color:#b92c28}.btn-danger:focus,.btn-danger:hover{background-color:#c12e2a;background-position:0 -15px}.btn-danger.active,.btn-danger:active{background-color:#c12e2a;border-color:#b92c28}.btn-danger.disabled,.btn-danger.disabled.active,.btn-danger.disabled.focus,.btn-danger.disabled:active,.btn-danger.disabled:focus,.btn-danger.disabled:hover,.btn-danger[disabled],.btn-danger[disabled].active,.btn-danger[disabled].focus,.btn-danger[disabled]:active,.btn-danger[disabled]:focus,.btn-danger[disabled]:hover,fieldset[disabled] .btn-danger,fieldset[disabled] .btn-danger.active,fieldset[disabled] .btn-danger.focus,fieldset[disabled] .btn-danger:active,fieldset[disabled] .btn-danger:focus,fieldset[disabled] .btn-danger:hover{background-color:#c12e2a;background-image:none}.img-thumbnail,.thumbnail{-webkit-box-shadow:0 1px 2px rgba(0,0,0,.075);box-shadow:0 1px 2px rgba(0,0,0,.075)}.dropdown-menu>li>a:focus,.dropdown-menu>li>a:hover{background-color:#e8e8e8;background-image:-webkit-linear-gradient(top,#f5f5f5 0,#e8e8e8 100%);background-image:-o-linear-gradient(top,#f5f5f5 0,#e8e8e8 100%);background-image:-webkit-gradient(linear,left top,left bottom,from(#f5f5f5),to(#e8e8e8));background-image:linear-gradient(to bottom,#f5f5f5 0,#e8e8e8 100%);filter:progid:DXImageTransform.Microsoft.gradient(startColorstr='#fff5f5f5', endColorstr='#ffe8e8e8', GradientType=0);background-repeat:repeat-x}.dropdown-menu>.active>a,.dropdown-menu>.active>a:focus,.dropdown-menu>.active>a:hover{background-color:#2e6da4;background-image:-webkit-linear-gradient(top,#337ab7 0,#2e6da4 100%);background-image:-o-linear-gradient(top,#337ab7 0,#2e6da4 100%);background-image:-webkit-gradient(linear,left top,left bottom,from(#337ab7),to(#2e6da4));background-image:linear-gradient(to bottom,#337ab7 0,#2e6da4 100%);filter:progid:DXImageTransform.Microsoft.gradient(startColorstr='#ff337ab7', endColorstr='#ff2e6da4', GradientType=0);background-repeat:repeat-x}.navbar-default{background-image:-webkit-linear-gradient(top,#fff 0,#f8f8f8 100%);background-image:-o-linear-gradient(top,#fff 0,#f8f8f8 100%);background-image:-webkit-gradient(linear,left top,left bottom,from(#fff),to(#f8f8f8));background-image:linear-gradient(to bottom,#fff 0,#f8f8f8 100%);filter:progid:DXImageTransform.Microsoft.gradient(startColorstr='#ffffffff', endColorstr='#fff8f8f8', GradientType=0);filter:progid:DXImageTransform.Microsoft.gradient(enabled=false);background-repeat:repeat-x;border-radius:4px;-webkit-box-shadow:inset 0 1px 0 rgba(255,255,255,.15),0 1px 5px rgba(0,0,0,.075);box-shadow:inset 0 1px 0 rgba(255,255,255,.15),0 1px 5px rgba(0,0,0,.075)}.navbar-default .navbar-nav>.active>a,.navbar-default .navbar-nav>.open>a{background-image:-webkit-linear-gradient(top,#dbdbdb 0,#e2e2e2 100%);background-image:-o-linear-gradient(top,#dbdbdb 0,#e2e2e2 100%);background-image:-webkit-gradient(linear,left top,left bottom,from(#dbdbdb),to(#e2e2e2));background-image:linear-gradient(to bottom,#dbdbdb 0,#e2e2e2 100%);filter:progid:DXImageTransform.Microsoft.gradient(startColorstr='#ffdbdbdb', endColorstr='#ffe2e2e2', GradientType=0);background-repeat:repeat-x;-webkit-box-shadow:inset 0 3px 9px rgba(0,0,0,.075);box-shadow:inset 0 3px 9px rgba(0,0,0,.075)}.navbar-brand,.navbar-nav>li>a{text-shadow:0 1px 0 rgba(255,255,255,.25)}.navbar-inverse{background-image:-webkit-linear-gradient(top,#3c3c3c 0,#222 100%);background-image:-o-linear-gradient(top,#3c3c3c 0,#222 100%);background-image:-webkit-gradient(linear,left top,left bottom,from(#3c3c3c),to(#222));background-image:linear-gradient(to bottom,#3c3c3c 0,#222 100%);filter:progid:DXImageTransform.Microsoft.gradient(startColorstr='#ff3c3c3c', endColorstr='#ff222222', GradientType=0);filter:progid:DXImageTransform.Microsoft.gradient(enabled=false);background-repeat:repeat-x;border-radius:4px}.navbar-inverse .navbar-nav>.active>a,.navbar-inverse .navbar-nav>.open>a{background-image:-webkit-linear-gradient(top,#080808 0,#0f0f0f 100%);background-image:-o-linear-gradient(top,#080808 0,#0f0f0f 100%);background-image:-webkit-gradient(linear,left top,left bottom,from(#080808),to(#0f0f0f));background-image:linear-gradient(to bottom,#080808 0,#0f0f0f 100%);filter:progid:DXImageTransform.Microsoft.gradient(startColorstr='#ff080808', endColorstr='#ff0f0f0f', GradientType=0);background-repeat:repeat-x;-webkit-box-shadow:inset 0 3px 9px rgba(0,0,0,.25);box-shadow:inset 0 3px 9px rgba(0,0,0,.25)}.navbar-inverse .navbar-brand,.navbar-inverse .navbar-nav>li>a{text-shadow:0 -1px 0 rgba(0,0,0,.25)}.navbar-fixed-bottom,.navbar-fixed-top,.navbar-static-top{border-radius:0}@media (max-width:767px){.navbar .navbar-nav .open .dropdown-menu>.active>a,.navbar .navbar-nav .open .dropdown-menu>.active>a:focus,.navbar .navbar-nav .open .dropdown-menu>.active>a:hover{color:#fff;background-image:-webkit-linear-gradient(top,#337ab7 0,#2e6da4 100%);background-image:-o-linear-gradient(top,#337ab7 0,#2e6da4 100%);background-image:-webkit-gradient(linear,left top,left bottom,from(#337ab7),to(#2e6da4));background-image:linear-gradient(to bottom,#337ab7 0,#2e6da4 100%);filter:progid:DXImageTransform.Microsoft.gradient(startColorstr='#ff337ab7', endColorstr='#ff2e6da4', GradientType=0);background-repeat:repeat-x}}.alert{text-shadow:0 1px 0 rgba(255,255,255,.2);-webkit-box-shadow:inset 0 1px 0 rgba(255,255,255,.25),0 1px 2px rgba(0,0,0,.05);box-shadow:inset 0 1px 0 rgba(255,255,255,.25),0 1px 2px rgba(0,0,0,.05)}.alert-success{background-image:-webkit-linear-gradient(top,#dff0d8 0,#c8e5bc 100%);background-image:-o-linear-gradient(top,#dff0d8 0,#c8e5bc 100%);background-image:-webkit-gradient(linear,left top,left bottom,from(#dff0d8),to(#c8e5bc));background-image:linear-gradient(to bottom,#dff0d8 0,#c8e5bc 100%);filter:progid:DXImageTransform.Microsoft.gradient(startColorstr='#ffdff0d8', endColorstr='#ffc8e5bc', GradientType=0);background-repeat:repeat-x;border-color:#b2dba1}.alert-info{background-image:-webkit-linear-gradient(top,#d9edf7 0,#b9def0 100%);background-image:-o-linear-gradient(top,#d9edf7 0,#b9def0 100%);background-image:-webkit-gradient(linear,left top,left bottom,from(#d9edf7),to(#b9def0));background-image:linear-gradient(to bottom,#d9edf7 0,#b9def0 100%);filter:progid:DXImageTransform.Microsoft.gradient(startColorstr='#ffd9edf7', endColorstr='#ffb9def0', GradientType=0);background-repeat:repeat-x;border-color:#9acfea}.alert-warning{background-image:-webkit-linear-gradient(top,#fcf8e3 0,#f8efc0 100%);background-image:-o-linear-gradient(top,#fcf8e3 0,#f8efc0 100%);background-image:-webkit-gradient(linear,left top,left bottom,from(#fcf8e3),to(#f8efc0));background-image:linear-gradient(to bottom,#fcf8e3 0,#f8efc0 100%);filter:progid:DXImageTransform.Microsoft.gradient(startColorstr='#fffcf8e3', endColorstr='#fff8efc0', GradientType=0);background-repeat:repeat-x;border-color:#f5e79e}.alert-danger{background-image:-webkit-linear-gradient(top,#f2dede 0,#e7c3c3 100%);background-image:-o-linear-gradient(top,#f2dede 0,#e7c3c3 100%);background-image:-webkit-gradient(linear,left top,left bottom,from(#f2dede),to(#e7c3c3));background-image:linear-gradient(to bottom,#f2dede 0,#e7c3c3 100%);filter:progid:DXImageTransform.Microsoft.gradient(startColorstr='#fff2dede', endColorstr='#ffe7c3c3', GradientType=0);background-repeat:repeat-x;border-color:#dca7a7}.progress{background-image:-webkit-linear-gradient(top,#ebebeb 0,#f5f5f5 100%);background-image:-o-linear-gradient(top,#ebebeb 0,#f5f5f5 100%);background-image:-webkit-gradient(linear,left top,left bottom,from(#ebebeb),to(#f5f5f5));background-image:linear-gradient(to bottom,#ebebeb 0,#f5f5f5 100%);filter:progid:DXImageTransform.Microsoft.gradient(startColorstr='#ffebebeb', endColorstr='#fff5f5f5', GradientType=0);background-repeat:repeat-x}.progress-bar{background-image:-webkit-linear-gradient(top,#337ab7 0,#286090 100%);background-image:-o-linear-gradient(top,#337ab7 0,#286090 100%);background-image:-webkit-gradient(linear,left top,left bottom,from(#337ab7),to(#286090));background-image:linear-gradient(to bottom,#337ab7 0,#286090 100%);filter:progid:DXImageTransform.Microsoft.gradient(startColorstr='#ff337ab7', endColorstr='#ff286090', GradientType=0);background-repeat:repeat-x}.progress-bar-success{background-image:-webkit-linear-gradient(top,#5cb85c 0,#449d44 100%);background-image:-o-linear-gradient(top,#5cb85c 0,#449d44 100%);background-image:-webkit-gradient(linear,left top,left bottom,from(#5cb85c),to(#449d44));background-image:linear-gradient(to bottom,#5cb85c 0,#449d44 100%);filter:progid:DXImageTransform.Microsoft.gradient(startColorstr='#ff5cb85c', endColorstr='#ff449d44', GradientType=0);background-repeat:repeat-x}.progress-bar-info{background-image:-webkit-linear-gradient(top,#5bc0de 0,#31b0d5 100%);background-image:-o-linear-gradient(top,#5bc0de 0,#31b0d5 100%);background-image:-webkit-gradient(linear,left top,left bottom,from(#5bc0de),to(#31b0d5));background-image:linear-gradient(to bottom,#5bc0de 0,#31b0d5 100%);filter:progid:DXImageTransform.Microsoft.gradient(startColorstr='#ff5bc0de', endColorstr='#ff31b0d5', GradientType=0);background-repeat:repeat-x}.progress-bar-warning{background-image:-webkit-linear-gradient(top,#f0ad4e 0,#ec971f 100%);background-image:-o-linear-gradient(top,#f0ad4e 0,#ec971f 100%);background-image:-webkit-gradient(linear,left top,left bottom,from(#f0ad4e),to(#ec971f));background-image:linear-gradient(to bottom,#f0ad4e 0,#ec971f 100%);filter:progid:DXImageTransform.Microsoft.gradient(startColorstr='#fff0ad4e', endColorstr='#ffec971f', GradientType=0);background-repeat:repeat-x}.progress-bar-danger{background-image:-webkit-linear-gradient(top,#d9534f 0,#c9302c 100%);background-image:-o-linear-gradient(top,#d9534f 0,#c9302c 100%);background-image:-webkit-gradient(linear,left top,left bottom,from(#d9534f),to(#c9302c));background-image:linear-gradient(to bottom,#d9534f 0,#c9302c 100%);filter:progid:DXImageTransform.Microsoft.gradient(startColorstr='#ffd9534f', endColorstr='#ffc9302c', GradientType=0);background-repeat:repeat-x}.progress-bar-striped{background-image:-webkit-linear-gradient(45deg,rgba(255,255,255,.15) 25%,transparent 25%,transparent 50%,rgba(255,255,255,.15) 50%,rgba(255,255,255,.15) 75%,transparent 75%,transparent);background-image:-o-linear-gradient(45deg,rgba(255,255,255,.15) 25%,transparent 25%,transparent 50%,rgba(255,255,255,.15) 50%,rgba(255,255,255,.15) 75%,transparent 75%,transparent);background-image:linear-gradient(45deg,rgba(255,255,255,.15) 25%,transparent 25%,transparent 50%,rgba(255,255,255,.15) 50%,rgba(255,255,255,.15) 75%,transparent 75%,transparent)}.list-group{border-radius:4px;-webkit-box-shadow:0 1px 2px rgba(0,0,0,.075);box-shadow:0 1px 2px rgba(0,0,0,.075)}.list-group-item.active,.list-group-item.active:focus,.list-group-item.active:hover{text-shadow:0 -1px 0 #286090;background-image:-webkit-linear-gradient(top,#337ab7 0,#2b669a 100%);background-image:-o-linear-gradient(top,#337ab7 0,#2b669a 100%);background-image:-webkit-gradient(linear,left top,left bottom,from(#337ab7),to(#2b669a));background-image:linear-gradient(to bottom,#337ab7 0,#2b669a 100%);filter:progid:DXImageTransform.Microsoft.gradient(startColorstr='#ff337ab7', endColorstr='#ff2b669a', GradientType=0);background-repeat:repeat-x;border-color:#2b669a}.list-group-item.active .badge,.list-group-item.active:focus .badge,.list-group-item.active:hover .badge{text-shadow:none}.panel{-webkit-box-shadow:0 1px 2px rgba(0,0,0,.05);box-shadow:0 1px 2px rgba(0,0,0,.05)}.panel-default>.panel-heading{background-image:-webkit-linear-gradient(top,#f5f5f5 0,#e8e8e8 100%);background-image:-o-linear-gradient(top,#f5f5f5 0,#e8e8e8 100%);background-image:-webkit-gradient(linear,left top,left bottom,from(#f5f5f5),to(#e8e8e8));background-image:linear-gradient(to bottom,#f5f5f5 0,#e8e8e8 100%);filter:progid:DXImageTransform.Microsoft.gradient(startColorstr='#fff5f5f5', endColorstr='#ffe8e8e8', GradientType=0);background-repeat:repeat-x}.panel-primary>.panel-heading{background-image:-webkit-linear-gradient(top,#337ab7 0,#2e6da4 100%);background-image:-o-linear-gradient(top,#337ab7 0,#2e6da4 100%);background-image:-webkit-gradient(linear,left top,left bottom,from(#337ab7),to(#2e6da4));background-image:linear-gradient(to bottom,#337ab7 0,#2e6da4 100%);filter:progid:DXImageTransform.Microsoft.gradient(startColorstr='#ff337ab7', endColorstr='#ff2e6da4', GradientType=0);background-repeat:repeat-x}.panel-success>.panel-heading{background-image:-webkit-linear-gradient(top,#dff0d8 0,#d0e9c6 100%);background-image:-o-linear-gradient(top,#dff0d8 0,#d0e9c6 100%);background-image:-webkit-gradient(linear,left top,left bottom,from(#dff0d8),to(#d0e9c6));background-image:linear-gradient(to bottom,#dff0d8 0,#d0e9c6 100%);filter:progid:DXImageTransform.Microsoft.gradient(startColorstr='#ffdff0d8', endColorstr='#ffd0e9c6', GradientType=0);background-repeat:repeat-x}.panel-info>.panel-heading{background-image:-webkit-linear-gradient(top,#d9edf7 0,#c4e3f3 100%);background-image:-o-linear-gradient(top,#d9edf7 0,#c4e3f3 100%);background-image:-webkit-gradient(linear,left top,left bottom,from(#d9edf7),to(#c4e3f3));background-image:linear-gradient(to bottom,#d9edf7 0,#c4e3f3 100%);filter:progid:DXImageTransform.Microsoft.gradient(startColorstr='#ffd9edf7', endColorstr='#ffc4e3f3', GradientType=0);background-repeat:repeat-x}.panel-warning>.panel-heading{background-image:-webkit-linear-gradient(top,#fcf8e3 0,#faf2cc 100%);background-image:-o-linear-gradient(top,#fcf8e3 0,#faf2cc 100%);background-image:-webkit-gradient(linear,left top,left bottom,from(#fcf8e3),to(#faf2cc));background-image:linear-gradient(to bottom,#fcf8e3 0,#faf2cc 100%);filter:progid:DXImageTransform.Microsoft.gradient(startColorstr='#fffcf8e3', endColorstr='#fffaf2cc', GradientType=0);background-repeat:repeat-x}.panel-danger>.panel-heading{background-image:-webkit-linear-gradient(top,#f2dede 0,#ebcccc 100%);background-image:-o-linear-gradient(top,#f2dede 0,#ebcccc 100%);background-image:-webkit-gradient(linear,left top,left bottom,from(#f2dede),to(#ebcccc));background-image:linear-gradient(to bottom,#f2dede 0,#ebcccc 100%);filter:progid:DXImageTransform.Microsoft.gradient(startColorstr='#fff2dede', endColorstr='#ffebcccc', GradientType=0);background-repeat:repeat-x}.well{background-image:-webkit-linear-gradient(top,#e8e8e8 0,#f5f5f5 100%);background-image:-o-linear-gradient(top,#e8e8e8 0,#f5f5f5 100%);background-image:-webkit-gradient(linear,left top,left bottom,from(#e8e8e8),to(#f5f5f5));background-image:linear-gradient(to bottom,#e8e8e8 0,#f5f5f5 100%);filter:progid:DXImageTransform.Microsoft.gradient(startColorstr='#ffe8e8e8', endColorstr='#fff5f5f5', GradientType=0);background-repeat:repeat-x;border-color:#dcdcdc;-webkit-box-shadow:inset 0 1px 3px rgba(0,0,0,.05),0 1px 0 rgba(255,255,255,.1);box-shadow:inset 0 1px 3px rgba(0,0,0,.05),0 1px 0 rgba(255,255,255,.1)}
6 | /*# sourceMappingURL=bootstrap-theme.min.css.map */


--------------------------------------------------------------------------------
/webspider/web/static/bootstrap/fonts/glyphicons-halflings-regular.eot:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JustForFunnnn/webspider/e41e94fb4f269ecab61cc19c76331c7667151cfb/webspider/web/static/bootstrap/fonts/glyphicons-halflings-regular.eot


--------------------------------------------------------------------------------
/webspider/web/static/bootstrap/fonts/glyphicons-halflings-regular.ttf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JustForFunnnn/webspider/e41e94fb4f269ecab61cc19c76331c7667151cfb/webspider/web/static/bootstrap/fonts/glyphicons-halflings-regular.ttf


--------------------------------------------------------------------------------
/webspider/web/static/bootstrap/fonts/glyphicons-halflings-regular.woff:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JustForFunnnn/webspider/e41e94fb4f269ecab61cc19c76331c7667151cfb/webspider/web/static/bootstrap/fonts/glyphicons-halflings-regular.woff


--------------------------------------------------------------------------------
/webspider/web/static/bootstrap/fonts/glyphicons-halflings-regular.woff2:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JustForFunnnn/webspider/e41e94fb4f269ecab61cc19c76331c7667151cfb/webspider/web/static/bootstrap/fonts/glyphicons-halflings-regular.woff2


--------------------------------------------------------------------------------
/webspider/web/static/bootstrap/js/npm.js:
--------------------------------------------------------------------------------
 1 | // This file is autogenerated via the `commonjs` Grunt task. You can require() this file in a CommonJS environment.
 2 | require('../../js/transition.js')
 3 | require('../../js/alert.js')
 4 | require('../../js/button.js')
 5 | require('../../js/carousel.js')
 6 | require('../../js/collapse.js')
 7 | require('../../js/dropdown.js')
 8 | require('../../js/modal.js')
 9 | require('../../js/tooltip.js')
10 | require('../../js/popover.js')
11 | require('../../js/scrollspy.js')
12 | require('../../js/tab.js')
13 | require('../../js/affix.js')


--------------------------------------------------------------------------------
/webspider/web/static/css/mystyle.css:
--------------------------------------------------------------------------------
 1 | body {
 2 |     font-family: "Hiragino Sans GB", "Microsoft Yahei", SimSun, Arial, "Helvetica Neue", Helvetica;
 3 |     color: #333;
 4 |     word-wrap: break-word;
 5 |     -webkit-font-smoothing: antialiased;
 6 |     font-size: 14px;
 7 | }
 8 | 
 9 | footer {
10 |     font-size: 14px;
11 |     border-radius: 5px;
12 |     margin: 0 auto;
13 |     width: 100%;
14 |     text-align: center;
15 |     padding: 10px 0;
16 | }
17 | 
18 | .main-body {
19 |     min-height: 780px;
20 | }
21 | 
22 | .chart-div {
23 |     width: 550px;
24 |     height: 400px;
25 |     padding-top: 30px;
26 |     margin: 0 auto;
27 | }
28 | 
29 | .large-chart-div {
30 |     width: 1000px;
31 |     height: 700px;
32 |     padding-top: 30px;
33 |     margin: 0 auto;
34 | }


--------------------------------------------------------------------------------
/webspider/web/static/img/favicon.ico:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JustForFunnnn/webspider/e41e94fb4f269ecab61cc19c76331c7667151cfb/webspider/web/static/img/favicon.ico


--------------------------------------------------------------------------------
/webspider/web/templates/404.html:
--------------------------------------------------------------------------------
1 | {% extends "base.html" %}
2 | {% block body %}
3 | <div class="main-body">
4 |    <h2>你来到了没有知识的荒原  _(:з」∠)_</h2>
5 | {% end %}


--------------------------------------------------------------------------------
/webspider/web/templates/500.html:
--------------------------------------------------------------------------------
1 | {% extends "base.html" %}
2 | {% block body %}
3 | <div class="main-body">
4 |    <h2>服务器提了一个问题  _(:з」∠)_</h2>
5 | {% end %}


--------------------------------------------------------------------------------
/webspider/web/templates/base.html:
--------------------------------------------------------------------------------
 1 | <!DOCTYPE html>
 2 | <html lang="zh-CN">
 3 | <head>
 4 |     <meta charset="utf-8">
 5 |     <meta http-equiv="X-UA-Compatible" content="IE=edge">
 6 |     <meta name="viewport" content="width=device-width, initial-scale=1.0">
 7 |     <!-- 上述3个meta标签*必须*放在最前面，任何其他内容都*必须*跟随其后！ -->
 8 |     <title>Web Spider|JustForFunnn</title>
 9 |     <!-- Bootstrap -->
10 |     <link rel="stylesheet" type="text/css" href="{{ static_url('bootstrap/css/bootstrap.min.css') }}"/>
11 |     <link rel="stylesheet" type="text/css" href="{{ static_url('css/mystyle.css') }}"/>
12 |     <!-- jQuery (necessary for Bootstrap's JavaScript plugins) -->
13 |     <script type="text/javascript" src="{{ static_url('js/jquery.min.js') }}"></script>
14 |     <!-- Include all compiled plugins (below), or include individual files as needed -->
15 |     <script type="text/javascript" src="{{ static_url('bootstrap/js/bootstrap.min.js') }}"></script>
16 |     <link rel="icon" href="{{ static_url('img/favicon.ico') }}">
17 |     {% block header %}{% end %}
18 | </head>
19 | <body>
20 | <nav class="navbar navbar-default" role="navigation">
21 |     <div class="container navbar-small">
22 |         <div class="navbar-header">
23 |             <button type="button" class="navbar-toggle" data-toggle="collapse" data-target="#navbar-collapse-01">
24 |                 <span class="sr-only">Toggle navigation</span>
25 |             </button>
26 |             <a class="navbar-brand" href="/">Web Spider</a>
27 |         </div>
28 |         <form class="navbar-form navbar-right" role="search" action="/statistics">
29 |             <div class="form-group">
30 |                 <input type="text" class="form-control" name="keyword_name" placeholder="输入些职位的关键词吧......">
31 |             </div>
32 |             <button type="submit" class="btn btn-default">Go</button>
33 |         </form>
34 |     </div>
35 | </nav>
36 | 
37 | <div class="container main-body">
38 |     {% block body %}
39 |     {% end %}
40 | </div>
41 | 
42 | <footer>
43 |     <hr>
44 |     <div class="container">
45 |         <p class="text-muted">
46 |             Design By &nbsp<span class="glyphicon glyphicon-user" aria-hidden="true"></span>&nbsp
47 |             <a href="">JustForFunnn</a>&nbsp
48 |             <span class="glyphicon glyphicon-grain" aria-hidden="true"></span>
49 |         </p>
50 |     </div>
51 | </footer>
52 | 
53 | </body>
54 | </html>
55 | 


--------------------------------------------------------------------------------
/webspider/web/templates/city-jobs-count-chart-module.html:
--------------------------------------------------------------------------------
 1 | <div id="city-jobs-count-chart" class="chart-div col-md-6"></div>
 2 | 
 3 | <script type="text/javascript">
 4 |     var cityJobsCountChart = echarts.init(document.getElementById('city-jobs-count-chart'));
 5 |     // 指定图表的配置项和数据
 6 | 
 7 |     var option = {
 8 |         title: {
 9 |             text: '各城市职位占比',
10 |             x: 'center'
11 |         },
12 |         tooltip: {
13 |             trigger: 'item',
14 |             formatter: "{a} <br/>{b} : {c} ({d}%)"
15 |         },
16 |         legend: {
17 |             orient: 'vertical',
18 |             left: 'left',
19 |             data: [
20 |                 {% for (city, count) in city_jobs_count_statistic.items() %}
21 |                     '{{city}}',
22 |                 {% end %}
23 |             ]
24 |         },
25 |         series: [
26 |             {
27 |                 type: 'pie',
28 |                 radius: '55%',
29 |                 center: ['50%', '60%'],
30 |                 data: [
31 |                     {% for (city, count) in city_jobs_count_statistic.items() %}
32 |                         {
33 |                             value: '{{count}}',
34 |                             name: '{{city}}'
35 |                         },
36 |                     {% end %}
37 |                 ],
38 |                 itemStyle: {
39 |                     emphasis: {
40 |                         shadowBlur: 10,
41 |                         shadowOffsetX: 0,
42 |                         shadowColor: 'rgba(0, 0, 0, 0.5)'
43 |                     }
44 |                 }
45 |             }
46 |         ]
47 |     };
48 |     cityJobsCountChart.setOption(option);
49 | </script>


--------------------------------------------------------------------------------
/webspider/web/templates/education-chart-module.html:
--------------------------------------------------------------------------------
 1 | <div id="education-year-chart" class="chart-div col-md-6"></div>
 2 | 
 3 | <script type="text/javascript">
 4 |     var educationChart = echarts.init(document.getElementById('education-year-chart'));
 5 |     // 指定图表的配置项和数据
 6 | 
 7 |     var option = {
 8 |         title: {
 9 |             text: '学历要求',
10 |             x: 'center'
11 |         },
12 |         tooltip: {
13 |             trigger: 'item',
14 |             formatter: "{a} <br/>{b} : {c} ({d}%)"
15 |         },
16 |         legend: {
17 |             orient: 'vertical',
18 |             left: 'left',
19 |             data: [
20 |                 {% for (education, count) in educations_statistic.items() %}
21 |                     '{{education}}',
22 |                 {% end %}
23 |             ]
24 |         },
25 |         series: [
26 |             {
27 |                 type: 'pie',
28 |                 radius: '55%',
29 |                 center: ['50%', '60%'],
30 |                 data: [
31 |                     {% for (education, count) in educations_statistic.items() %}
32 |                         {
33 |                             value: '{{count}}',
34 |                             name: '{{education}}'
35 |                         },
36 |                     {% end %}
37 |                 ],
38 |                 itemStyle: {
39 |                     emphasis: {
40 |                         shadowBlur: 10,
41 |                         shadowOffsetX: 0,
42 |                         shadowColor: 'rgba(0, 0, 0, 0.5)'
43 |                     }
44 |                 }
45 |             }
46 |         ]
47 |     };
48 | 
49 |     // 使用刚指定的配置项和数据显示图表。
50 |     educationChart.setOption(option);
51 | </script>


--------------------------------------------------------------------------------
/webspider/web/templates/finance-stage-chart-module.html:
--------------------------------------------------------------------------------
 1 | <div id="finance-stage-chart" class="large-chart-div col-md-12"></div>
 2 | 
 3 | <script type="text/javascript">
 4 |     var financeStageChart = echarts.init(document.getElementById('finance-stage-chart'));
 5 |     // 指定图表的配置项和数据
 6 | 
 7 | 
 8 |     var option = {
 9 |         title: {
10 |             text: '公司融资情况',
11 |             subtext: '统计招聘该岗位的公司'
12 |         },
13 |         tooltip: {
14 |             trigger: 'axis',
15 |             axisPointer: {
16 |                 type: 'shadow'
17 |             }
18 |         },
19 |         grid: {
20 |             left: '3%',
21 |             right: '4%',
22 |             bottom: '3%',
23 |             containLabel: true
24 |         },
25 |         xAxis: {
26 |             type: 'value',
27 |             boundaryGap: [0, 0.01]
28 |         },
29 |         yAxis: {
30 |             type: 'category',
31 |             data: ['未融资', '天使轮', 'A轮', 'B轮', 'C轮', 'D轮及以上', '上市公司', '不需要融资']
32 |         },
33 |         series: [
34 |             {
35 |                 name: '数量(单位:个)',
36 |                 type: 'bar',
37 |                 data: [
38 |                     {% if '未融资' in finance_stage_statistic %}
39 |                         {{finance_stage_statistic['未融资']}}
40 |                     {% else %}
41 |                         0
42 |                     {% end %},
43 |                     {% if '天使轮' in finance_stage_statistic %}
44 |                         {{finance_stage_statistic['天使轮']}}
45 |                     {% else %}
46 |                         0
47 |                     {% end %},
48 |                     {% if 'A轮' in finance_stage_statistic %}
49 |                         {{finance_stage_statistic['A轮']}}
50 |                     {% else %}
51 |                         0
52 |                     {% end %},
53 |                     {% if 'B轮' in finance_stage_statistic %}
54 |                         {{finance_stage_statistic['B轮']}}
55 |                     {% else %}
56 |                         0
57 |                     {% end %},
58 |                     {% if 'C轮' in finance_stage_statistic %}
59 |                         {{finance_stage_statistic['C轮']}}
60 |                     {% else %}
61 |                         0
62 |                     {% end %},
63 |                     {% if 'D轮及以上' in finance_stage_statistic %}
64 |                         {{finance_stage_statistic['D轮及以上']}}
65 |                     {% else %}
66 |                         0
67 |                     {% end %},
68 |                     {% if '上市公司' in finance_stage_statistic %}
69 |                         {{finance_stage_statistic['上市公司']}}
70 |                     {% else %}
71 |                         0
72 |                     {% end %},
73 |                     {% if '不需要融资' in finance_stage_statistic %}
74 |                         {{finance_stage_statistic['不需要融资']}}
75 |                     {% else %}
76 |                         0
77 |                     {% end %}
78 |                 ]
79 |             }
80 |         ]
81 |     };
82 | 
83 |     // 使用刚指定的配置项和数据显示图表。
84 |     financeStageChart.setOption(option);
85 | </script>


--------------------------------------------------------------------------------
/webspider/web/templates/pagination-module.html:
--------------------------------------------------------------------------------
 1 | <div align="center">
 2 |     <ul class="pagination">
 3 |         <li {% if not pagination.has_prev %} class="disabled" {% else %} class="previous" {% end %}>
 4 |             <a {% if pagination.has_prev %} href="/keyword?keyword={{ keyword }}&page={{ pagination.prev_num }}" {% else %} href="#"{% end %}>
 5 |                 &laquo;
 6 |             </a>
 7 |         </li>
 8 |         {% for p in pagination.iter_pages() %}
 9 |             {% if p %}
10 |                 <li {% if p == pagination.page %} class="active" {% end %} >
11 |                     <a href="/keyword?keyword={{ keyword }}&page={{ p }}">{{ p }}</a>
12 |                 </li>
13 |             {% else %}
14 |                 <li class="disabled"><a href="#">&hellip;</a></li>
15 |             {% end %}
16 |         {% end %}
17 |         <li {% if not pagination.has_next %} class="disabled" {% else %} class="next" {% end %}>
18 |             <a {% if pagination.has_next %} href="/keyword?keyword={{ keyword }}&page={{ pagination.next_num }}" {% else %} href="#"{% end %}>
19 |                 &raquo;
20 |             </a>
21 |         </li>
22 |     </ul>
23 | </div>
24 | 
25 | 


--------------------------------------------------------------------------------
/webspider/web/templates/per-day-jobs-count-chart-module.html:
--------------------------------------------------------------------------------
  1 | <div id="per-day-jobs-count-chart" class="large-chart-div col-md-12"></div>
  2 | 
  3 | <script type="text/javascript">
  4 |     var perDayJobsCountChart = echarts.init(document.getElementById('per-day-jobs-count-chart'));
  5 |     // 指定图表的配置项和数据
  6 | 
  7 |     var option = {
  8 |         title: {
  9 |             text: '岗位热度',
 10 |             subtext: '根据近期岗位数量统计'
 11 |         },
 12 |         tooltip: {
 13 |             trigger: 'axis',
 14 |             axisPointer: {
 15 |                 type: 'cross',
 16 |                 label: {
 17 |                     backgroundColor: '#6a7985'
 18 |                 }
 19 |             }
 20 |         },
 21 |         color: ["#FF0000", "#00BFFF", "#FF00FF", "#1ce322", "#000000", '#EE7942', '#6a7985'],
 22 |         calculable: true,
 23 |         legend: {
 24 |             data: ['全国', '北京', '上海', '深圳', '广州', '杭州', '成都']
 25 |         },
 26 |         grid: {
 27 |             left: '3%',
 28 |             right: '4%',
 29 |             bottom: '3%',
 30 |             containLabel: true
 31 |         },
 32 |         toolbox: {
 33 |             show: true,
 34 |             feature: {
 35 |                 mark: {show: true},
 36 |                 dataView: {show: true, readOnly: false},
 37 |                 magicType: {show: true, type: ['line', 'bar', 'stack', 'tiled']},
 38 |                 restore: {show: true},
 39 |                 saveAsImage: {show: true}
 40 |             }
 41 |         },
 42 |         xAxis: {
 43 |             type: 'category',
 44 |             boundaryGap: false,
 45 |             data: [
 46 |                 {% for jobs_count in per_day_jobs_count_statistic %}
 47 |                     '{{jobs_count.date}}',
 48 |                 {% end %}
 49 |             ]
 50 |         },
 51 |         yAxis: {
 52 |             type: 'value'
 53 |         },
 54 |         series: [
 55 |             {
 56 |                 name: '全国',
 57 |                 type: 'line',
 58 |                 smooth: true,
 59 |                 data: [
 60 |                     {% for jobs_count in per_day_jobs_count_statistic %}
 61 |                     {{jobs_count.all_city}},
 62 |                     {% end %}
 63 |                 ]
 64 |             },
 65 |             {
 66 |                 name: '北京',
 67 |                 type: 'line',
 68 |                 smooth: true,
 69 |                 data: [
 70 |                     {% for jobs_count in per_day_jobs_count_statistic %}
 71 |                     {{jobs_count.beijing}},
 72 |                     {% end %}
 73 |                 ]
 74 |             },
 75 |             {
 76 |                 name: '上海',
 77 |                 type: 'line',
 78 |                 smooth: true,
 79 |                 data: [
 80 |                     {% for jobs_count in per_day_jobs_count_statistic %}
 81 |                     {{jobs_count.shanghai}},
 82 |                     {% end %}
 83 |                 ]
 84 |             },
 85 |             {
 86 |                 name: '深圳',
 87 |                 type: 'line',
 88 |                 smooth: true,
 89 |                 data: [
 90 |                     {% for jobs_count in per_day_jobs_count_statistic %}
 91 |                     {{jobs_count.shenzhen}},
 92 |                     {% end %}
 93 |                 ]
 94 |             },
 95 |             {
 96 |                 name: '广州',
 97 |                 type: 'line',
 98 |                 smooth: true,
 99 |                 data: [
100 |                     {% for jobs_count in per_day_jobs_count_statistic %}
101 |                     {{jobs_count.guangzhou}},
102 |                     {% end %}
103 |                 ]
104 |             },
105 |             {
106 |                 name: '杭州',
107 |                 type: 'line',
108 |                 smooth: true,
109 |                 data: [
110 |                     {% for jobs_count in per_day_jobs_count_statistic %}
111 |                     {{jobs_count.hangzhou}},
112 |                     {% end %}
113 |                 ]
114 |             },
115 |             {
116 |                 name: '成都',
117 |                 type: 'line',
118 |                 smooth: true,
119 |                 data: [
120 |                     {% for jobs_count in per_day_jobs_count_statistic %}
121 |                     {{jobs_count.chengdu}},
122 |                     {% end %}
123 |                 ]
124 |             }
125 |         ]
126 |     };
127 | 
128 | 
129 |     // 使用刚指定的配置项和数据显示图表。
130 |     perDayJobsCountChart.setOption(option);
131 | </script>


--------------------------------------------------------------------------------
/webspider/web/templates/salary-chart-module.html:
--------------------------------------------------------------------------------
 1 | <div id="salary-chart" class="chart-div  col-md-6"></div>
 2 | 
 3 | <script type="text/javascript">
 4 |     var salaryChart = echarts.init(document.getElementById('salary-chart'));
 5 |     // 指定图表的配置项和数据
 6 |     
 7 | var option = {
 8 |         title: {
 9 |             text: '薪水分布',
10 |             x: 'center'
11 |         },
12 |         tooltip: {
13 |             trigger: 'item',
14 |             formatter: "{a} <br/>{b} : {c} ({d}%)"
15 |         },
16 |         legend: {
17 |             orient: 'vertical',
18 |             left: 'left',
19 |             data: [
20 |                     {% for (salary, count) in salary_statistic.items() %}
21 |                         '{{salary}}',
22 |                     {% end %}
23 |                 ]
24 |         },
25 |         series: [
26 |             {
27 |                 type: 'pie',
28 |                 radius: '55%',
29 |                 center: ['50%', '60%'],
30 |                 data: [
31 |                     {% for (salary, count) in salary_statistic.items() %}
32 |                         {
33 |                             {% if count != 0 %}
34 |                                 value: '{{count}}',
35 |                                 name: '{{salary}}'
36 |                             {% end %}
37 |                         },
38 |                     {% end %}
39 |                 ],
40 |                 itemStyle: {
41 |                     emphasis: {
42 |                         shadowBlur: 10,
43 |                         shadowOffsetX: 0,
44 |                         shadowColor: 'rgba(0, 0, 0, 0.5)'
45 |                     }
46 |                 }
47 |             }
48 |         ]
49 |     };
50 | 
51 |     // 使用刚指定的配置项和数据显示图表。
52 |     salaryChart.setOption(option);
53 | </script>


--------------------------------------------------------------------------------
/webspider/web/templates/statistics.html:
--------------------------------------------------------------------------------
 1 | {% extends "base.html" %}
 2 | {% block body %}
 3 | <div class="container center_div">
 4 |     
 5 |     <h1>about {{ keyword_name }}: </h1>
 6 |     <br>
 7 |     {% include 'work-year-chart-module.html' %}
 8 |     {% include 'salary-chart-module.html' %}
 9 |     {% include 'city-jobs-count-chart-module.html' %}
10 |     {% include 'education-chart-module.html' %}
11 |     {% include 'per-day-jobs-count-chart-module.html' %}
12 |     {% include 'finance-stage-chart-module.html' %}
13 | 
14 | </div>
15 | {% end %}
16 | 
17 | {% block header %}
18 | <script src="{{ static_url('js/echarts.min.js') }}"></script>
19 | {% end %}


--------------------------------------------------------------------------------
/webspider/web/templates/work-year-chart-module.html:
--------------------------------------------------------------------------------
 1 | <div id="work-year-chart" class="chart-div  col-md-6"></div>
 2 | 
 3 | <script type="text/javascript">
 4 |     var wordYearChart = echarts.init(document.getElementById('work-year-chart'));
 5 |     // 指定图表的配置项和数据
 6 |     var option = {
 7 |         title: {
 8 |             text: '工作年限要求',
 9 |             x: 'center'
10 |         },
11 |         tooltip: {
12 |             trigger: 'item',
13 |             formatter: "{a} <br/>{b} : {c} ({d}%)"
14 |         },
15 |         legend: {
16 |             orient: 'vertical',
17 |             left: 'left',
18 |             data: [
19 |                     {% for (work_year, count) in work_years_statistic.items() %}
20 |                         '{{work_year}}',
21 |                     {% end %}
22 |                 ]
23 |         },
24 |         series: [
25 |             {
26 |                 type: 'pie',
27 |                 radius: '55%',
28 |                 center: ['50%', '60%'],
29 |                 data: [
30 |                     {% for (work_year, count) in work_years_statistic.items() %}
31 |                         {
32 |                             value: '{{count}}',
33 |                             name: '{{work_year}}'
34 |                         },
35 |                     {% end %}
36 |                 ],
37 |                 itemStyle: {
38 |                     emphasis: {
39 |                         shadowBlur: 10,
40 |                         shadowOffsetX: 0,
41 |                         shadowColor: 'rgba(0, 0, 0, 0.5)'
42 |                     }
43 |                 }
44 |             }
45 |         ]
46 |     };
47 | 
48 |     // 使用刚指定的配置项和数据显示图表。
49 |     wordYearChart.setOption(option);
50 | </script>


--------------------------------------------------------------------------------
/webspider/web/urls.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | from tornado.web import URLSpec, RedirectHandler
 3 | 
 4 | from webspider.web.handlers import KeywordStatisticsApiHandler, KeywordStatisticsPageHandler
 5 | 
 6 | url_handlers = [
 7 |     URLSpec(r"/", RedirectHandler, {'url': '/statistics?keyword_name=python'}),
 8 |     URLSpec(r"/api/statistics", KeywordStatisticsApiHandler),
 9 |     URLSpec(r"/statistics", KeywordStatisticsPageHandler),
10 | ]
11 | 


--------------------------------------------------------------------------------