├── .gitignore ├── .travis.yml ├── LICENSE ├── MANIFEST.in ├── README.md ├── README.rst ├── crwy ├── __init__.py ├── changetmpl.py ├── cmdline.py ├── decorates.py ├── exceptions.py ├── settings │ ├── __init__.py │ └── default_settings.py ├── spider.py ├── templates │ ├── project │ │ ├── logger_py2.conf.tmpl │ │ └── logger_py3.conf.tmpl │ └── spiders │ │ ├── crwybasic.tmpl │ │ ├── crwycrawl.tmpl │ │ └── crwyredis.tmpl └── utils │ ├── __init__.py │ ├── common.py │ ├── data │ ├── RedisHash.py │ └── __init__.py │ ├── extend │ ├── __init__.py │ ├── chaojiying.py │ ├── dingding_robot.py │ ├── xunma.py │ └── yima.py │ ├── filter │ ├── RedisSet.py │ ├── RedisSortedSet.py │ └── __init__.py │ ├── html │ ├── __init__.py │ ├── font_analysis.py │ ├── html_downloader.py │ └── html_parser.py │ ├── load_settings.py │ ├── logger.py │ ├── mail.py │ ├── no_sql │ ├── __init__.py │ └── redis_m.py │ ├── pyppeteer_api.py │ ├── queue │ ├── RedisQueue.py │ ├── SsdbQueue.py │ └── __init__.py │ ├── scrapy_plugs │ ├── __init__.py │ ├── dupefilters.py │ ├── middlewares.py │ ├── pipelines.py │ └── settings.py │ ├── selenium_api.py │ └── sql │ ├── __init__.py │ ├── mysql.py │ ├── pg.py │ └── sqlalchemy_m.py ├── requirements.txt └── setup.py /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | .DS_Store 3 | .idea 4 | *.db 5 | *.conf -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | python: 3 | - '2.7' 4 | - '3.6' 5 | script: python setup.py install 6 | deploy: 7 | provider: pypi 8 | user: wuyue 9 | password: 10 | secure: 7sJA2nimxQQLdm6iifp0cC2ccDj+rPdzRPVB9BlNQV03G7ev37eXC3D2BWZcUyCUhUrIPFRlYuAxrcq6XlKJ9+0vQH8XhWpaMQhxcU2Zca8+AVFZ3yWrPihRGGWOvBpNQ25d5dc9SPMQIvzJdnVTgkbxQf3kvEk3rSSKfPWScKZZWYWUf2btunRzJSC24O6BvcbU9XW7dgXTUR7wb5P1JDFsQ3+U4DK7X+HEGq4TO1rYobEEw9Bnf8RGQuR1L64vusza3TDTag3D5yQ3iC8rX9GLxLQGlVnVlTUuj1jfw78m6jSQgNDB7Eyt3Nk9kbSDlSeed/uD+aWSDm8jh/RinZ0/OBq/yUz7/hkermvevZgnGQq36TH5L1xzlphAIO39gLL0RtEPYLw24jUmE+fRK8C5g2YLpVaV4JKqtrh2qNKjbCXXSXYXIN2cdkjRCXAEfs6bOhhrV1JecOwseIfG+gQLzY/WSUU0OCNnPZo21kl4kKH45hI96QwXLM6PPfk69JWE3DIPTB5F/Nht5YZfi66Ni9a/0LLew4qKaGBk19UvguMAfU8LonN/m+REoNJRGdaaSPq6BH224NtqnnFm/brqfQ/ZZCFXCaRoNPUip2k2wShfkH6LjG5BYsm3V83xJikFphrgNFHRuY3mFe6bA6SedvJvLH2/LpV28vbRh1A= 11 | skip_existing: true 12 | skip_cleanup: true 13 | on: 14 | tag: true 15 | all_branches: true 16 | 17 | branches: 18 | except: 19 | - develop 20 | - master 21 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2018 wuyue 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 
-------------------------------------------------------------------------------- /MANIFEST.in: --------------------------------------------------------------------------------
1 | include MANIFEST.in
2 | include README.md
3 | include requirements.txt
4 | recursive-include crwy/templates *
5 | 
-------------------------------------------------------------------------------- /README.md: --------------------------------------------------------------------------------
1 | # Crwy
2 | 
3 | [![PyPI Version](https://img.shields.io/pypi/v/Crwy.svg)](https://pypi.python.org/pypi/Crwy)
4 | [![Download Status](https://img.shields.io/pypi/dm/Crwy.svg)](https://pypi.python.org/pypi/Crwy)
5 | [![Build Status](https://travis-ci.org/wuyue92tree/crwy.svg)](https://travis-ci.org/wuyue92tree/crwy)
6 | [![License Status](https://img.shields.io/github/license/wuyue92tree/crwy)](https://raw.githubusercontent.com/wuyue92tree/crwy/master/LICENSE)
7 | 
8 | 
9 | # Introduction
10 | 
11 | Crwy is a lightweight crawling framework whose structure is modeled on Scrapy. It provides practical spider templates intended to help you implement crawling tasks quickly and develop efficiently, and it offers reusable building blocks for scrapy users ^.^. gevent support has been added so that spiders run asynchronously and therefore faster.
12 | 
13 | # Requirements
14 | 
15 | 
16 | * Python3
17 | * Works on Linux, Mac OSX
18 | 
19 | # Installation
20 | 
21 | 
22 | Quick install
23 | ```
24 | pip install crwy
25 | ```
26 | 
27 | or
28 | download from: https://pypi.python.org/pypi/Crwy/
29 | 
30 | 
31 | # TODO
32 | 
33 | - improve scrapy_plugs
34 | - improve selenium_api
35 | - python3 compatibility
-------------------------------------------------------------------------------- /README.rst: --------------------------------------------------------------------------------
1 | Crwy
2 | ====
3 | 
4 | |PyPI Version| |Download Status| |Build Status| |License Status|
5 | 
6 | Introduction
7 | ============
8 | 
9 | Crwy is a lightweight crawling framework whose structure is modeled on Scrapy. It provides practical spider templates intended to help you implement crawling tasks quickly and develop efficiently, and it offers reusable building blocks for scrapy users ^.^. gevent support has been added so that spiders run asynchronously and therefore faster.
10 | 
11 | Requirements
12 | ============
13 | 
14 | - Python3
15 | - Works on Linux, Mac OSX
16 | 
17 | 
18 | Installation
19 | ============
20 | 
21 | Quick install
22 | 
23 | ::
24 | 
25 |     pip install crwy
26 | 
27 | or download from: https://pypi.python.org/pypi/Crwy/
28 | 
29 | TODO
30 | ====
31 | 
32 | - improve scrapy_plugs
33 | - improve selenium_api
34 | - python3 compatibility
35 | 
36 | .. |PyPI Version| image:: https://img.shields.io/pypi/v/Crwy.svg
37 |    :target: https://pypi.python.org/pypi/Crwy
38 | .. |Download Status| image:: https://img.shields.io/pypi/dm/Crwy.svg
39 |    :target: https://pypi.python.org/pypi/Crwy
40 | .. |Build Status| image:: https://travis-ci.org/wuyue92tree/crwy.svg
41 |    :target: https://travis-ci.org/wuyue92tree/crwy
42 | .. |License Status| image:: https://img.shields.io/github/license/wuyue92tree/crwy
43 |    :target: https://raw.githubusercontent.com/wuyue92tree/crwy/master/LICENSE
-------------------------------------------------------------------------------- /crwy/__init__.py: --------------------------------------------------------------------------------
1 | version = '1.7.1'
2 | 
-------------------------------------------------------------------------------- /crwy/changetmpl.py: --------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | # author: wuyue92tree@163.com
4 | 
5 | from string import Template
6 | 
7 | try:
8 |     from configparser import ConfigParser
9 | except ImportError:
10 |     from ConfigParser import ConfigParser  # Python 2
11 | 
12 | 
13 | def get_project_name():
14 |     conf = ConfigParser()
15 |     try:
16 |         conf.read('crwy.cfg', encoding='utf-8')
17 |     except TypeError:
18 |         # Python 2's ConfigParser.read() has no encoding argument
19 |         conf.read('crwy.cfg')
20 |     # return text, not bytes: Template.substitute() expects a str
21 |     return conf.get('project', 'project_name')
22 | 
23 | 
24 | def change_project_name(name, path):
25 |     with open(path, 'r') as f:
26 |         return Template(f.read()).substitute(project_name=name)
27 | 
28 | 
29 | def change_spider_name(name, path):
30 |     class_name = name.capitalize()
31 |     spider_name = name
32 |     project_name = get_project_name()
33 |     with open(path, 'r') as f:
34 |         return Template(f.read()).substitute(class_name=class_name,
35 |                                              spider_name=spider_name,
36 |                                              project_name=project_name)
-------------------------------------------------------------------------------- /crwy/cmdline.py: --------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | # author: wuyue92tree@163.com
4 | 
5 | from __future__ import print_function
6 | import os
7 | import shutil
8 | import scrapy
9 | from optparse import OptionParser
10 | from crwy import version
11 | from crwy.settings.default_settings import TEMPLATE_DIR
12 | 
13 | CRWY_SPIDER_TEMPLATE_DIR = os.path.join(TEMPLATE_DIR, 'spiders')
14 | SCRAPY_SPIDER_TEMPLATE_DIR = os.path.join(scrapy.__path__[0],
15 |                                           'templates/spiders')
16 | 
17 | 
18 | def install():
19 |     scrapy_tmpl = os.listdir(SCRAPY_SPIDER_TEMPLATE_DIR)
20 |     for tmpl in os.listdir(CRWY_SPIDER_TEMPLATE_DIR):
21 |         if tmpl in scrapy_tmpl:
22 |             print('{} exists.'.format(tmpl))
23 |             continue
24 |         shutil.copy(os.path.join(CRWY_SPIDER_TEMPLATE_DIR, tmpl),
25 |                     os.path.join(SCRAPY_SPIDER_TEMPLATE_DIR, tmpl))
26 |         print('{} installed.'.format(tmpl))
27 | 
28 | 
29 | def uninstall():
30 |     crwy_tmpl = os.listdir(CRWY_SPIDER_TEMPLATE_DIR)
31 |     for tmpl in os.listdir(SCRAPY_SPIDER_TEMPLATE_DIR):
32 |         if tmpl not in crwy_tmpl:
33 |             print('{} not match, skip.'.format(tmpl))
34 |             continue
35 |         os.remove(os.path.join(SCRAPY_SPIDER_TEMPLATE_DIR, tmpl))
36 |         print('{} uninstalled.'.format(tmpl))
37 | 
38 | 
39 | def execute():
40 |     parser = OptionParser(usage="Usage: crwy [options] arg1 arg2")
41 |     parser.add_option('-i', '--install', action="store_true",
42 |                       help='install crwy tmpl for scrapy')
43 |     parser.add_option('-u', '--uninstall', action="store_true",
44 |                       help='uninstall crwy tmpl for scrapy')
45 |     parser.add_option('-v', '--version', action="store_true",
46 |                       help='print version')
47 |     options, args = parser.parse_args()
48 |     if options.version:
49 |         print(version)
50 |     elif options.install:
51 |         install()
52 |     elif options.uninstall:
53 |         uninstall()
54 |     else:
55 |         parser.print_help()
56 | 
57 | 
58 | if __name__ == '__main__':
59 |     execute()
60 | 
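A minimal usage sketch (assuming crwy and scrapy are installed in the same environment): `crwy -i` copies the bundled templates into scrapy's template directory, after which scrapy's stock `genspider -t` command can generate spiders from them. The printed lines follow the `install()` output format above:

    $ crwy -i
    crwybasic.tmpl installed.
    crwycrawl.tmpl installed.
    crwyredis.tmpl installed.
    $ scrapy genspider -t crwybasic demo example.com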
-------------------------------------------------------------------------------- /crwy/decorates.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | """ 4 | @author: wuyue 5 | @contact: wuyue92tree@163.com 6 | @software: PyCharm 7 | @file: decorates.py 8 | @create at: 2017-12-07 09:47 9 | 10 | 这一行开始写关于本文件的说明与解释 11 | """ 12 | 13 | import functools 14 | from crwy.exceptions import CrwyCookieValidException 15 | 16 | 17 | def cls2singleton(cls, *args, **kwargs): 18 | """ 19 | 将类转换为单例模式 20 | :param cls: 21 | :param args: 22 | :param kwargs: 23 | :return: 24 | """ 25 | instances = {} 26 | 27 | def _singleton(*args, **kwargs): 28 | if kwargs.pop('cls_singleton', True) is False: 29 | return cls(*args, **kwargs) 30 | if cls not in instances: 31 | instances[cls] = cls(*args, **kwargs) 32 | return instances[cls] 33 | 34 | return _singleton 35 | 36 | 37 | def cls_catch_exception(func): 38 | """ 39 | 该装饰器用于捕捉类方法异常 40 | 1. 未出现异常,直接return方法执行结果 41 | 2. 出现异常,则先将异常记入日志,再抛出异常 42 | :param func: 43 | :return: 44 | """ 45 | 46 | @functools.wraps(func) 47 | def wrapper(self, *args, **kwargs): 48 | try: 49 | return func(self, *args, **kwargs) 50 | except Exception as e: 51 | self.logger.exception(e) 52 | raise e 53 | 54 | return wrapper 55 | 56 | 57 | def cls_refresh_cookie(func): 58 | """ 59 | 该装饰器用于捕捉类方法异常 CrwyCookieValidException 60 | 1. 未出现异常,直接return方法执行结果 61 | 2. 出现异常,则先调用self.get_cookie()进行cookie刷新,若cookie刷新成功, 62 | 直接return返回 63 | :param func: 64 | :return: 65 | """ 66 | 67 | @functools.wraps(func) 68 | def wrapper(self, *args, **kwargs): 69 | try: 70 | return func(self, *args, **kwargs) 71 | except CrwyCookieValidException as e: 72 | if not self.get_cookie(): 73 | self.logger.warning("Func[%s]: cookie更新失败." % func.__name__) 74 | raise e 75 | self.logger.info("Func[%s]: cookie更新成功." % func.__name__) 76 | return func(self, *args, **kwargs) 77 | 78 | return wrapper 79 | -------------------------------------------------------------------------------- /crwy/exceptions.py: -------------------------------------------------------------------------------- 1 | #! 
/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | """ 4 | @author: wuyue 5 | @contact: wuyue92tree@163.com 6 | @software: PyCharm 7 | @file: exceptions.py 8 | @create at: 2017-12-13 14:14 9 | 10 | 这一行开始写关于本文件的说明与解释 11 | """ 12 | 13 | 14 | class CrwyException(Exception): 15 | def __init__(self, value): 16 | self.value = value 17 | 18 | def __str__(self): 19 | return repr(self.value) 20 | 21 | 22 | class CrwyImportException(CrwyException): 23 | pass 24 | 25 | 26 | class CrwyKafkaException(CrwyException): 27 | pass 28 | 29 | 30 | class CrwyMnsException(CrwyException): 31 | pass 32 | 33 | 34 | class CrwyDbException(CrwyException): 35 | pass 36 | 37 | 38 | class CrwyExtendException(CrwyException): 39 | pass 40 | 41 | 42 | class CrwyCookieValidException(CrwyException): 43 | pass 44 | 45 | 46 | class CrwyScrapyPlugsException(CrwyException): 47 | pass 48 | -------------------------------------------------------------------------------- /crwy/settings/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wuyue92tree/crwy/da19e3a7615990612e41d0d871345075150b88f6/crwy/settings/__init__.py -------------------------------------------------------------------------------- /crwy/settings/default_settings.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # author: wuyue92tree@163.com 4 | 5 | import os 6 | 7 | BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) 8 | 9 | DATEBASE_DIR = os.path.join(BASE_DIR, 'data') 10 | 11 | TEMPLATE_DIR = os.path.join(BASE_DIR, 'templates') 12 | 13 | CONF_DIR = os.path.join(BASE_DIR, 'crwy') -------------------------------------------------------------------------------- /crwy/spider.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # author: wuyue92tree@163.com 4 | 5 | import inspect 6 | from crwy.utils.html.html_downloader import HtmlDownloader 7 | from crwy.utils.html.html_parser import HtmlParser 8 | 9 | 10 | class BaseSpider(object): 11 | """ Spider基础类 """ 12 | def __init__(self): 13 | """ 14 | 初始化下载器/解析器及日志接口 15 | """ 16 | self.html_downloader = HtmlDownloader() 17 | self.html_parser = HtmlParser() 18 | 19 | 20 | class Spider(BaseSpider): 21 | """ Spider类 提供基本方法 """ 22 | def __init__(self, logger=None): 23 | super(Spider, self).__init__() 24 | self.login_kwargs = None # 用于存放登录时所需的参数 25 | self.proxies = None 26 | if logger: 27 | self.logger = logger 28 | else: 29 | from crwy.utils.logger import Logger 30 | self.logger = Logger.timed_rt_logger() 31 | 32 | def login(self, *args, **kwargs): 33 | pass 34 | 35 | def clean(self, *args, **kwargs): 36 | pass 37 | 38 | def save(self, *args, **kwargs): 39 | pass 40 | 41 | def get_cookie(self): 42 | pass 43 | 44 | @staticmethod 45 | def func_name(): 46 | """ 返回函数名称 """ 47 | return inspect.stack()[1][3] 48 | -------------------------------------------------------------------------------- /crwy/templates/project/logger_py2.conf.tmpl: -------------------------------------------------------------------------------- 1 | #logger.conf 2 | ############################################### 3 | [loggers] 4 | keys=root,fileLogger,rtLogger,timedRtLogger 5 | 6 | [logger_root] 7 | level=INFO 8 | handlers=consoleHandler 9 | 10 | [logger_fileLogger] 11 | handlers=consoleHandler,fileHandler 12 | qualname=fileLogger 13 | propagate=0 14 | 15 | [logger_rtLogger] 16 | 
handlers=consoleHandler,rtHandler 17 | qualname=rtLogger 18 | propagate=0 19 | 20 | [logger_timedRtLogger] 21 | handlers=consoleHandler,timedRtHandler 22 | qualname=timedRtLogger 23 | propagate=0 24 | 25 | ############################################### 26 | [handlers] 27 | keys=consoleHandler,fileHandler,rtHandler,timedRtHandler 28 | 29 | [handler_consoleHandler] 30 | class=StreamHandler 31 | level=DEBUG 32 | formatter=simpleFmt 33 | args=(sys.stderr,) 34 | 35 | [handler_fileHandler] 36 | class=FileHandler 37 | level=DEBUG 38 | formatter=defaultFmt 39 | args=('./log/default.log', 'a') 40 | 41 | [handler_rtHandler] 42 | class=handlers.RotatingFileHandler 43 | level=DEBUG 44 | formatter=defaultFmt 45 | args=('./log/default.log', 'a', 100*1024*1024, 10) 46 | 47 | [handler_timedRtHandler] 48 | class=handlers.TimedRotatingFileHandler 49 | level=DEBUG 50 | formatter=defaultFmt 51 | args=('./log/default.log', 'midnight', 1, 0) 52 | 53 | 54 | ############################################### 55 | 56 | [formatters] 57 | keys=defaultFmt,simpleFmt 58 | 59 | [formatter_defaultFmt] 60 | format=%(asctime)s %(filename)s %(funcName)s %(processName)s %(threadName)s [line:%(lineno)d] %(levelname)s %(message)s 61 | datefmt=%Y-%m-%d %H:%M:%S 62 | 63 | [formatter_simpleFmt] 64 | format=%(asctime)s %(levelname)s %(message)s 65 | datefmt=%Y-%m-%d %H:%M:%S -------------------------------------------------------------------------------- /crwy/templates/project/logger_py3.conf.tmpl: -------------------------------------------------------------------------------- 1 | #logger.conf 2 | ############################################### 3 | [loggers] 4 | keys=root,fileLogger,rtLogger,timedRtLogger 5 | 6 | [logger_root] 7 | level=INFO 8 | handlers=consoleHandler 9 | 10 | [logger_fileLogger] 11 | handlers=consoleHandler,fileHandler 12 | qualname=fileLogger 13 | propagate=0 14 | 15 | [logger_rtLogger] 16 | handlers=consoleHandler,rtHandler 17 | qualname=rtLogger 18 | propagate=0 19 | 20 | [logger_timedRtLogger] 21 | handlers=consoleHandler,timedRtHandler 22 | qualname=timedRtLogger 23 | propagate=0 24 | 25 | ############################################### 26 | [handlers] 27 | keys=consoleHandler,fileHandler,rtHandler,timedRtHandler 28 | 29 | [handler_consoleHandler] 30 | class=StreamHandler 31 | level=DEBUG 32 | formatter=simpleFmt 33 | args=(sys.stderr,) 34 | 35 | [handler_fileHandler] 36 | class=FileHandler 37 | level=DEBUG 38 | formatter=defaultFmt 39 | args=('./log/default.log', 'a', 'utf-8') 40 | 41 | [handler_rtHandler] 42 | class=handlers.RotatingFileHandler 43 | level=DEBUG 44 | formatter=defaultFmt 45 | args=('./log/default.log', 'a', 100*1024*1024, 10, 'utf-8') 46 | 47 | [handler_timedRtHandler] 48 | class=handlers.TimedRotatingFileHandler 49 | level=DEBUG 50 | formatter=defaultFmt 51 | args=('./log/default.log', 'midnight', 1, 0, 'utf-8') 52 | 53 | 54 | ############################################### 55 | 56 | [formatters] 57 | keys=defaultFmt,simpleFmt 58 | 59 | [formatter_defaultFmt] 60 | format=%(asctime)s %(filename)s %(funcName)s %(processName)s %(threadName)s [line:%(lineno)d] %(levelname)s: %(message)s 61 | datefmt=%Y-%m-%d %H:%M:%S 62 | 63 | [formatter_simpleFmt] 64 | format=%(asctime)s %(levelname)s: %(message)s 65 | datefmt=%Y-%m-%d %H:%M:%S -------------------------------------------------------------------------------- /crwy/templates/spiders/crwybasic.tmpl: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from __future__ import 
print_function 4 | import logging 5 | import scrapy 6 | from crwy.spider import BaseSpider 7 | 8 | 9 | class $classname(scrapy.Spider, BaseSpider): 10 | name = '$name' 11 | allowed_domains = ['$domain'] 12 | start_urls = ['http://$domain/'] 13 | 14 | custom_settings = { 15 | 'LOG_LEVEL': logging.INFO, 16 | 'LOG_ENCODING': 'utf-8', 17 | 'LOG_FORMAT': '%(asctime)s %(filename)s %(funcName)s %(processName)s ' 18 | '%(threadName)s [line:%(lineno)d] ' 19 | '%(levelname)s: %(message)s' 20 | } 21 | 22 | def __init__(self, *args, **kwargs): 23 | super($classname, self).__init__(*args, **kwargs) 24 | BaseSpider.__init__(self) 25 | 26 | def parse(self, response): 27 | pass 28 | -------------------------------------------------------------------------------- /crwy/templates/spiders/crwycrawl.tmpl: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from __future__ import print_function 4 | import logging 5 | import scrapy 6 | from scrapy.linkextractors import LinkExtractor 7 | from scrapy.spiders import CrawlSpider, Rule 8 | from crwy.spider import BaseSpider 9 | 10 | 11 | class $classname(CrawlSpider, BaseSpider): 12 | name = '$name' 13 | allowed_domains = ['$domain'] 14 | start_urls = ['http://$domain/'] 15 | 16 | custom_settings = { 17 | 'LOG_LEVEL': logging.INFO, 18 | 'LOG_ENCODING': 'utf-8', 19 | 'LOG_FORMAT': '%(asctime)s %(filename)s %(funcName)s %(processName)s ' 20 | '%(threadName)s [line:%(lineno)d] ' 21 | '%(levelname)s: %(message)s' 22 | } 23 | 24 | rules = ( 25 | Rule(LinkExtractor(allow=r'Items/'), callback='parse_item', follow=True), 26 | ) 27 | 28 | def __init__(self, *args, **kwargs): 29 | super($classname, self).__init__(*args, **kwargs) 30 | BaseSpider.__init__(self) 31 | 32 | def parse_item(self, response): 33 | item = {} 34 | #item['domain_id'] = response.xpath('//input[@id="sid"]/@value').get() 35 | #item['name'] = response.xpath('//div[@id="name"]').get() 36 | #item['description'] = response.xpath('//div[@id="description"]').get() 37 | return item 38 | -------------------------------------------------------------------------------- /crwy/templates/spiders/crwyredis.tmpl: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from __future__ import print_function 4 | import logging 5 | import scrapy 6 | from scrapy_redis.spiders import RedisSpider 7 | from crwy.spider import BaseSpider 8 | 9 | 10 | class $classname(RedisSpider, BaseSpider): 11 | name = '$name' 12 | allowed_domains = ['$domain'] 13 | redis_key = 'crawl_task:$name:start_urls' 14 | 15 | custom_settings = { 16 | 'SPIDER_NAME': '$name', 17 | 'DUPEFILTER_DO_HASH': False, 18 | # 'DUPEFILTER_DELAY_DAY': 2, 19 | 'DUPEFILTER_CLASS': 20 | 'crwy.utils.scrapy_plugs.dupefilters.RedisRFPDupeFilter', 21 | 'REDIS_URL': 'redis://root:password@host:port/db', 22 | 'LOG_LEVEL': logging.INFO, 23 | 'LOG_ENCODING': 'utf-8', 24 | 'LOG_FORMAT': '%(asctime)s %(filename)s %(funcName)s %(processName)s ' 25 | '%(threadName)s [line:%(lineno)d] ' 26 | '%(levelname)s: %(message)s' 27 | } 28 | 29 | def __init__(self, *args, **kwargs): 30 | super($classname, self).__init__(*args, **kwargs) 31 | BaseSpider.__init__(self) 32 | 33 | def parse(self, response): 34 | # use dupefilter_key filter with redis set or sorted set 35 | # 1. add a dupefilter_key, meta['dupefilter_key'] = url.encode('utf-8') 36 | # 2. 
rm a dupefilter_key, release_dupefilter_key.call(spider, request.meta.get('dupefilter_key')) 37 | pass 38 | -------------------------------------------------------------------------------- /crwy/utils/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # author: wuyue92tree@163.com 4 | 5 | -------------------------------------------------------------------------------- /crwy/utils/common.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | """ 4 | @author: wuyue 5 | @contact: wuyue92tree@163.com 6 | @software: PyCharm 7 | @file: common.py 8 | @create at: 2017-12-12 18:01 9 | 10 | 这一行开始写关于本文件的说明与解释 11 | """ 12 | 13 | import os 14 | import re 15 | import datetime 16 | 17 | try: 18 | import ConfigParser as configparser 19 | except ImportError: 20 | import configparser 21 | 22 | __all__ = [ 23 | 'cookie2str', 'cookie2dict', 'config_handle', 24 | 'file_handle', 'datetime2str', 'str2datetime', 25 | 'dict2obj', 'obj2dict', 'remove_emoji', 'change_kv', 26 | 'remove_item_from_dict', 'splice_list' 27 | ] 28 | 29 | 30 | def cookie2str(cookie_dict): 31 | """ 32 | 将requests 字典类型cookie转换成字符串 33 | :param cookie_dict: dict 34 | :return: string 35 | """ 36 | return '; '.join( 37 | [name + '=' + cookie_dict.get(name) for name in cookie_dict]) 38 | 39 | 40 | def cookie2dict(cookie_str): 41 | """ 42 | 将cookie_str转换成requests可用的dict类型 43 | :param cookie_str: string 44 | :return: dict 45 | """ 46 | cookie_dict = dict() 47 | for item in cookie_str.strip().replace(' ', '').split(';'): 48 | if not item: 49 | continue 50 | name, value = item.split('=', 1) 51 | cookie_dict[name] = value 52 | return cookie_dict 53 | 54 | 55 | def datetime2str(target, fmt='%Y-%m-%d %H:%M:%S'): 56 | """ 57 | 将datetime对象转换成字符串 58 | :param target: datetime 59 | :param fmt: string 60 | :return: string 61 | """ 62 | return datetime.datetime.strftime(target, fmt) 63 | 64 | 65 | def str2datetime(target, fmt='%Y-%m-%d %H:%M:%S'): 66 | """ 67 | 将string转换成datetime对象 68 | :param target: string 69 | :param fmt: string 70 | :return: datetime 71 | """ 72 | return datetime.datetime.strptime(target, fmt) 73 | 74 | 75 | def dict2obj(target, change_dict=True): 76 | """ 77 | 将dict转换成obj对象 78 | change_dict 用于控制是否转换target内部dict为obj 79 | 80 | :param target: dict 81 | :param change_dict: bool 82 | :return: obj 83 | """ 84 | 85 | class Obj(object): 86 | def __init__(self, d, change_dict): 87 | for a, b in d.items(): 88 | if change_dict is True: 89 | if isinstance(b, (list, tuple)): 90 | setattr(self, a, 91 | [Obj(x, change_dict) if isinstance(x, dict) else x 92 | for x in b]) 93 | else: 94 | setattr(self, a, Obj(b, change_dict) if isinstance( 95 | b, dict) else b) 96 | else: 97 | setattr(self, a, b) 98 | 99 | return Obj(target, change_dict=change_dict) 100 | 101 | 102 | def obj2dict(target): 103 | """ 104 | 将obj对象转换成dict 105 | :param target: obj 106 | :return: dict 107 | """ 108 | return target.__dict__ 109 | 110 | 111 | def config_handle(path): 112 | """ 113 | 用于对Config配置文件进行操作,初始化config_path 114 | :param path: config文件路径 115 | :return: 返回config对象 116 | """ 117 | config = configparser.ConfigParser() 118 | config.read(path) 119 | return config 120 | 121 | 122 | def file_handle(path, file_name, mode='r'): 123 | """ 124 | 用于对普通文件进行操作 125 | :param path: 文件路径 126 | :param file_name: 文件名称 127 | :param mode: 加载模式,默认'r' 128 | :return: file对象 129 | """ 130 | if path[-1] == '/': 
131 | real_path = path + file_name 132 | else: 133 | real_path = path + '/' + file_name 134 | 135 | if not os.path.exists(path): 136 | os.makedirs(path) 137 | 138 | return open(real_path, mode=mode) 139 | 140 | 141 | def remove_emoji(content): 142 | """ 143 | 表情符去除 144 | :param content: unicode 145 | :return: unicode 146 | """ 147 | pattern = re.compile( 148 | u"(\ud83d[\ude00-\ude4f])|" # emoticons 149 | u"(\ud83c[\udf00-\uffff])|" # symbols & pictographs (1 of 2) 150 | u"(\ud83d[\u0000-\uddff])|" # symbols & pictographs (2 of 2) 151 | u"(\ud83d[\ude80-\udeff])|" # transport & map symbols 152 | u"(\ud83c[\udde0-\uddff])" # flags (iOS) 153 | "+", flags=re.UNICODE) 154 | return pattern.sub(r'', content) 155 | 156 | 157 | def change_kv(dict_ori): 158 | """ 159 | 字典kv调换 160 | :param dict_ori: 原字典 161 | :return: 新字典 162 | """ 163 | return dict(zip(dict_ori.values(), dict_ori.keys())) 164 | 165 | 166 | def remove_item_from_dict(obj, keys_to_remove): 167 | """ 168 | 移除字典中某些item 169 | :param obj: 170 | :param keys_to_remove: 171 | :return: 172 | """ 173 | for key in keys_to_remove: 174 | if obj.get(key, ''): 175 | obj.pop(key) 176 | return obj 177 | 178 | 179 | def splice_list(obj_list, group_number=3): 180 | """ 181 | 分割列表 182 | :param obj_list: 183 | :param group_number: 184 | :return: 185 | """ 186 | if len(obj_list) < group_number: 187 | raise Exception('obj_list length must greater than group_number.') 188 | 189 | distance = int(len(obj_list) / group_number) 190 | new_list = [] 191 | for group in range(group_number): 192 | if group == group_number - 1: 193 | # 若有超出部分并入最后一组 194 | new_list.append(obj_list[distance*group:len(obj_list)]) 195 | else: 196 | new_list.append(obj_list[distance*group:distance*(group+1)]) 197 | return new_list 198 | -------------------------------------------------------------------------------- /crwy/utils/data/RedisHash.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # author: wuyue92tree@163.com 4 | 5 | from crwy.utils.no_sql.redis_m import get_redis_client 6 | 7 | 8 | class RedisHash(object): 9 | """Simple Hash with Redis Backend""" 10 | 11 | def __init__(self, name, server=None, **redis_kwargs): 12 | """ 13 | The default connection parameters are: 14 | host='localhost', port=6379, db=0 15 | """ 16 | if server: 17 | self.__db = server 18 | else: 19 | self.__db = get_redis_client(**redis_kwargs) 20 | self.key = name 21 | 22 | def hget(self, item): 23 | """Get item value.""" 24 | return self.__db.hget(self.key, item) 25 | 26 | def hset(self, item, value): 27 | """Set item value.""" 28 | return self.__db.hset(self.key, item, value) 29 | 30 | def hexists(self, item): 31 | """Is item exist.""" 32 | return self.__db.hexists(self.key, item) 33 | 34 | def hlen(self): 35 | """Return total count.""" 36 | return self.__db.hlen(self.key) 37 | 38 | def hkeys(self): 39 | return self.__db.hkeys(self.key) 40 | 41 | def clean(self): 42 | """Empty key""" 43 | return self.__db.delete(self.key) 44 | 45 | def db(self): 46 | return self.__db 47 | -------------------------------------------------------------------------------- /crwy/utils/data/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wuyue92tree/crwy/da19e3a7615990612e41d0d871345075150b88f6/crwy/utils/data/__init__.py -------------------------------------------------------------------------------- /crwy/utils/extend/__init__.py: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/wuyue92tree/crwy/da19e3a7615990612e41d0d871345075150b88f6/crwy/utils/extend/__init__.py -------------------------------------------------------------------------------- /crwy/utils/extend/chaojiying.py: --------------------------------------------------------------------------------
1 | #! /usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | """
4 | @author: wuyue
5 | @contact: wuyue92tree@163.com
6 | @software: PyCharm
7 | @file: chaojiying.py
8 | @create at: 2018-05-11 16:33
9 | 
10 | Client for the chaojiying.com captcha-recognition service.
11 | """
12 | 
13 | import requests
14 | from hashlib import md5
15 | 
16 | 
17 | class ChaoJiYingApi(object):
18 |     def __init__(self, username, password, soft_id):
19 |         self.username = username
20 |         # md5() requires bytes on Python 3, so encode the password first
21 |         self.password = md5(password.encode('utf-8')).hexdigest()
22 |         self.soft_id = soft_id
23 |         self.base_params = {
24 |             'user': self.username,
25 |             'pass2': self.password,
26 |             'softid': self.soft_id,
27 |         }
28 |         self.headers = {
29 |             'Connection': 'Keep-Alive',
30 |             'User-Agent': 'Mozilla/4.0 (compatible; MSIE 8.0;'
31 |                           ' Windows NT 5.1; Trident/4.0)',
32 |         }
33 | 
34 |     def post_pic(self, im, code_type):
35 |         """
36 |         im: image bytes
37 |         code_type: captcha type, see http://www.chaojiying.com/price.html
38 |         """
39 |         params = {
40 |             'codetype': code_type,
41 |         }
42 |         params.update(self.base_params)
43 |         files = {'userfile': ('ccc.jpg', im)}
44 |         r = requests.post('http://upload.chaojiying.net/Upload/Processing.php',
45 |                           data=params, files=files, headers=self.headers)
46 |         return r.json()
47 | 
48 |     def report_error(self, im_id):
49 |         """
50 |         im_id: picture ID of a misrecognized captcha
51 |         """
52 |         params = {
53 |             'id': im_id,
54 |         }
55 |         params.update(self.base_params)
56 |         r = requests.post(
57 |             'http://upload.chaojiying.net/Upload/ReportError.php', data=params,
58 |             headers=self.headers)
59 |         return r.json()
60 | 
61 |     def decode(self, img_path, code_type):
62 |         with open(img_path, 'rb') as f:
63 |             im = f.read()
64 |         res = self.post_pic(im, code_type)
65 |         # {u'err_str': u'OK', u'err_no': 0,
66 |         #  u'md5': u'a11171f1f444e8d1992926f4ca16c7d8',
67 |         #  u'pic_id': u'6031116291508600001',
68 |         #  u'pic_str': u'113,72|220,81|138,101'}
69 |         if res.get('err_no') == 0 and res.get('err_str') == u'OK':
70 |             return res.get('pic_str')
71 |         return
72 | 
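A minimal usage sketch for ChaoJiYingApi (hypothetical credentials and soft_id; the response fields follow the sample dict shown in decode() above):

    client = ChaoJiYingApi('user', 'password', '96001')
    res = client.post_pic(open('./captcha.jpg', 'rb').read(), 9004)
    if res.get('err_no') == 0:
        print(res.get('pic_str'))               # e.g. '113,72|220,81'
    else:
        client.report_error(res.get('pic_id'))  # flag a bad recognition

-------------------------------------------------------------------------------- /crwy/utils/extend/dingding_robot.py: -------------------------------------------------------------------------------- 1 | #! 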
/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | """
4 | @author: wuyue
5 | @contact: wuyue92tree@163.com
6 | @software: PyCharm
7 | @file: dingding_robot.py
8 | @create at: 2017-10-24 10:57
9 | 
10 | Client for the DingTalk (dingding) group-robot webhook API.
11 | """
12 | 
13 | import json
14 | 
15 | from crwy.spider import BaseSpider
16 | from crwy.exceptions import CrwyExtendException
17 | 
18 | 
19 | class DingDingRobot(BaseSpider):
20 |     def __init__(self, access_token=None,
21 |                  api_url="https://oapi.dingtalk.com/robot/send?access_token="):
22 |         super(DingDingRobot, self).__init__()
23 |         # the token, not the default api_url, is what the caller must supply
24 |         if not access_token:
25 |             raise CrwyExtendException('access_token unset.')
26 |         self.api_url = api_url
27 |         self.header = {'Content-Type': 'application/json'}
28 |         self.access_token = access_token
29 |         self.html_downloader.session.headers = self.header
30 | 
31 |     def send_text(self, content, at_mobiles=list(), is_at_all=False):
32 |         try:
33 |             data = {
34 |                 "text": {
35 |                     "content": content
36 |                 },
37 |                 "msgtype": "text",
38 |                 "at": {
39 |                     "isAtAll": is_at_all,
40 |                     "atMobiles": at_mobiles
41 |                 }
42 |             }
43 | 
44 |             res = self.html_downloader.download(
45 |                 self.api_url + self.access_token,
46 |                 method='POST',
47 |                 data=json.dumps(data))
48 |             return res
49 |         except Exception as e:
50 |             raise CrwyExtendException(e)
51 | 
52 |     def send_markdown(self, title, content, at_mobiles=list(),
53 |                       is_at_all=False):
54 |         try:
55 |             data = {
56 |                 "msgtype": "markdown",
57 |                 "markdown": {
58 |                     "title": title,
59 |                     "text": content
60 |                 },
61 |                 "at": {
62 |                     "atMobiles": at_mobiles,
63 |                     "isAtAll": is_at_all
64 |                 }
65 |             }
66 | 
67 |             res = self.html_downloader.download(
68 |                 self.api_url + self.access_token,
69 |                 method='POST',
70 |                 data=json.dumps(data))
71 |             return res
72 |         except Exception as e:
73 |             raise CrwyExtendException(e)
74 | 
75 |     def send_action_card(self, title, content, hide_avatar="0",
76 |                          btn_orientation="0", single_title="阅读全文",
77 |                          single_url="#"):
78 |         try:
79 |             data = {
80 |                 "actionCard": {
81 |                     "title": title,
82 |                     "text": content,
83 |                     "hideAvatar": hide_avatar,
84 |                     "btnOrientation": btn_orientation,
85 |                     "singleTitle": single_title,
86 |                     "singleURL": single_url
87 |                 },
88 |                 "msgtype": "actionCard"
89 |             }
90 |             res = self.html_downloader.download(
91 |                 self.api_url + self.access_token,
92 |                 method='POST',
93 |                 data=json.dumps(data))
94 |             return res
95 |         except Exception as e:
96 |             raise CrwyExtendException(e)
97 | 
98 |     def send_feed_card(self, links):
99 |         """
100 | 
101 |         :param links: array[{'title':'', 'messageURL':'', 'picURL':''}]
102 |         :return:
103 |         """
104 |         try:
105 |             data = {
106 |                 "feedCard": {
107 |                     "links": links
108 |                 },
109 |                 "msgtype": "feedCard"
110 |             }
111 |             res = self.html_downloader.download(
112 |                 self.api_url + self.access_token,
113 |                 method='POST',
114 |                 data=json.dumps(data))
115 |             return res
116 |         except Exception as e:
117 |             raise CrwyExtendException(e)
118 | 
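A minimal usage sketch (hypothetical webhook token, obtained from the DingTalk group-robot settings):

    robot = DingDingRobot(access_token='your-webhook-token')
    robot.send_text('crawl finished', at_mobiles=['13800000000'])
    robot.send_markdown('report', '# daily crawl\n- ok: 100\n- failed: 2')

-------------------------------------------------------------------------------- /crwy/utils/extend/xunma.py: -------------------------------------------------------------------------------- 1 | #! 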
/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | """ 4 | @author: wuyue 5 | @contact: wuyue92tree@163.com 6 | @software: IntelliJ IDEA 7 | @file: xunma.py 8 | @create at: 2018-09-14 11:41 9 | 10 | 这一行开始写关于本文件的说明与解释 11 | """ 12 | 13 | 14 | from __future__ import print_function, unicode_literals 15 | 16 | from crwy.spider import Spider 17 | from crwy.exceptions import CrwyExtendException 18 | 19 | 20 | class XunMa(Spider): 21 | def __init__(self, username, password, item_id): 22 | super(XunMa, self).__init__() 23 | if username and password and item_id: 24 | self.username = username 25 | self.password = password 26 | self.item_id = item_id 27 | else: 28 | raise CrwyExtendException("[XunMa] params not valid.") 29 | 30 | def login(self): 31 | """ 32 | XunMa 登录 33 | :return: 登录token 34 | """ 35 | try: 36 | url = "http://xapi.xunma.net/Login?uName={username}" \ 37 | "&pWord={password}&Code=UTF8".format(username=self.username, 38 | password=self.password) 39 | res = self.html_downloader.download(url) 40 | 41 | return res.text.strip().split("&")[0] 42 | except Exception as e: 43 | raise CrwyExtendException(e) 44 | 45 | def get_phone(self, token, phone_type='', phone=''): 46 | """ 47 | 获取手机号 48 | :param token: 登录token 49 | :param phone_type: 运营商 1 [移动] 2 [联通] 3 [电信] 50 | :param phone: 指定号码 51 | :return: 手机号码 52 | """ 53 | try: 54 | url = "http://xapi.xunma.net/getPhone?ItemId=" \ 55 | "{item_id}&token={token}&" \ 56 | "PhoneType={phone_type}&Code=UTF8&" \ 57 | "Phone={phone}".format(token=token, item_id=self.item_id, 58 | phone_type=phone_type, phone=phone) 59 | 60 | res = self.html_downloader.download(url) 61 | return res.text.strip().split(';')[0] 62 | 63 | except Exception as e: 64 | raise CrwyExtendException(e) 65 | 66 | def get_message(self, token, phone): 67 | """ 68 | 获取短信消息 69 | :param token: 登录token 70 | :param phone: 手机号 71 | :return: 72 | """ 73 | try: 74 | # http://xapi.xunma.net/getMessage?token=登陆token&itemId=项目ID&phone=手机号码 75 | url = "http://xapi.xunma.net/getMessage?" \ 76 | "token={token}&itemId={item_id}&phone={phone}" \ 77 | "&Code=UTF8".format(token=token, 78 | item_id=self.item_id, phone=phone) 79 | res = self.html_downloader.download(url) 80 | 81 | return res.text.strip().split('&')[-1] 82 | 83 | except Exception as e: 84 | raise CrwyExtendException(e) 85 | 86 | def release_phone(self, token, phone): 87 | try: 88 | # http://xapi.xunma.net/releasePhone?token=登陆token&phoneList=phone-itemId;phone-itemId; 89 | url = "http://xapi.xunma.net/releasePhone?" \ 90 | "token={token}&phoneList={phone};" \ 91 | "&Code=UTF8".format(token=token, phone=phone) 92 | self.html_downloader.download(url) 93 | 94 | except Exception as e: 95 | raise CrwyExtendException(e) 96 | 97 | def add_black(self, token, phone): 98 | try: 99 | url = "http://xapi.xunma.net/addBlack?" \ 100 | "token={token}&phoneList={phone};" \ 101 | "&Code=UTF8".format(token=token, phone=phone) 102 | self.html_downloader.download(url) 103 | 104 | except Exception as e: 105 | raise CrwyExtendException(e) 106 | -------------------------------------------------------------------------------- /crwy/utils/extend/yima.py: -------------------------------------------------------------------------------- 1 | #! 
/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | """ 4 | @author: wuyue 5 | @contact: wuyue92tree@163.com 6 | @software: PyCharm 7 | @file: yima.py 8 | @create at: 2017-10-27 09:57 9 | 10 | 这一行开始写关于本文件的说明与解释 11 | """ 12 | 13 | from __future__ import print_function, unicode_literals 14 | 15 | from crwy.spider import Spider 16 | from crwy.exceptions import CrwyException 17 | 18 | 19 | class YiMa(Spider): 20 | def __init__(self, username, password, item_id): 21 | super(YiMa, self).__init__() 22 | if username and password and item_id: 23 | self.username = username 24 | self.password = password 25 | self.item_id = item_id 26 | else: 27 | raise CrwyException("[YiMa] params not valid.") 28 | 29 | def login(self): 30 | """ 31 | YiMa 登录 32 | :return: 登录token 33 | """ 34 | 35 | try: 36 | url = "http://api.fxhyd.cn/UserInterface.aspx?" \ 37 | "action=login&username={username}" \ 38 | "&password={password}".format(username=self.username, 39 | password=self.password) 40 | res = self.html_downloader.download(url) 41 | 42 | if 'success' not in res.text: 43 | raise CrwyException("[YiMa] Login failed.") 44 | 45 | return res.text.strip().split("|")[-1] 46 | except Exception as e: 47 | raise CrwyException(e) 48 | 49 | def get_phone(self, token, phone_type='', 50 | phone='', not_prefix=''): 51 | """ 52 | 获取手机号 53 | :param token: 登录token 54 | :param phone_type: 运营商 1 [移动] 2 [联通] 3 [电信] 55 | :param phone: 指定号码 56 | :param not_prefix: 不要号段 (例子:notPrefix=170.177 ,代表不获取170和177的号段) 57 | :return: 手机号码 58 | """ 59 | try: 60 | url = "http://api.fxhyd.cn/UserInterface.aspx?action=getmobile&" \ 61 | "token={token}&itemid={item_id}&excludeno=" \ 62 | "{not_prefix}&isp={phone_type}&" \ 63 | "mobile={phone}".format(token=token, item_id=self.item_id, 64 | not_prefix=not_prefix, 65 | phone_type=phone_type, phone=phone) 66 | 67 | res = self.html_downloader.download(url) 68 | if 'success' not in res.text: 69 | raise CrwyException("[YiMa] get phone failed.") 70 | 71 | # print(res.text) 72 | return res.text.strip().split('|')[-1] 73 | 74 | except Exception as e: 75 | raise CrwyException(e) 76 | 77 | def get_message(self, token, phone): 78 | """ 79 | 获取短信消息 80 | :param token: 登录token 81 | :param phone: 手机号 82 | :return: 83 | """ 84 | try: 85 | url = "http://api.fxhyd.cn/UserInterface.aspx?action=getsms&" \ 86 | "token={token}&itemid={item_id}&mobile={phone}" \ 87 | "&release=0".format(token=token, item_id=self.item_id, 88 | phone=phone) 89 | res = self.html_downloader.download(url) 90 | 91 | if 'success' not in res.text: 92 | raise CrwyException("[YiMa] get message failed.") 93 | 94 | else: 95 | return res.text.strip().split('|')[-1] 96 | 97 | except Exception as e: 98 | raise CrwyException(e) 99 | 100 | def release_phone(self, token, phone): 101 | try: 102 | url = "http://api.fxhyd.cn/UserInterface.aspx?action=release&" \ 103 | "token={token}&itemid={item_id}&mobile={phone}" \ 104 | "&release=0".format(token=token, item_id=self.item_id, 105 | phone=phone) 106 | res = self.html_downloader.download(url) 107 | 108 | if 'success' not in res.text: 109 | raise CrwyException("[YiMa] release phone failed.") 110 | 111 | except Exception as e: 112 | raise CrwyException(e) 113 | 114 | def add_black(self, token, phone): 115 | try: 116 | url = "http://api.fxhyd.cn/UserInterface.aspx?action=addignore&" \ 117 | "token={token}&itemid={item_id}&mobile={phone}" \ 118 | "&release=0".format(token=token, item_id=self.item_id, 119 | phone=phone) 120 | res = self.html_downloader.download(url) 121 | 122 | if 'success' not in res.text: 123 | raise 
CrwyException("[YiMa] black phone failed.") 124 | 125 | except Exception as e: 126 | raise CrwyException(e) 127 | -------------------------------------------------------------------------------- /crwy/utils/filter/RedisSet.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # author: wuyue92tree@163.com 4 | 5 | from crwy.utils.no_sql.redis_m import get_redis_client 6 | 7 | 8 | class RedisSet(object): 9 | """Simple Deduplicate with Redis Backend""" 10 | 11 | def __init__(self, name, namespace='deduplicate', server=None, 12 | **redis_kwargs): 13 | """ 14 | The default connection parameters are: 15 | host='localhost', port=6379, db=0 16 | """ 17 | if server: 18 | self.__db = server 19 | else: 20 | self.__db = get_redis_client(**redis_kwargs) 21 | self.key = '%s:%s' % (namespace, name) 22 | 23 | def sadd(self, item): 24 | """Add item.""" 25 | if self.__db.sadd(self.key, item) == 0: 26 | return False 27 | else: 28 | return True 29 | 30 | def srem(self, item): 31 | """Del item.""" 32 | if self.__db.srem(self.key, item) == 0: 33 | return False 34 | else: 35 | return True 36 | 37 | def scard(self): 38 | """Return total count.""" 39 | return self.__db.scard(self.key) 40 | 41 | def smembers(self): 42 | """Return all item.""" 43 | return self.__db.smembers(self.key) 44 | 45 | def clean(self): 46 | """Empty key""" 47 | return self.__db.delete(self.key) 48 | 49 | def db(self): 50 | return self.__db 51 | -------------------------------------------------------------------------------- /crwy/utils/filter/RedisSortedSet.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # author: wuyue92tree@163.com 4 | 5 | from crwy.utils.no_sql.redis_m import get_redis_client 6 | 7 | 8 | class RedisSortedSet(object): 9 | """Simple Sorted Deduplicate with Redis Backend""" 10 | 11 | def __init__(self, name, namespace='deduplicate_sorted', server=None, 12 | **redis_kwargs): 13 | """ 14 | The default connection parameters are: 15 | host='localhost', port=6379, db=0 16 | """ 17 | if server: 18 | self.__db = server 19 | else: 20 | self.__db = get_redis_client(**redis_kwargs) 21 | self.key = '%s:%s' % (namespace, name) 22 | 23 | def zadd(self, score, item): 24 | """Add item.""" 25 | if self.__db.zadd(self.key, score, item) == 0: 26 | return False 27 | else: 28 | return True 29 | 30 | def zrem(self, item): 31 | """Del item.""" 32 | if self.__db.zrem(self.key, item) == 0: 33 | return False 34 | else: 35 | return True 36 | 37 | def zcard(self): 38 | """Return total count.""" 39 | return self.__db.zcard(self.key) 40 | 41 | def zscore(self, item): 42 | """Return item score.""" 43 | return self.__db.zscore(self.key, item) 44 | 45 | def zmembers(self): 46 | """Return all item.""" 47 | return self.__db.zmembers(self.key) 48 | 49 | def clean(self): 50 | """Empty key""" 51 | return self.__db.delete(self.key) 52 | 53 | def db(self): 54 | return self.__db 55 | -------------------------------------------------------------------------------- /crwy/utils/filter/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wuyue92tree/crwy/da19e3a7615990612e41d0d871345075150b88f6/crwy/utils/filter/__init__.py -------------------------------------------------------------------------------- /crwy/utils/html/__init__.py: -------------------------------------------------------------------------------- 1 
| 2 | -------------------------------------------------------------------------------- /crwy/utils/html/font_analysis.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | """ 4 | @author: wuyue 5 | @contact: wuyue92tree@163.com 6 | @software: IntelliJ IDEA 7 | @file: font_analysis.py 8 | @create at: 2018-08-22 19:42 9 | 10 | 这一行开始写关于本文件的说明与解释 11 | 12 | 本工具类适用于58同城,其他站点可将该类作为基础类进行扩展 13 | 14 | 思路: 15 | 16 | 1. 获取web页面内容; 17 | 2. 获取字体文件; 18 | 3. 获取字体xml文件,解析出经过自定义的文字; 19 | 4. 获取文字value的hash值(经测试发现,文字的key、value对应关系每次请求都是变化的,而不可能 20 | 改变的是字的value,所以这里通过文字value的hash值来确定,是哪一个字,反推确定页面上的字符对应的 21 | 字是什么。) 22 | 23 | like: 24 | 25 | '77880914931fb6dda97269a9156404745f609d35': '黄' 26 | 27 | hash值与文字对应mapping需要人工,通过字体软件对应 推荐 fontforge 28 | 29 | 5. 通过人工确认的mapping,找到页面上字符与真实字体的对应关系; 30 | 6. 替换原始页面中的字符 31 | 32 | """ 33 | 34 | import base64 35 | import hashlib 36 | # import json 37 | import os 38 | import re 39 | import uuid 40 | from fontTools.ttLib import TTFont 41 | from crwy.spider import BaseSpider 42 | 43 | 44 | class FontAnalysis(BaseSpider): 45 | def __init__(self, html=None): 46 | super(FontAnalysis, self).__init__() 47 | uid = str(uuid.uuid1()) 48 | self.font_path = './data/font/font-{}.woff'.format(uid) 49 | self.xml_path = './data/xml/font-{}.xml'.format(uid) 50 | self.html = html if html else self.get_test_html() 51 | 52 | def get_test_html(self): 53 | # 58简历页 54 | url = 'https://bj.58.com/qzyewu/pn2/?PGTID=0d303353-0000-1188-7c8a-829b2b71d0e8&ClickID=2' 55 | 56 | res = self.html_downloader.download(url) 57 | 58 | return res.text 59 | 60 | def save_font(self): 61 | """ 62 | 保存字体 63 | :return: 64 | """ 65 | base64string = re.search('(?<=base64,).*?(?=\))', self.html).group() 66 | bin_data = base64.b64decode(base64string) 67 | with open(self.font_path, 'wb') as f: 68 | f.write(bin_data) 69 | 70 | def get_font_xml(self): 71 | """ 72 | 获取字体 xml 73 | :return: 74 | """ 75 | font = TTFont(self.font_path) 76 | font.saveXML(self.xml_path) 77 | 78 | def analysis(self, is_clean=True, debug=False): 79 | """ 80 | 解析xml,获取web页面字符与文字key及文字value hash值的对应关系 81 | :param is_clean: 是否清楚字体文件及字体xml文件 82 | :param debug: 是否终端输出字体对照关系 83 | :return: 84 | """ 85 | 86 | self.save_font() 87 | self.get_font_xml() 88 | with open(self.xml_path, 'rb') as xml: 89 | soups = self.html_parser.parser(xml.read()) 90 | ttglyph_lst = soups.find('glyf').find_all('ttglyph')[1:] 91 | map_lst = soups.find('cmap').find_all('map') 92 | map_dict = {} 93 | for map in map_lst: 94 | map_dict[map.get('name')] = map.get('code') 95 | # print(map_dict, len(map_dict)) 96 | analysis_res = [] 97 | for ttglyph in ttglyph_lst: 98 | analysis_dict = dict() 99 | analysis_dict['ttglyph_name'] = ttglyph.get('name') 100 | # analysis_dict['html_name'] = '&#x{};'.format( 101 | # analysis_dict['ttglyph_name'][3:].lower()) 102 | # x_distance = str(int(ttglyph.get('xmax')) - int(ttglyph.get('xmin'))) 103 | # y_distance = str(int(ttglyph.get('ymax')) - int(ttglyph.get('ymin'))) 104 | analysis_dict['html_name'] = '&#x{};'.format( 105 | map_dict.get(ttglyph.get('name'))[2:].upper()) 106 | ttglyph_value = [] 107 | contour_lst = ttglyph.find_all('contour') 108 | for contour in contour_lst: 109 | pt_lst = contour.find_all('pt') 110 | for pt in pt_lst[:1]: 111 | tmp = str(int(pt.get('x')) - int(pt.get('y'))) 112 | # pt['y'] = str(int(y_distance) - int(pt.get('y'))) 113 | ttglyph_value.append(tmp) 114 | analysis_dict['ttglyph_value'] = str(sorted(ttglyph_value)) 115 | 
analysis_dict['font_hash'] = hashlib.sha1( 116 | analysis_dict['ttglyph_value'].encode('utf-8') 117 | ).hexdigest() 118 | analysis_res.append(analysis_dict) 119 | 120 | if debug: 121 | print(analysis_dict['ttglyph_name'], # 字体key 122 | analysis_dict['html_name'], # web页面显示值 123 | analysis_dict['font_hash'], # 字体内容哈希值 124 | analysis_dict['ttglyph_value']) 125 | 126 | if is_clean: 127 | os.remove(self.font_path) 128 | os.remove(self.xml_path) 129 | 130 | return analysis_res 131 | 132 | @staticmethod 133 | def get_real_font_mapping(analysis_res, font_mapping, debug=False): 134 | real_font_mapping = dict() 135 | for item in analysis_res: 136 | real_font_mapping[item['html_name']] = font_mapping[ 137 | item['font_hash']] 138 | if debug is True: 139 | print(item['html_name'], font_mapping[item['font_hash']]) 140 | 141 | return real_font_mapping 142 | 143 | @staticmethod 144 | def recover_html(html, real_font_mapping): 145 | for k, v in real_font_mapping.items(): 146 | html = html.replace(k, v) 147 | return html 148 | 149 | 150 | font_mapping = { 151 | '7fd63556d48347cd5a50007b3151e2735f93bed2': '', 152 | 'ed465eefa32423091781b4cf7136d16d3ebce463': '技', 153 | 'f9e740d4af46806fd75ab69783555c87f6ec7706': '6', 154 | '50365252b61dbe2651e0c83bebc8d00ef763a158': '经', 155 | 'ecc7ed15aa268e5a699eb8ddbe73ad2b27911ee1': '王', 156 | '891090dd6e752593d367a61dc3891f1cb110f0dc': '应', 157 | '6aea2037fa2d83b11da6dcc837443c7b2a9be22e': '专', 158 | '7138fba5e0f9093c696c20ac994857385f257c1e': '赵', 159 | '2609770c8afd922eb37758dd8828db1b566c7fd6': '李', 160 | '02c3341d2d8085eded8233ece3f54b6540322eac': '以', 161 | '75859289bdf9b78f1842ed692daf445f403e2b88': '吴', 162 | 'f9dcd3c88958fc85f6fbf770532929d3b3891a53': '女', 163 | 'e83dc230a3f59d5361bbcdc82973529c6fcbf443': '杨', 164 | '8371e4d560301720541aa2b18c92d7624ff11082': '7', 165 | '19abe86dc73d03989b6e2c9ba3e86d05f187a3c6': '5', 166 | 'fdb060ded208610d1923ff00a5cee237a021be83': '张', 167 | 'b032add2a6287c5d6ab051bef17e37c45d714f40': 'B', 168 | '5b262e3ff34a8ec29d9b5b271e0de2396de260ce': '本', 169 | '512adeb01f06bca832fd2a6dea974f09029edeeb': '男', 170 | '6f15d67b48b66a140ca1858aae74897c4a5a79f6': '博', 171 | 'b2f5589957afac462f1620e6647e7c33f05dce96': '3', 172 | 'e24ed92db7331e919161e6d2961d35ccaae0593a': '无', 173 | '3cd650e51bbe8268ffb4ec2ab9537937eddac0dd': '9', 174 | 'e1eb5abd0f77e1c2627a50eda8a4765c4be7a606': '生', 175 | '04783616ad232d7ad4d886876f4eaaf5a0bbb580': '验', 176 | '374446ca738266d7c1da2a551f15c54cdd12e460': '8', 177 | 'e2af36b0e0add44124e65c78e0bb388a91da5373': '下', 178 | '679f2fd459c646e6e1668938a57f7c1a248f806d': '科', 179 | '6924fddd64c781721e91d3797742761616b58532': '1', 180 | 'c81c0fe74c783fdc90d8ff7bcb801ac73646f5bf': '4', 181 | '07aa1ed8e3c9b283fd20a9942fd81d925fb49de2': 'M', 182 | '968f86f893ff4e01575e3acc1e61c940b424d479': '中', 183 | '7b9ef322a4ea16ddd46ce3afbf9ae7265b638947': '硕', 184 | '5ac2c4226b898a1cb133c1cd012506399c291253': '届', 185 | 'e51700563d6f6c7c82def8c408b972ae28870a7e': '2', 186 | '44b8aeb98a4556a2d9e0121848d1ce5ddd1cf820': '刘', 187 | 'b1df1706be1e25ce632329d0967bc4d87de7a0ce': '士', 188 | 'b412e5268df2f25d418a5547c902e0794ed33a7c': '陈', 189 | 'c0fdac37111eee42dde8f1481eddf803c1c6eafd': '高', 190 | '859502b5860c99ec88061eb60dfc0c2af03e1778': '大', 191 | 'afd6a61616dbb4215b9327194e177e6caa71ace9': '0', 192 | 'f4d6418264b302bc9d70693b5cc3d4a7da01a445': 'E', 193 | 'b74117c62c7ff7ae79a1d01dfe0baa21874d3ae8': '周', 194 | '9372bfdc75a272eceb0f4fbf8bd5c86bc21b4b1d': 'A', 195 | '9e2c166141627880782747ba69b9972885ca798a': '校', 196 | 
'f38c4fe982d55a9bcb4460b82db43f995bc5a992': '黄' 197 | } 198 | 199 | # def main(): 200 | # runner = FontAnalysis() 201 | # analysis_res = runner.analysis(debug=True) 202 | # real_font_mapping = runner.get_real_font_mapping(analysis_res, font_mapping) 203 | # print(real_font_mapping) 204 | # 205 | # real_html = runner.recover_html(html=runner.html, 206 | # real_font_mapping=real_font_mapping) 207 | # print(real_html) 208 | # 209 | # 210 | # if __name__ == '__main__': 211 | # main() 212 | -------------------------------------------------------------------------------- /crwy/utils/html/html_downloader.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # author: wuyue92tree@163.com 4 | 5 | import requests 6 | 7 | 8 | class HtmlDownloader(object): 9 | """ 下载器 """ 10 | 11 | def __init__(self): 12 | self.session = requests.session() 13 | 14 | def download(self, url, method='GET', single=False, timeout=60, 15 | **kwargs): 16 | """ 17 | 请求页面 18 | :param url: 目标地址 19 | :param method: 请求方式 20 | :param single: 为True时,不使用session 21 | :param timeout: 初始化超时时间 22 | :param kwargs: 更多requests参数 23 | :return: 返回requests session对象 24 | """ 25 | 26 | if single is False: 27 | if method.upper() == 'GET': 28 | return self.session.get(url, timeout=timeout, **kwargs) 29 | return self.session.post(url, timeout=timeout, **kwargs) 30 | else: 31 | if method.upper() == 'GET': 32 | return requests.get(url, timeout=timeout, **kwargs) 33 | return requests.post(url, timeout=timeout, **kwargs) 34 | 35 | def download_file(self, url, method='GET', single=False, timeout=180, 36 | save_path='./data/', file_name=None, **kwargs): 37 | """ 38 | 请求文件 39 | :param url: 目标地址 40 | :param method: 请求方式 41 | :param single: 为True时,不使用session 42 | :param timeout: 初始化超时时间 43 | :param save_path: 保存路径 44 | :param file_name: 文件名称,默认为空 45 | :param kwargs: 更多requests参数 46 | :return: 返回保存路径 47 | """ 48 | if not file_name: 49 | file_name = url.split('/')[-1] 50 | tmp = self.download(url, method=method, single=single, 51 | timeout=timeout, 52 | stream=True, **kwargs) 53 | with open(save_path + file_name, 'wb') as f: 54 | for chunk in tmp.iter_content(chunk_size=1024): 55 | if chunk: # filter out keep-alive new chunks 56 | f.write(chunk) 57 | f.flush() 58 | f.close() 59 | return save_path + file_name 60 | -------------------------------------------------------------------------------- /crwy/utils/html/html_parser.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import sys 4 | from bs4 import BeautifulSoup 5 | try: 6 | import PyV8 7 | except ImportError: 8 | pass 9 | 10 | 11 | class HtmlParser(object): 12 | """ 解析器 """ 13 | @staticmethod 14 | def parser(response): 15 | """ 16 | utf-8字符处理 17 | :param response: 待处理字符串 18 | :return: 返回bs对象 19 | """ 20 | if response is None: 21 | return 22 | 23 | if sys.version_info < (3, ): 24 | soup = BeautifulSoup(str(response), 'html.parser', 25 | from_encoding='utf-8') 26 | else: 27 | soup = BeautifulSoup(str(response), 'html.parser') 28 | 29 | return soup 30 | 31 | @staticmethod 32 | def gbk_parser(response): 33 | """ 34 | gbk字符处理 35 | :param response: 待处理字符串 36 | :return: 返回bs对象 37 | """ 38 | if response is None: 39 | return 40 | 41 | if sys.version_info < (3, ): 42 | soup = BeautifulSoup(str(response), 'html.parser', 43 | from_encoding='gb18030') 44 | else: 45 | soup = BeautifulSoup(str(response), 'html.parser') 46 | 47 | return soup 48 | 49 | 
@staticmethod
50 |     def jsonp_parser(data):
51 |         """
52 |         Normalize non-standard JSON such as {a:1, b:1},
53 |         where the keys are not quoted strings.
54 |         :param data: string to normalize
55 |         :return: standard JSON string
56 |         """
57 |         # NOTE: requires PyV8; the import at the top of this module is
58 |         # wrapped in try/except, so this raises NameError if PyV8 is missing
59 |         ctx = PyV8.JSContext()
60 |         ctx.enter()
61 |         ctx.eval("""
62 |         function func() {
63 |             var data = """ + data + """;
64 |             var json_data = JSON.stringify(data);
65 |             return json_data;
66 |         }
67 |         """)
68 |         return ctx.locals.func()
69 | 
-------------------------------------------------------------------------------- /crwy/utils/load_settings.py: --------------------------------------------------------------------------------
1 | #! /usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | """
4 | @author: wuyue
5 | @contact: wuyue92tree@163.com
6 | @software: PyCharm
7 | @file: load_settings.py
8 | @create at: 2018-06-20 19:32
9 | 
10 | Load crawler settings from a Consul KV store.
11 | """
12 | 
13 | import consul
14 | from crwy.exceptions import CrwyException
15 | 
16 | 
17 | class LoadSettingsFromConsul(object):
18 |     def __init__(self, **kwargs):
19 |         self.c = consul.Consul(**kwargs)
20 |         self.main_key = None
21 | 
22 |     def init_main_key(self, key=None):
23 |         if not key:
24 |             raise CrwyException('Please set key first.')
25 |         self.main_key = key
26 | 
27 |     def _get_settings(self, key=None):
28 |         self.init_main_key(key=key)
29 |         index, data = self.c.kv.get(self.main_key, recurse=True)
30 |         if not data:
31 |             raise CrwyException('Please make sure the key: <%s> exists.' %
32 |                                 self.main_key)
33 | 
34 |         # NOTE: values are eval()'d as Python literals, so the KV store
35 |         # contents must be trusted
36 |         new_data = {
37 |             item.get('Key').split('/')[-1]: eval(item.get('Value'))
38 |             for item in data
39 |         }
40 | 
41 |         return new_data
42 | 
43 |     @classmethod
44 |     def get_settings(cls, key=None, **kwargs):
45 |         load_settings = cls(**kwargs)
46 |         return load_settings._get_settings(key=key)
47 | 
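A minimal usage sketch (assumes a reachable Consul agent and values stored under the key as Python literals, since they are eval()'d; host/port are forwarded to consul.Consul()):

    settings = LoadSettingsFromConsul.get_settings(
        key='crawler/settings', host='127.0.0.1', port=8500)
    print(settings.get('REDIS_URL'))

-------------------------------------------------------------------------------- /crwy/utils/logger.py: -------------------------------------------------------------------------------- 1 | #! 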
/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # Author: wuyue 4 | # Email: wuyue92tree@163.com 5 | 6 | 7 | import os 8 | import sys 9 | import logging 10 | import logging.config 11 | import logging.handlers 12 | from crwy.exceptions import CrwyException 13 | from crwy.settings.default_settings import TEMPLATE_DIR 14 | 15 | try: 16 | import ConfigParser as configparser 17 | except ImportError: 18 | import configparser 19 | 20 | DEFAULT_LOGGER_CONF = './conf/logger.conf' 21 | 22 | if sys.version_info[0] == 2: 23 | BASE_LOGGER_CONF = os.path.join( 24 | TEMPLATE_DIR, 'project/logger_py2.conf.tmpl') 25 | else: 26 | BASE_LOGGER_CONF = os.path.join( 27 | TEMPLATE_DIR, 'project/logger_py3.conf.tmpl') 28 | 29 | try: 30 | try: 31 | logging.config.fileConfig(DEFAULT_LOGGER_CONF) 32 | except KeyError: 33 | logging.config.fileConfig(BASE_LOGGER_CONF) 34 | except: 35 | pass 36 | 37 | 38 | def _install_handlers_custom(cp, formatters, log_path): 39 | """Install and return handlers""" 40 | hlist = cp.get("handlers", "keys") 41 | if not len(hlist): 42 | return {} 43 | hlist = hlist.split(",") 44 | hlist = logging.config._strip_spaces(hlist) 45 | handlers = {} 46 | fixups = [] # for inter-handler references 47 | for hand in hlist: 48 | sectname = "handler_%s" % hand 49 | klass = cp.get(sectname, "class") 50 | opts = cp.options(sectname) 51 | if "formatter" in opts: 52 | fmt = cp.get(sectname, "formatter") 53 | else: 54 | fmt = "" 55 | try: 56 | klass = eval(klass, vars(logging)) 57 | except (AttributeError, NameError): 58 | klass = logging.config._resolve(klass) 59 | args = cp.get(sectname, "args") 60 | args = eval(args, vars(logging)) 61 | 62 | # 修改args中的path参数 63 | if isinstance(args[0], str): 64 | args = tuple([log_path] + list(args)[1:]) 65 | 66 | h = klass(*args) 67 | 68 | if "level" in opts: 69 | level = cp.get(sectname, "level") 70 | try: 71 | h.setLevel(logging._levelNames[level]) 72 | except AttributeError: 73 | h.setLevel(logging._nameToLevel[level]) 74 | if len(fmt): 75 | h.setFormatter(formatters[fmt]) 76 | if issubclass(klass, logging.handlers.MemoryHandler): 77 | if "target" in opts: 78 | target = cp.get(sectname, "target") 79 | else: 80 | target = "" 81 | if len(target): 82 | fixups.append((h, target)) 83 | handlers[hand] = h 84 | 85 | for h, t in fixups: 86 | h.setTarget(handlers[t]) 87 | return handlers 88 | 89 | 90 | def fileConfigWithLogPath(fname=BASE_LOGGER_CONF, 91 | log_path=None, 92 | defaults=None, 93 | disable_existing_loggers=True): 94 | """ 95 | 通过拦截重写handler的方式传入log_path,实现日志位置修改 96 | """ 97 | if not log_path: 98 | raise CrwyException('Please setup first!') 99 | 100 | cp = configparser.ConfigParser(defaults) 101 | if hasattr(fname, 'readline'): 102 | cp.read_file(fname) 103 | else: 104 | cp.read(fname) 105 | try: 106 | formatters = logging.config._create_formatters(cp) 107 | except configparser.NoSectionError: 108 | raise CrwyException('Please make sure fname: "%s" is exist.' 
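# (Editor's note: _install_handlers_custom() above mirrors the stdlib
# logging.config._install_handlers(), with one deliberate change --
# when a handler's first positional arg is a string it is replaced by
# log_path, so a single conf template can write per-project log files;
# fileConfigWithLogPath() is the entry point that passes log_path in.)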
% fname) 109 | 110 | logging._acquireLock() 111 | try: 112 | logging._handlers.clear() 113 | del logging._handlerList[:] 114 | # Handlers add themselves to logging._handlers 115 | handlers = _install_handlers_custom(cp, formatters, log_path) 116 | logging.config._install_loggers(cp, handlers, disable_existing_loggers) 117 | finally: 118 | logging._releaseLock() 119 | 120 | 121 | class Logger(object): 122 | @staticmethod 123 | def file_logger(): 124 | return logging.getLogger('fileLogger') 125 | 126 | @staticmethod 127 | def rt_logger(): 128 | return logging.getLogger('rtLogger') 129 | 130 | @staticmethod 131 | def timed_rt_logger(): 132 | return logging.getLogger('timedRtLogger') 133 | 134 | @staticmethod 135 | def extra_logger(name=None): 136 | return logging.getLogger(name) 137 | 138 | -------------------------------------------------------------------------------- /crwy/utils/mail.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # Author: wuyue 4 | # Email: wuyue92tree@163.com 5 | 6 | 7 | from __future__ import print_function, unicode_literals 8 | 9 | import email 10 | import re 11 | import traceback 12 | import imaplib 13 | 14 | from imapclient import IMAPClient 15 | from email.header import decode_header 16 | 17 | imaplib._MAXLINE = 10000000 18 | 19 | SEEN = br'\Seen' 20 | 21 | 22 | class MailReceiver(IMAPClient): 23 | def __init__(self, host, timeout=60, **kwargs): 24 | super(MailReceiver, self).__init__(host, timeout=timeout, **kwargs) 25 | 26 | def get_folder_list(self): 27 | """ 28 | 获取邮箱文件夹 29 | :return: list 30 | """ 31 | folders = self.list_folders() 32 | res_list = [] 33 | for folder in folders: 34 | if folder: 35 | # print(folder[2].encode("utf-8")) 36 | res_list.append(folder[2]) 37 | return res_list 38 | 39 | def get_message_id_list(self, mailbox='INBOX', search_='all'): 40 | """ 41 | 获取邮件ID列表 42 | :param mailbox: 邮箱文件夹 43 | :param search_: 搜索规则 44 | :return: list 45 | """ 46 | self.select_folder(mailbox) 47 | # message_list = self.server.search('ON 21-Dec-2017') 48 | message_list = self.search(search_) 49 | return message_list 50 | 51 | def get_message_list(self, message_id_list): 52 | """ 53 | 获取邮件列表 54 | :param message_id_list: 邮件ID列表 55 | :return: dict id:email 56 | """ 57 | message_list = self.fetch( 58 | message_id_list, ['INTERNALDATE', 'FLAGS', 'BODY.PEEK[]']) 59 | if not message_list: 60 | return 61 | 62 | return message_list 63 | 64 | @staticmethod 65 | def parse_email(m, flag=None): 66 | """ 67 | 解析邮件header内容 68 | :param m: 原内容 69 | :param flag: 解析类型标识 70 | :return: 编码转换后内容 71 | """ 72 | res = [] 73 | try: 74 | for s, c in decode_header(m): 75 | if c: 76 | res.append(s.decode(c, 'ignore')) 77 | else: 78 | res.append(s.decode('utf-8') if isinstance(s, bytes) else s) 79 | 80 | if not res: 81 | return 82 | 83 | # 处理邮件发送方 返回邮箱地址 84 | if flag == 'from': 85 | res = re.findall( 86 | '[0-9a-zA-Z_\.]{0,19}@[0-9a-zA-Z\.]{1,100}', res[1]) 87 | return res[0] 88 | 89 | # 处理邮件接收方 返回邮箱地址列表 90 | if flag == 'to': 91 | new_res = [] 92 | for e in res[0].split(','): 93 | em = re.findall('[0-9a-zA-Z_\.]{0,19}@[0-9a-zA-Z\.]{1,100}', e) 94 | if em: 95 | new_res.append(em[0]) 96 | 97 | return new_res 98 | 99 | return res[0] 100 | except Exception as e: 101 | traceback.print_exc() 102 | return res 103 | 104 | def get_message_content(self, message): 105 | """ 106 | 获取邮件内容 107 | :param message: 108 | :return: 109 | """ 110 | try: 111 | while True: 112 | res = {} 113 | msg = 
email.message_from_bytes(message[b'BODY[]'])
114 |                 res['subject'] = self.parse_email(msg['Subject'])
115 |                 res['from'] = self.parse_email(msg['From'], flag='from')
116 |                 res['to'] = self.parse_email(msg['To'], flag='to')
117 |                 res['date'] = self.parse_email(msg['Date'])
118 | 
119 |                 for par in msg.walk():
120 |                     if not par.is_multipart():
121 |                         name = par.get_param("name")
122 |                         if name:
123 |                             # print(name)
124 |                             pass
125 |                         else:
126 |                             body = par.get_payload(decode=True)
127 |                             if not body:
128 |                                 continue
129 |                             try:
130 |                                 code = par.get_content_charset()
131 |                                 res['body'] = body.decode(code, 'ignore')
132 |                             except TypeError:
133 |                                 res['body'] = body
134 |                 return res
135 | 
136 |         except Exception as e:
137 |             traceback.print_exc()
138 |             return
139 | 
140 |     def delete_message(self, messages, deleted_folder="Deleted Messages"):
141 |         """
142 |         Delete messages.
143 |         :param messages:
144 |         :param deleted_folder:
145 |         :return:
146 |         """
147 |         try:
148 |             self.add_flags(messages, SEEN)
149 |             if deleted_folder:
150 |                 # move the messages into the deleted folder first
151 |                 self.copy(messages, deleted_folder)
152 |             self.delete_messages(messages)
153 |             self.expunge()
154 |             return True
155 |         except Exception:
156 |             traceback.print_exc()
157 |             return False
--------------------------------------------------------------------------------
/crwy/utils/no_sql/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wuyue92tree/crwy/da19e3a7615990612e41d0d871345075150b88f6/crwy/utils/no_sql/__init__.py
--------------------------------------------------------------------------------
/crwy/utils/no_sql/redis_m.py:
--------------------------------------------------------------------------------
1 | #! /usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | """
4 | @author: wuyue
5 | @contact: wuyue92tree@163.com
6 | @software: PyCharm
7 | @file: redis_m.py
8 | @create at: 2017-12-26 14:50
9 | 
10 | Singleton wrapper around a pooled redis connection.
11 | """
12 | 
13 | from crwy.exceptions import CrwyImportException
14 | from crwy.decorates import cls2singleton
15 | 
16 | try:
17 |     import redis
18 | except ImportError:
19 |     raise CrwyImportException(
20 |         "You should install redis plugin first! try: pip install redis")
21 | 
22 | 
23 | @cls2singleton
24 | class RedisDb(object):
25 |     def __init__(self, **kwargs):
26 |         if 'url' in kwargs.keys():
27 |             url = kwargs.pop('url')
28 |             self.pool = redis.ConnectionPool.from_url(url, **kwargs)
29 |         else:
30 |             self.pool = redis.ConnectionPool(**kwargs)
31 |         self.db = redis.StrictRedis(connection_pool=self.pool)
32 | 
33 | 
34 | def get_redis_client(**kwargs):
35 |     r = RedisDb(**kwargs)
36 |     return r.db
--------------------------------------------------------------------------------
/crwy/utils/pyppeteer_api.py:
--------------------------------------------------------------------------------
1 | #!
/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | """ 4 | @author: wuyue 5 | @contact: wuyue92tree@163.com 6 | @software: IntelliJ IDEA 7 | @file: pyppeteer_api.py 8 | @create at: 2019-03-24 17:04 9 | 10 | 这一行开始写关于本文件的说明与解释 11 | """ 12 | 13 | import asyncio 14 | from crwy.spider import Spider 15 | 16 | try: 17 | from pyppeteer import launch 18 | except ImportError: 19 | pass 20 | 21 | 22 | class PyppeteerApi(Spider): 23 | def __init__(self, logger=None, proxy=None, **kwargs): 24 | super(PyppeteerApi, self).__init__(logger=logger) 25 | 26 | 27 | def main(): 28 | executor = PyppeteerApi() 29 | # TODO 30 | 31 | 32 | if __name__ == '__main__': 33 | main() 34 | -------------------------------------------------------------------------------- /crwy/utils/queue/RedisQueue.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # author: wuyue92tree@163.com 4 | 5 | from crwy.utils.no_sql.redis_m import get_redis_client 6 | 7 | 8 | class RedisQueue(object): 9 | """Simple Queue with Redis Backend""" 10 | 11 | def __init__(self, name, namespace='queue', server=None, **redis_kwargs): 12 | """The default connection parameters are: 13 | host='localhost', port=6379, db=0""" 14 | if server: 15 | self.__db = server 16 | else: 17 | self.__db = get_redis_client(**redis_kwargs) 18 | self.key = '%s:%s' % (namespace, name) 19 | 20 | def qsize(self): 21 | """Return the approximate size of the queue.""" 22 | return self.__db.llen(self.key) 23 | 24 | def empty(self): 25 | """Return True if the queue is empty, False otherwise.""" 26 | return self.qsize() == 0 27 | 28 | def put(self, item): 29 | """Put item into the queue.""" 30 | self.__db.rpush(self.key, item) 31 | 32 | def get(self, block=True, timeout=None): 33 | """Remove and return an item from the queue. 34 | 35 | If optional args block is true and timeout is None (the default), block 36 | if necessary until an item is available.""" 37 | if block: 38 | item = self.__db.blpop(self.key, timeout=timeout) 39 | else: 40 | item = self.__db.lpop(self.key) 41 | 42 | if item: 43 | item = item[1] 44 | return item 45 | 46 | def get_nowait(self): 47 | """Equivalent to get(False).""" 48 | return self.get(False) 49 | 50 | def clean(self): 51 | """Empty key""" 52 | return self.__db.delete(self.key) 53 | 54 | def db(self): 55 | return self.__db 56 | -------------------------------------------------------------------------------- /crwy/utils/queue/SsdbQueue.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # author: wuyue92tree@163.com 4 | 5 | import pyssdb 6 | 7 | 8 | class SsdbQueue(object): 9 | """Simple Queue with SSDB Backend""" 10 | 11 | def __init__(self, name, **ssdb_kwargs): 12 | """The default connection parameters are: 13 | host='localhost', port=8888""" 14 | self.__db = pyssdb.Client(**ssdb_kwargs) 15 | self.key = name 16 | 17 | def qsize(self): 18 | """Return the approximate size of the queue.""" 19 | return self.__db.qsize(self.key) 20 | 21 | def empty(self): 22 | """Return True if the queue is empty, False otherwise.""" 23 | return self.qsize() == 0 24 | 25 | def put(self, item): 26 | """Put item into the queue.""" 27 | self.__db.qpush(self.key, item) 28 | 29 | def get(self): 30 | """Remove and return an item from the queue. 
31 | 32 | If optional args block is true and timeout is None (the default), block 33 | if necessary until an item is available.""" 34 | 35 | item = self.__db.qpop(self.key) 36 | 37 | return item 38 | 39 | def clean(self): 40 | """Empty key""" 41 | return self.__db.qclear(self.key) 42 | 43 | def db(self): 44 | return self.__db 45 | -------------------------------------------------------------------------------- /crwy/utils/queue/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wuyue92tree/crwy/da19e3a7615990612e41d0d871345075150b88f6/crwy/utils/queue/__init__.py -------------------------------------------------------------------------------- /crwy/utils/scrapy_plugs/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wuyue92tree/crwy/da19e3a7615990612e41d0d871345075150b88f6/crwy/utils/scrapy_plugs/__init__.py -------------------------------------------------------------------------------- /crwy/utils/scrapy_plugs/dupefilters.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | """ 4 | @author: wuyue 5 | @contact: wuyue92tree@163.com 6 | @software: PyCharm 7 | @file: dupefilters.py 8 | @create at: 2018-06-14 14:52 9 | 10 | 这一行开始写关于本文件的说明与解释 11 | """ 12 | 13 | import logging 14 | import time 15 | import datetime 16 | import hashlib 17 | from crwy.utils.filter.RedisSet import RedisSet 18 | from crwy.utils.filter.RedisSortedSet import RedisSortedSet 19 | from scrapy.dupefilters import BaseDupeFilter 20 | from scrapy.exceptions import NotConfigured 21 | from scrapy_redis.connection import get_redis_from_settings 22 | 23 | logger = logging.getLogger(__name__) 24 | 25 | 26 | class RedisRFPDupeFilter(BaseDupeFilter): 27 | """ 28 | dupefilter by redis, redis connect base on scrapy-redis connect 29 | 30 | warning: 31 | config SPIDER_NAME in settings before use 32 | default: 33 | DUPEFILTER_DEBUG = False 34 | DUPEFILTER_DELAY_DAY = 0 35 | """ 36 | logger = logger 37 | 38 | def __init__(self, debug=False, 39 | server=None, 40 | bot_name=None, 41 | spider_name=None, 42 | duperliter_delay_day=None, 43 | do_hash=None): 44 | self.debug = debug 45 | self.logdupes = True 46 | self.server = server 47 | self.bot_name = bot_name 48 | self.spider_name = spider_name 49 | self.duperliter_delay_day = duperliter_delay_day 50 | self.do_hash = do_hash 51 | 52 | @classmethod 53 | def from_settings(cls, settings): 54 | server = get_redis_from_settings(settings) 55 | debug = settings.getbool('DUPEFILTER_DEBUG') 56 | bot_name = settings.get('BOT_NAME') 57 | spider_name = settings.get('SPIDER_NAME') 58 | duperliter_delay_day = settings.getint('DUPEFILTER_DELAY_DAY', 0) 59 | do_hash = settings.getbool('DUPEFILTER_DO_HASH', True) 60 | if not spider_name: 61 | raise NotConfigured('%s - "SPIDER_NAME" is not found.' 
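# (Editor's note: from_settings() reads BOT_NAME, SPIDER_NAME,
# DUPEFILTER_DEBUG, DUPEFILTER_DELAY_DAY (0 means a seen key is
# filtered forever) and DUPEFILTER_DO_HASH; SPIDER_NAME is the only
# one with no usable default, hence this NotConfigured.)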
% 62 | cls.__name__) 63 | return cls(debug=debug, server=server, bot_name=bot_name, 64 | spider_name=spider_name, 65 | duperliter_delay_day=duperliter_delay_day, 66 | do_hash=do_hash) 67 | 68 | def request_seen(self, request): 69 | if not request.meta.get('dupefilter_key', None): 70 | return False 71 | 72 | if len(request.meta.get('redirect_urls', [])) > 0: 73 | # skip url from redirect 74 | return False 75 | 76 | dupefilter_key = request.meta.get('dupefilter_key') 77 | dupefilter_key = hashlib.sha1(dupefilter_key).hexdigest() if \ 78 | self.do_hash else dupefilter_key 79 | 80 | # SPIDER_NAME for dupefilter 81 | key = '{bot_name}:{spider_name}'.format( 82 | bot_name=self.bot_name, 83 | spider_name=self.spider_name) 84 | 85 | if request.meta.get('duperliter_delay_day', ''): 86 | self.duperliter_delay_day = int(request.meta.get( 87 | 'duperliter_delay_day')) 88 | 89 | if self.duperliter_delay_day == 0: 90 | s = RedisSet(key, server=self.server) 91 | if s.sadd(dupefilter_key) is True: 92 | return False 93 | self.logger.info('Filtered dupefilter_key: %s' % 94 | dupefilter_key) 95 | return True 96 | else: 97 | z = RedisSortedSet(key, server=self.server) 98 | now = time.time() 99 | last_time = z.zscore(dupefilter_key) 100 | 101 | if not last_time: 102 | z.zadd(now, dupefilter_key) 103 | return False 104 | 105 | if (datetime.datetime.utcfromtimestamp(now) - 106 | datetime.datetime.utcfromtimestamp(last_time)).days >= \ 107 | self.duperliter_delay_day: 108 | z.zadd(now, dupefilter_key) 109 | return False 110 | self.logger.info('Filtered dupefilter_key within %s day(s): %s' % 111 | (self.duperliter_delay_day, 112 | request.meta.get('dupefilter_key'))) 113 | return True 114 | 115 | def log(self, request, spider): # log that a request has been filtered 116 | if self.debug: 117 | msg = "Filtered duplicate request: %(request)s" 118 | self.logger.debug(msg, { 119 | 'request': request.meta.get('dupefilter_key')}, extra={ 120 | 'spider': spider}) 121 | elif self.logdupes: 122 | msg = ("Filtered duplicate request: %(request)s" 123 | " - no more duplicates will be shown" 124 | " (see DUPEFILTER_DEBUG to show all duplicates)") 125 | self.logger.debug(msg, {'request': request}, 126 | extra={'spider': spider}) 127 | self.logdupes = False 128 | 129 | spider.crawler.stats.inc_value('dupefilter/filtered', spider=spider) 130 | 131 | 132 | class ReleaseDupefilterKey(object): 133 | """ 134 | rm dupefilter_key from redis, when call response 135 | """ 136 | 137 | def call(self, spider, dupefilter_key): 138 | 139 | if not dupefilter_key: 140 | return 141 | 142 | obj = RedisRFPDupeFilter().from_settings(spider.settings) 143 | 144 | dupefilter_key = hashlib.sha1(dupefilter_key).hexdigest() if \ 145 | obj.do_hash else dupefilter_key 146 | 147 | # SPIDER_NAME for dupefilter 148 | key = '{bot_name}:{spider_name}'.format( 149 | bot_name=obj.bot_name, 150 | spider_name=obj.spider_name) 151 | 152 | if obj.duperliter_delay_day == 0: 153 | s = RedisSet(key, server=obj.server) 154 | s.srem(dupefilter_key) 155 | else: 156 | z = RedisSortedSet(key, server=obj.server) 157 | z.zrem(dupefilter_key) 158 | obj.logger.info('dupefilter_key: {} released.'.format( 159 | dupefilter_key)) 160 | 161 | 162 | release_dupefilter_key = ReleaseDupefilterKey() 163 | -------------------------------------------------------------------------------- /crwy/utils/scrapy_plugs/middlewares.py: -------------------------------------------------------------------------------- 1 | #! 
/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | """ 4 | @author: wuyue 5 | @contact: wuyue92tree@163.com 6 | @software: PyCharm 7 | @file: middlewares.py 8 | @create at: 2018-06-26 18:21 9 | 10 | 这一行开始写关于本文件的说明与解释 11 | 12 | 通过redis hash表记录站点cookie 13 | 14 | key为 cookie_pool:SITE (SITE需要在settings中指定) 15 | field为 账号cookie的唯一标识,可以是username,id等,具体自行约定 16 | value为 cookie值,必须为json格式 17 | 18 | """ 19 | import datetime 20 | import random 21 | from scrapy.downloadermiddlewares.retry import RetryMiddleware 22 | from scrapy.exceptions import IgnoreRequest 23 | from scrapy_redis.connection import get_redis_from_settings 24 | 25 | from crwy.utils.common import datetime2str 26 | from crwy.utils.data.RedisHash import RedisHash 27 | from crwy.exceptions import CrwyScrapyPlugsException, CrwyCookieValidException 28 | from crwy.utils.scrapy_plugs.dupefilters import release_dupefilter_key 29 | 30 | 31 | class CookieMiddleware(RetryMiddleware): 32 | """ 33 | cookie_pool 34 | eg: '{"a": 1, "b": "aaa"}' 35 | """ 36 | 37 | def __init__(self, settings): 38 | super(CookieMiddleware, self).__init__(settings) 39 | self.site = settings.get('SITE', None) 40 | if not self.site: 41 | raise CrwyScrapyPlugsException('SITE_NOT_SET') 42 | 43 | self.server = get_redis_from_settings(settings) 44 | 45 | self.h = RedisHash( 46 | 'cookie_pool:{}'.format(self.site), 47 | server=self.server 48 | ) 49 | 50 | def process_request(self, request, spider): 51 | """ 52 | cookie_user不为空时,获取cookie_user对应的cookie 53 | keep_cookie_user为True时,将设置cookie_user,并传递到子请求 54 | :param request: 55 | :param spider: 56 | :return: 57 | """ 58 | if request.meta.get('cookie_user', ''): 59 | user = request.meta.get('cookie_user') 60 | else: 61 | users = self.h.hkeys() 62 | if len(users) > 0: 63 | user = random.choice(users) 64 | if request.meta.get('keep_cookie_user', False) is True: 65 | request.meta['cookie_user'] = user 66 | else: 67 | raise CrwyScrapyPlugsException( 68 | 'no user in cookie_pool:{}'.format(self.site)) 69 | cookie = self.h.hget(user) 70 | if cookie: 71 | # 字典存入redis,取出时未string,通过eval进行还原 72 | request.cookies = eval(cookie) 73 | spider.logger.debug('get_cookie_success: {}'.format( 74 | user.decode('utf-8'))) 75 | else: 76 | spider.logger.warning('get_cookie_failed: {}'.format( 77 | user.decode('utf-8'))) 78 | 79 | 80 | class LimitCookieMiddleware(CookieMiddleware): 81 | """ 82 | 在cookieMiddleware基础上限制账号 83 | 84 | 1. 每日搜索上限 85 | 通过 update_requests_count method 更新有效请求次数 86 | 2. 
cookie失效标识 87 | 捕捉 CrwyCookieValidException 异常更改标识状态 88 | """ 89 | def __init__(self, settings): 90 | super(LimitCookieMiddleware, self).__init__(settings) 91 | 92 | # 每日搜索上限 93 | self.search_limit = RedisHash( 94 | 'search_limit:{}'.format(self.site), server=self.server) 95 | # cookie失效标识, 1为cookie失效 96 | self.account_limit = RedisHash( 97 | 'account_limit:{}'.format(self.site), server=self.server) 98 | 99 | def get_requests_count(self, request): 100 | user = request.meta.get('cookie_user') 101 | today = datetime2str(datetime.datetime.now(), fmt='%Y-%m-%d') 102 | if not self.search_limit.hget(user): 103 | count = 1 104 | else: 105 | date, count = self.search_limit.hget( 106 | user).decode('utf-8').split('|') 107 | if date == today: 108 | count = int(count) 109 | else: 110 | count = 1 111 | return user, count 112 | 113 | def update_requests_count(self, request, spider): 114 | """ 115 | 请求完毕后添加详情页计数 116 | :param request: 117 | :param spider: 118 | :return: 119 | """ 120 | user, count = self.get_requests_count(request) 121 | today = datetime2str(datetime.datetime.now(), fmt='%Y-%m-%d') 122 | count += 1 123 | self.search_limit.hset(user, '{}|{}'.format(today, count)) 124 | spider.logger.debug('update search_limit: {} {}'.format( 125 | user.decode('utf-8'), count)) 126 | 127 | def _retry(self, request, reason, spider): 128 | callback = super(LimitCookieMiddleware, self)._retry( 129 | request, reason, spider 130 | ) 131 | if not callback: 132 | if isinstance(reason, CrwyCookieValidException): 133 | user = request.meta.get('cookie_user') 134 | self.account_limit.hset(user, 1) 135 | spider.logger.warning('account limit: {} after retry'.format( 136 | user.decode('utf-8'))) 137 | raise IgnoreRequest 138 | else: 139 | return callback 140 | 141 | def process_request(self, request, spider): 142 | super(LimitCookieMiddleware, self).process_request(request, spider) 143 | 144 | user, count = self.get_requests_count(request) 145 | 146 | dupefilter_key = request.meta.get('dupefilter_key') 147 | 148 | # 判断account_limit, 若为1则表示账号禁用中 149 | if self.account_limit.hget(user) == b'1': 150 | spider.logger.warning( 151 | 'account_limit: {}'.format(user.decode('utf-8'))) 152 | release_dupefilter_key.call(spider, dupefilter_key) 153 | raise IgnoreRequest 154 | 155 | # 判断是否为受保护搜索账号 156 | if user.decode('utf-8') in spider.custom_settings.get( 157 | 'SAFE_SEARCH_ACCOUNT'): 158 | if count >= spider.custom_settings.get('SAFE_SEARCH_LIMIT'): 159 | spider.logger.warning( 160 | '{} safe_search_limit: {}'.format( 161 | user.decode('utf-8'), count)) 162 | release_dupefilter_key.call(spider, dupefilter_key) 163 | raise IgnoreRequest 164 | 165 | # 判断search_limit,若大于上限则跳过 166 | if count >= spider.custom_settings.get('SEARCH_LIMIT'): 167 | spider.logger.warning( 168 | '{} search_limit: {}'.format(user.decode('utf-8'), count)) 169 | release_dupefilter_key.call(spider, dupefilter_key) 170 | raise IgnoreRequest 171 | 172 | if not request.cookies: 173 | spider.logger.warning('cookie is empty: {}'.format( 174 | user.decode('utf-8'))) 175 | release_dupefilter_key.call(spider, dupefilter_key) 176 | raise IgnoreRequest 177 | -------------------------------------------------------------------------------- /crwy/utils/scrapy_plugs/pipelines.py: -------------------------------------------------------------------------------- 1 | #! 
/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | """ 4 | @author: wuyue 5 | @contact: wuyue92tree@163.com 6 | @software: PyCharm 7 | @file: pipelines.py 8 | @create at: 2018-06-15 15:26 9 | 10 | 这一行开始写关于本文件的说明与解释 11 | """ 12 | 13 | import logging 14 | from pymysql.cursors import DictCursor 15 | from sqlalchemy.ext.compiler import compiles 16 | from sqlalchemy.sql.expression import Insert 17 | from crwy.utils.sql.mysql import MysqlHandle 18 | from crwy.utils.sql.sqlalchemy_m import SqlalchemyHandle 19 | from crwy.exceptions import CrwyScrapyPlugsException 20 | 21 | 22 | class MysqlSavePipeline(object): 23 | def __init__(self, db_name=None, db_host=None, db_port=None, 24 | db_username=None, db_password=None, db_charset=None, 25 | db_cursorclass=None): 26 | self.db_name = db_name 27 | self.db_host = db_host 28 | self.db_port = db_port 29 | self.db_username = db_username 30 | self.db_password = db_password 31 | self.db_charset = db_charset 32 | self.db_cursorclass = db_cursorclass 33 | self.logger = logging.getLogger(__name__) 34 | self.mysql_handle = None 35 | 36 | @classmethod 37 | def from_crawler(cls, crawler): 38 | """ 39 | loading mysql settings 40 | :param crawler: 41 | :return: 42 | """ 43 | settings = crawler.settings 44 | db_name = settings.get('MYSQL_DB_NAME', '') 45 | db_host = settings.get('MYSQL_DB_HOST', '127.0.0.1') 46 | db_port = settings.getint('MYSQL_DB_PORT', 3306) 47 | db_username = settings.get('MYSQL_DB_USERNAME', 'root') 48 | db_password = settings.get('MYSQL_DB_PASSWORD', '123456') 49 | db_charset = settings.get('MYSQL_DB_CHARSET', 'utf8') 50 | db_cursorclass = settings.get('MYSQL_DB_CURSORCLASS', DictCursor) 51 | return cls(db_name=db_name, db_host=db_host, db_port=db_port, 52 | db_username=db_username, 53 | db_password=db_password, 54 | db_charset=db_charset, 55 | db_cursorclass=db_cursorclass) 56 | 57 | def process_item(self, item, spider): 58 | self.insert_db(item) 59 | return item 60 | 61 | def open_spider(self, spider): 62 | self.mysql_handle = MysqlHandle( 63 | host=self.db_host, 64 | port=self.db_port, 65 | user=self.db_username, 66 | password=self.db_password, 67 | db=self.db_name, 68 | charset=self.db_charset, 69 | cursorclass=self.db_cursorclass 70 | ) 71 | 72 | def insert_db(self, item): 73 | """ 74 | ----------------------------------- 75 | Do something here with mysql_handle 76 | ----------------------------------- 77 | 78 | eg: 79 | sql = None 80 | data = None 81 | last_insert_id = self.mysql_handle.save( 82 | sql, data, get_last_insert_id=True) 83 | self.logger.info('item saved succcess to mysql: %s' % last_insert_id) 84 | """ 85 | pass 86 | 87 | 88 | @compiles(Insert) 89 | def append_string(insert, compiler, **kw): 90 | s = compiler.visit_insert(insert, **kw) 91 | if 'append_string' in insert.kwargs: 92 | return s + " " + insert.kwargs['append_string'] 93 | return s 94 | 95 | 96 | class SqlalchemySavePipeline(object): 97 | def __init__(self, db_url, echo=True): 98 | self.db_url = db_url 99 | self.echo = echo 100 | self.sqlalchemy_handle = None 101 | self.logger = logging.getLogger(__name__) 102 | 103 | @classmethod 104 | def from_crawler(cls, crawler): 105 | """ 106 | loading sqlalchemy settings 107 | :param crawler: 108 | :return: 109 | """ 110 | settings = crawler.settings 111 | db_url = settings.get('SQLALCHEMY_URI') 112 | echo = settings.getbool('SQLALCHEMY_ECHO') 113 | if not db_url: 114 | raise CrwyScrapyPlugsException('SQLALCHEMY_URI must be setup.') 115 | return cls(db_url, echo) 116 | 117 | def process_item(self, item, spider): 118 | 
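# (Editor's note: every scraped item is funneled through insert_db();
# subclass SqlalchemySavePipeline and override insert_db() with your
# own table logic -- its docstring below sketches an example.)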
self.insert_db(item) 119 | return item 120 | 121 | def open_spider(self, spider): 122 | self.sqlalchemy_handle = SqlalchemyHandle( 123 | db_url=self.db_url, echo=self.echo) 124 | self.sqlalchemy_handle.init_table() 125 | 126 | def insert_db(self, item): 127 | """ 128 | ----------------------------------- 129 | Do something here with sqlalchemy_handle 130 | ----------------------------------- 131 | 132 | eg: 133 | self.sqlalchemy_handle.session.execute( 134 | Test.__table__.insert(), item 135 | ) 136 | self.sqlalchemy_handle.session.commit() 137 | self.logger.info('sqlachemy inserted success.') 138 | """ 139 | pass 140 | 141 | def close_spider(self, spider): 142 | self.sqlalchemy_handle.session.close() 143 | -------------------------------------------------------------------------------- /crwy/utils/scrapy_plugs/settings.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | """ 4 | @author: wuyue 5 | @contact: wuyue92tree@163.com 6 | @software: PyCharm 7 | @file: settings.py 8 | @create at: 2018-06-20 19:33 9 | 10 | 这一行开始写关于本文件的说明与解释 11 | """ 12 | 13 | from crwy.utils.load_settings import LoadSettingsFromConsul 14 | from crwy.exceptions import CrwyException 15 | 16 | 17 | class ScrapySettingsFromConsul(LoadSettingsFromConsul): 18 | def __init__(self, spider_name, bot_name, prefix='scrapy', **kwargs): 19 | super(ScrapySettingsFromConsul, self).__init__(**kwargs) 20 | self.spider_name = spider_name 21 | self.bot_name = bot_name 22 | self.prefix = prefix 23 | 24 | def init_main_key(self, key=None): 25 | if not key: 26 | self.main_key = '{prefix}/{bot_name}/{spider_name}'.format( 27 | prefix=self.prefix, bot_name=self.bot_name, 28 | spider_name=self.spider_name 29 | ) 30 | else: 31 | self.main_key = key 32 | 33 | def _get_settings(self, key=None): 34 | self.init_main_key(key=key) 35 | index, data = self.c.kv.get(self.main_key, recurse=True) 36 | if not data: 37 | raise CrwyException('Please make sure the key: <%s> is exist.' % 38 | self.main_key) 39 | 40 | new_data = { 41 | item.get('Key').split('/')[-1]: eval(item.get('Value')) 42 | for item in data 43 | } 44 | new_data['SPIDER_NAME'] = self.spider_name 45 | 46 | return new_data 47 | 48 | @classmethod 49 | def get_settings(cls, spider_name, bot_name, key=None, prefix='scrapy', 50 | **kwargs): 51 | load_settings = cls(spider_name, bot_name, prefix=prefix, **kwargs) 52 | return load_settings._get_settings(key=key) 53 | -------------------------------------------------------------------------------- /crwy/utils/selenium_api.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | """ 4 | @author: wuyue 5 | @contact: wuyue92tree@163.com 6 | @software: IntelliJ IDEA 7 | @file: selenium_api.py 8 | @create at: 2018-10-15 11:51 9 | 10 | 这一行开始写关于本文件的说明与解释 11 | """ 12 | 13 | import os 14 | import re 15 | import time 16 | import uuid 17 | 18 | from PIL import Image 19 | from crwy.spider import Spider 20 | from crwy.exceptions import CrwyImportException 21 | 22 | try: 23 | from selenium import webdriver 24 | except ImportError: 25 | raise CrwyImportException( 26 | "You should install selenium first! 
suggestion: pip install " 27 | "selenium==3.6.0") 28 | from selenium.common.exceptions import TimeoutException, NoSuchElementException 29 | from selenium.webdriver import DesiredCapabilities, ActionChains 30 | from selenium.webdriver.support import expected_conditions as EC 31 | from selenium.webdriver.support.wait import WebDriverWait 32 | 33 | 34 | class SeleniumApi(Spider): 35 | def __init__(self, driver_type='chrome', 36 | hub_url='http://127.0.0.1:4444/wd/hub', 37 | proxy=None, user_agent=None, use_hub=True, 38 | logger=None): 39 | super(SeleniumApi, self).__init__(logger=logger) 40 | """ 41 | :param driver_type: driver类型 42 | :param hub_url: hub server地址 43 | :param proxy: 代理地址 44 | :param user_agent: user_agent 45 | :param use_hub: 是否启用hub,为False时使用本地driver 46 | """ 47 | self.driver_type = driver_type 48 | self.hub_url = hub_url 49 | self.proxy = proxy 50 | self.use_hub = use_hub 51 | self.user_agent = user_agent 52 | # device_pixel_ratio 用于处理高分屏dpi抠图 53 | self.device_pixel_ratio = 1 54 | self.driver = self.init_driver() 55 | self.driver.set_window_size(1280, 960) 56 | 57 | def _init_chrome_driver(self): 58 | chrome_options = webdriver.ChromeOptions() 59 | if self.proxy: 60 | chrome_options.add_argument('--proxy-server=%s' % self.proxy) 61 | if self.user_agent: 62 | chrome_options.add_argument('--user-agent=%s' % self.user_agent) 63 | desired_capabilities = chrome_options.to_capabilities() 64 | if self.use_hub: 65 | driver = webdriver.Remote( 66 | command_executor=self.hub_url, 67 | desired_capabilities=desired_capabilities 68 | ) 69 | else: 70 | driver = webdriver.Chrome( 71 | chrome_options=chrome_options 72 | ) 73 | return driver 74 | 75 | def _init_firefox_driver(self): 76 | firefox_profile = webdriver.FirefoxProfile() 77 | if self.proxy: 78 | ip, port = self.proxy.split(':') 79 | firefox_profile.set_preference('network.proxy.type', 1) 80 | firefox_profile.set_preference('network.proxy.http', ip) 81 | firefox_profile.set_preference('network.proxy.http_port', int(port)) 82 | firefox_profile.set_preference('network.proxy.ssl', ip) 83 | firefox_profile.set_preference('network.proxy.ssl_port', int(port)) 84 | if self.user_agent: 85 | firefox_profile.set_preference( 86 | 'general.useragent.override', self.user_agent) 87 | firefox_profile.update_preferences() 88 | desired_capabilities = DesiredCapabilities.FIREFOX 89 | if self.use_hub: 90 | driver = webdriver.Remote( 91 | command_executor=self.hub_url, 92 | desired_capabilities=desired_capabilities, 93 | browser_profile=firefox_profile 94 | ) 95 | else: 96 | driver = webdriver.Firefox( 97 | firefox_profile=firefox_profile, 98 | capabilities=desired_capabilities 99 | ) 100 | return driver 101 | 102 | def init_driver(self): 103 | if self.driver_type.upper() == 'CHROME': 104 | return self._init_chrome_driver() 105 | elif self.driver_type.upper() == 'FIREFOX': 106 | return self._init_firefox_driver() 107 | raise Exception('No supported driver: %s' % self.driver_type) 108 | 109 | @staticmethod 110 | def is_similar(image1, image2, x, y, distance=25): 111 | """ 112 | 对比RGB值 113 | :param image1: 待对比的图片1 114 | :param image2: 待对比的图片2 115 | :param x: x坐标 116 | :param y: y坐标 117 | :param distance: 色差 118 | :return: 119 | """ 120 | # 获取指定位置的RGB值 121 | pixel1 = image1.getpixel((x, y)) 122 | pixel2 = image2.getpixel((x, y)) 123 | for i in range(0, 3): 124 | # 如果相差超过50则就认为找到了缺口的位置 125 | # print(x, y, pixel1, pixel2) 126 | if abs(pixel1[i] - pixel2[i]) >= distance: 127 | return False 128 | return True 129 | 130 | def get_diff_location(self, image1, 
image2): 131 | """ 132 | 计算缺口的位置 133 | :param image1: 134 | :param image2: 135 | :return: 136 | """ 137 | i = 0 138 | # 两张原始图的大小都是相同的260*160 139 | # 那就通过两个for循环依次对比每个像素点的RGB值 140 | # 如果相差超过50则就认为找到了缺口的位置 141 | for i in range(0, image1.width): 142 | for j in range(0, image1.height): 143 | if self.is_similar(image1, image2, i, j) is False: 144 | return i 145 | return i 146 | 147 | def get_img(self, screenshot, xpath): 148 | """ 149 | 获取验证码图片 150 | 151 | :param screenshot: 页面截图 152 | :param xpath: 验证码图片xpath 153 | :return: img对象 154 | """ 155 | self.device_pixel_ratio = self.driver.execute_script( 156 | "return window.devicePixelRatio;") 157 | element = self.driver.find_element_by_xpath(xpath) 158 | left = int(element.location['x']) * self.device_pixel_ratio 159 | top = int(element.location['y']) * self.device_pixel_ratio 160 | right = int(element.location['x'] + 161 | element.size['width']) * self.device_pixel_ratio 162 | bottom = int(element.location['y'] + 163 | element.size['height']) * self.device_pixel_ratio 164 | img = Image.open(screenshot) 165 | img = img.crop((left, top, right, bottom)) 166 | return img 167 | 168 | def click_img(self, answer, height, identify_img_xpath1=None, 169 | identify_button_xpath=None): 170 | """ 171 | 根据打码返回的坐标进行点击操作 172 | 仅适用与点击型验证码 173 | 174 | :param answer: 打码返回结果 175 | :param height: 答案高度 176 | :param identify_img_xpath1: 题目xpath 177 | :param identify_button_xpath: 验证按钮 178 | :return: 179 | """ 180 | actions = ActionChains(self.driver) 181 | img = self.driver.find_element_by_xpath(identify_img_xpath1) 182 | points = answer.split('|') 183 | for point in points: 184 | x, y = eval(point) 185 | actions.move_to_element_with_offset( 186 | img, x, y - int((height / self.device_pixel_ratio))) 187 | actions.click() 188 | actions.perform() 189 | time.sleep(2) 190 | if not identify_button_xpath: 191 | return 192 | self.driver.find_element_by_xpath(identify_button_xpath).click() 193 | 194 | def deal_normal_verification_code( 195 | self, captcha_obj, captcha_code, identify_img_xpath): 196 | uuid_str = str(uuid.uuid1()) 197 | screenshot_path = './data/img/screenshot_%s.png' % uuid_str 198 | check_img_path = './data/img/check_image-%s.png' % uuid_str 199 | self.driver.save_screenshot(screenshot_path) 200 | img = self.get_img(screenshot_path, xpath=identify_img_xpath) 201 | img.save(check_img_path) 202 | answer = captcha_obj.decode(check_img_path, captcha_code) 203 | self.logger.info('get normal captcha code : %s' % answer) 204 | os.remove(screenshot_path) 205 | os.remove(check_img_path) 206 | return answer 207 | 208 | def deal_click_verification_code( 209 | self, captcha_obj, captcha_code, 210 | identify_img_xpath, identify_img_xpath1, 211 | identify_button_xpath): 212 | """ 213 | 处理点击型验证码 214 | 215 | :param captcha_obj: 216 | :param captcha_code: 217 | :param identify_img_xpath: 218 | :param identify_img_xpath1: 219 | :param identify_button_xpath: 220 | :return: 221 | """ 222 | uuid_str = str(uuid.uuid1()) 223 | screenshot_path = './data/img/screenshot_%s.png' % uuid_str 224 | check_img_path = './data/img/check_image-%s.png' % uuid_str 225 | self.driver.save_screenshot(screenshot_path) 226 | img1 = self.get_img(screenshot_path, xpath=identify_img_xpath) 227 | img2 = self.get_img(screenshot_path, xpath=identify_img_xpath1) 228 | to_image = Image.new('RGBA', (img2.width, img1.height + img2.height)) 229 | to_image.paste(img1, (0, 0)) 230 | to_image.paste(img2, (0, img1.height)) 231 | if self.device_pixel_ratio > 1: 232 | to_image = to_image.resize( 233 | 
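# (Editor's note: on HiDPI displays the screenshot is
# device_pixel_ratio times larger than the CSS-pixel geometry used
# for clicking, so the stitched captcha image is scaled back down
# here; click_img() above divides click offsets by the same ratio.)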
(int(to_image.width / self.device_pixel_ratio), 234 | int(to_image.height / self.device_pixel_ratio)) 235 | ) 236 | to_image.save(check_img_path) 237 | answer = captcha_obj.decode(check_img_path, captcha_code) 238 | self.logger.info('get click captcha code : %s' % answer) 239 | self.click_img(answer, img1.height, 240 | identify_img_xpath1=identify_img_xpath1, 241 | identify_button_xpath=identify_button_xpath) 242 | os.remove(screenshot_path) 243 | os.remove(check_img_path) 244 | 245 | def get_mobile_code(self, phone_obj, phone, phone_token, 246 | check_str='智联招聘', regexp='\d+', 247 | retry_times=20, sleep_time=5): 248 | while retry_times > 0: 249 | msg = phone_obj.get_message(token=phone_token, phone=phone) 250 | if check_str in msg: 251 | code = re.findall(regexp, msg)[0] 252 | self.logger.info( 253 | '{}: get mobile code success. code is: {}'.format( 254 | phone, code)) 255 | return code 256 | 257 | self.logger.info( 258 | '{}: no more message received. sleep {}s'.format( 259 | phone, sleep_time)) 260 | time.sleep(sleep_time) 261 | retry_times -= 1 262 | 263 | def is_element_visible(self, element): 264 | """ 265 | 判断元素是否存在 266 | :param element: 267 | :return: 268 | """ 269 | driver = self.driver 270 | try: 271 | the_element = EC.visibility_of_element_located(element) 272 | assert the_element(driver) 273 | flag = True 274 | except (AssertionError, NoSuchElementException): 275 | self.logger.warning('the element is not visible.') 276 | flag = False 277 | except Exception as e: 278 | self.logger.exception(e) 279 | flag = False 280 | return flag 281 | 282 | def wait_element(self, by, by_value, timeout=5): 283 | try: 284 | WebDriverWait( 285 | self.driver, timeout).until( 286 | EC.presence_of_element_located((by, by_value))) 287 | except TimeoutException as e: 288 | self.logger.exception(e) 289 | 290 | @staticmethod 291 | def cookies2dict(cookies): 292 | """ 293 | trans cookies 294 | :param cookies: driver.get_cookies() 295 | :return: 296 | """ 297 | cookie_dict = {} 298 | for item in cookies: 299 | cookie_dict[item['name']] = item['value'] 300 | return cookie_dict 301 | 302 | def release(self): 303 | try: 304 | self.driver.quit() 305 | self.driver.close() 306 | except: 307 | pass 308 | -------------------------------------------------------------------------------- /crwy/utils/sql/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wuyue92tree/crwy/da19e3a7615990612e41d0d871345075150b88f6/crwy/utils/sql/__init__.py -------------------------------------------------------------------------------- /crwy/utils/sql/mysql.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | """ 4 | @author: wuyue 5 | @contact: wuyue92tree@163.com 6 | @software: PyCharm 7 | @file: mysql.py 8 | @create at: 2017-12-15 14:20 9 | 10 | 这一行开始写关于本文件的说明与解释 11 | """ 12 | 13 | from crwy.exceptions import CrwyImportException, CrwyDbException 14 | from crwy.decorates import cls2singleton 15 | 16 | try: 17 | import pymysql 18 | except ImportError: 19 | raise CrwyImportException( 20 | "You should install pymysql first! try: pip install " 21 | "pymysql") 22 | try: 23 | from dbutils.persistent_db import PersistentDB 24 | except ImportError: 25 | raise CrwyImportException( 26 | "You should install DBUtils first! 
try: pip install " 27 | "dbutils>=2.0") 28 | 29 | 30 | @cls2singleton 31 | class MysqlHandle(object): 32 | def __init__(self, **kwargs): 33 | self._mysql_pool = PersistentDB(pymysql, **kwargs) 34 | 35 | def query_by_sql(self, sql): 36 | conn = self._mysql_pool.connection() 37 | cur = conn.cursor() 38 | try: 39 | cur.execute(sql) 40 | result = cur.fetchall() 41 | return result 42 | except Exception as e: 43 | raise CrwyDbException(e) 44 | finally: 45 | cur.close() 46 | conn.close() 47 | 48 | def save(self, sql, data, many=False, get_last_insert_id=False): 49 | conn = self._mysql_pool.connection() 50 | cur = conn.cursor() 51 | try: 52 | if many is False: 53 | cur.execute(sql, data) 54 | else: 55 | cur.executemany(sql, data) 56 | conn.commit() 57 | 58 | if get_last_insert_id is False: 59 | return 60 | 61 | cur.execute("select last_insert_id() as id") 62 | res = cur.fetchone() 63 | if isinstance(res, tuple): 64 | return res[0] 65 | elif isinstance(res, dict): 66 | return res.get('id') 67 | else: 68 | return res 69 | 70 | except Exception as e: 71 | raise CrwyDbException(e) 72 | finally: 73 | cur.close() 74 | conn.close() 75 | -------------------------------------------------------------------------------- /crwy/utils/sql/pg.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | """ 4 | @author: wuyue 5 | @contact: wuyue92tree@163.com 6 | @software: PyCharm 7 | @file: pg.py 8 | @create at: 2017-12-15 14:28 9 | 10 | 这一行开始写关于本文件的说明与解释 11 | """ 12 | 13 | from crwy.exceptions import CrwyImportException, CrwyDbException 14 | from crwy.decorates import cls2singleton 15 | 16 | try: 17 | import pgdb 18 | except ImportError: 19 | raise CrwyImportException("You should install PyGreSQL first! try: pip " 20 | "install PyGreSQL") 21 | try: 22 | from dbutils.persistent_db import PersistentDB 23 | except ImportError: 24 | raise CrwyImportException( 25 | "You should install DBUtils first! 
try: pip install " 26 | "dbutils>=2.0") 27 | 28 | 29 | @cls2singleton 30 | class PgHandle(object): 31 | def __init__(self, **kwargs): 32 | self._pg_pool = PersistentDB(pgdb, **kwargs) 33 | 34 | def query_by_sql(self, sql): 35 | conn = self._pg_pool.connection() 36 | cur = conn.cursor() 37 | try: 38 | cur.execute(sql) 39 | result = cur.fetchall() 40 | return result 41 | except Exception as e: 42 | raise CrwyDbException(e) 43 | finally: 44 | cur.close() 45 | conn.close() 46 | 47 | def save(self, sql, data, many=False, get_last_insert_id=False): 48 | conn = self._pg_pool.connection() 49 | cur = conn.cursor() 50 | try: 51 | if get_last_insert_id is True: 52 | sql = sql.strip(';') 53 | sql = sql + ' returning id' 54 | 55 | if many is False: 56 | cur.execute(sql, data) 57 | else: 58 | cur.executemany(sql, data) 59 | 60 | conn.commit() 61 | 62 | if get_last_insert_id is True: 63 | res = cur.fetchone() 64 | return res.id 65 | 66 | except Exception as e: 67 | raise CrwyDbException(e) 68 | finally: 69 | cur.close() 70 | conn.close() 71 | -------------------------------------------------------------------------------- /crwy/utils/sql/sqlalchemy_m.py: -------------------------------------------------------------------------------- 1 | #!/usr/local/bin/python 2 | # -*- coding: utf-8 -*- 3 | # author: wuyue92tree@163.com 4 | 5 | from sqlalchemy import create_engine 6 | from sqlalchemy.orm import sessionmaker 7 | from sqlalchemy.ext.declarative import declarative_base 8 | from crwy.decorates import cls2singleton 9 | 10 | Base = declarative_base() 11 | 12 | 13 | @cls2singleton 14 | class SqlalchemyHandle(object): 15 | """ 16 | 以ORM的方式连接数据库 17 | """ 18 | 19 | def __init__(self, db_url, **kwargs): 20 | self.engine = create_engine(db_url, **kwargs) 21 | DBSession = sessionmaker(bind=self.engine) 22 | self.session = DBSession() 23 | 24 | def init_table(self): 25 | return Base.metadata.create_all(self.engine) 26 | 27 | def delete_table(self): 28 | return Base.metadata.drop_all(self.engine) 29 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | beautifulsoup4>=4.5.1 2 | configparser>=3.5.0 3 | requests>=2.20.0 4 | gevent>=1.2.1 5 | retrying>=1.3.3 6 | imapclient>=2.0.0 7 | DBUtils>=2.0 8 | redis 9 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | from setuptools import setup, find_packages 5 | from crwy import version 6 | 7 | install_requires = [] 8 | 9 | with open('requirements.txt', 'r') as f: 10 | for req in f.readlines(): 11 | install_requires.append(req.strip('\n')) 12 | 13 | 14 | setup( 15 | name='Crwy', 16 | version=version, 17 | url='https://github.com/wuyue92tree/crwy', 18 | description='A Simple Web Crawling and Web Scraping framework', 19 | long_description=open('README.rst', encoding='utf-8').read(), 20 | author='wuyue', 21 | author_email='wuyue92tree@163.com', 22 | maintainer='wuyue', 23 | maintainer_email='wuyue92tree@163.com', 24 | license='MIT', 25 | packages=find_packages(exclude=('tests', 'tests.*')), 26 | include_package_data=True, 27 | zip_safe=False, 28 | entry_points={ 29 | 'console_scripts': ['crwy = crwy.cmdline:execute'] 30 | }, 31 | install_requires=install_requires, 32 | ) 33 | --------------------------------------------------------------------------------
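Editor's addition: to show how the pieces above fit together, here is a minimal, hypothetical Scrapy `settings.py` sketch for the `scrapy_plugs` utilities. Only the setting names come from `dupefilters.py` and `pipelines.py`; the bot name, Redis URL, database URI and pipeline path are illustrative placeholders:

    # hypothetical settings.py of a Scrapy project using crwy's plugins
    BOT_NAME = 'demo_bot'
    SPIDER_NAME = 'demo_spider'  # required, or RedisRFPDupeFilter raises NotConfigured

    # scrapy-redis connection shared by the dupefilter and cookie middlewares
    REDIS_URL = 'redis://127.0.0.1:6379/0'

    DUPEFILTER_CLASS = 'crwy.utils.scrapy_plugs.dupefilters.RedisRFPDupeFilter'
    DUPEFILTER_DEBUG = False
    DUPEFILTER_DELAY_DAY = 7   # re-crawl a seen dupefilter_key after 7 days (0 = never)
    DUPEFILTER_DO_HASH = True  # store sha1(dupefilter_key) instead of the raw key

    ITEM_PIPELINES = {
        # a project-defined subclass that overrides insert_db()
        'demo_bot.pipelines.DemoSqlalchemyPipeline': 300,
    }
    SQLALCHEMY_URI = 'mysql+pymysql://root:123456@127.0.0.1:3306/demo'
    SQLALCHEMY_ECHO = False

Note that only requests carrying a `dupefilter_key` in `meta` take part in deduplication (see `request_seen()` above), e.g. `yield scrapy.Request(url, meta={'dupefilter_key': item_id})`; requests without it are never filtered.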