├── .gitignore ├── .travis.yml ├── LICENSE ├── MANIFEST.in ├── README.md ├── README.rst ├── crwy ├── __init__.py ├── changetmpl.py ├── cmdline.py ├── decorates.py ├── exceptions.py ├── settings │ ├── __init__.py │ └── default_settings.py ├── spider.py ├── templates │ ├── project │ │ ├── logger_py2.conf.tmpl │ │ └── logger_py3.conf.tmpl │ └── spiders │ │ ├── crwybasic.tmpl │ │ ├── crwycrawl.tmpl │ │ └── crwyredis.tmpl └── utils │ ├── __init__.py │ ├── common.py │ ├── data │ ├── RedisHash.py │ └── __init__.py │ ├── extend │ ├── __init__.py │ ├── chaojiying.py │ ├── dingding_robot.py │ ├── xunma.py │ └── yima.py │ ├── filter │ ├── RedisSet.py │ ├── RedisSortedSet.py │ └── __init__.py │ ├── html │ ├── __init__.py │ ├── font_analysis.py │ ├── html_downloader.py │ └── html_parser.py │ ├── load_settings.py │ ├── logger.py │ ├── mail.py │ ├── no_sql │ ├── __init__.py │ └── redis_m.py │ ├── pyppeteer_api.py │ ├── queue │ ├── RedisQueue.py │ ├── SsdbQueue.py │ └── __init__.py │ ├── scrapy_plugs │ ├── __init__.py │ ├── dupefilters.py │ ├── middlewares.py │ ├── pipelines.py │ └── settings.py │ ├── selenium_api.py │ └── sql │ ├── __init__.py │ ├── mysql.py │ ├── pg.py │ └── sqlalchemy_m.py ├── requirements.txt └── setup.py /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | .DS_Store 3 | .idea 4 | *.db 5 | *.conf -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | python: 3 | - '2.7' 4 | - '3.6' 5 | script: python setup.py install 6 | deploy: 7 | provider: pypi 8 | user: wuyue 9 | password: 10 | secure: 7sJA2nimxQQLdm6iifp0cC2ccDj+rPdzRPVB9BlNQV03G7ev37eXC3D2BWZcUyCUhUrIPFRlYuAxrcq6XlKJ9+0vQH8XhWpaMQhxcU2Zca8+AVFZ3yWrPihRGGWOvBpNQ25d5dc9SPMQIvzJdnVTgkbxQf3kvEk3rSSKfPWScKZZWYWUf2btunRzJSC24O6BvcbU9XW7dgXTUR7wb5P1JDFsQ3+U4DK7X+HEGq4TO1rYobEEw9Bnf8RGQuR1L64vusza3TDTag3D5yQ3iC8rX9GLxLQGlVnVlTUuj1jfw78m6jSQgNDB7Eyt3Nk9kbSDlSeed/uD+aWSDm8jh/RinZ0/OBq/yUz7/hkermvevZgnGQq36TH5L1xzlphAIO39gLL0RtEPYLw24jUmE+fRK8C5g2YLpVaV4JKqtrh2qNKjbCXXSXYXIN2cdkjRCXAEfs6bOhhrV1JecOwseIfG+gQLzY/WSUU0OCNnPZo21kl4kKH45hI96QwXLM6PPfk69JWE3DIPTB5F/Nht5YZfi66Ni9a/0LLew4qKaGBk19UvguMAfU8LonN/m+REoNJRGdaaSPq6BH224NtqnnFm/brqfQ/ZZCFXCaRoNPUip2k2wShfkH6LjG5BYsm3V83xJikFphrgNFHRuY3mFe6bA6SedvJvLH2/LpV28vbRh1A= 11 | skip_existing: true 12 | skip_cleanup: true 13 | on: 14 | tag: true 15 | all_branches: true 16 | 17 | branches: 18 | except: 19 | - develop 20 | - master 21 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2018 wuyue 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 
-------------------------------------------------------------------------------- /MANIFEST.in: --------------------------------------------------------------------------------
1 | include MANIFEST.in
2 | include README.md
3 | include requirements.txt
4 | recursive-include crwy/templates *
5 | 
-------------------------------------------------------------------------------- /README.md: --------------------------------------------------------------------------------
1 | # Crwy
2 | 
3 | [![PyPI Version](https://img.shields.io/pypi/v/Crwy.svg)](https://pypi.python.org/pypi/Crwy)
4 | [![Download Status](https://img.shields.io/pypi/dm/Crwy.svg)](https://pypi.python.org/pypi/Crwy)
5 | [![Build Status](https://travis-ci.org/wuyue92tree/crwy.svg)](https://travis-ci.org/wuyue92tree/crwy)
6 | [![License Status](https://img.shields.io/github/license/wuyue92tree/crwy)](https://raw.githubusercontent.com/wuyue92tree/crwy/master/LICENSE)
7 | 
8 | 
9 | # Introduction
10 | 
11 | Crwy is a lightweight crawling framework whose structure is modeled on Scrapy. It provides practical spider templates intended to help you implement crawling tasks quickly and develop efficiently, and it offers reusable building blocks for scrapy users ^.^. gevent support has been added so that spiders run asynchronously and therefore faster.
12 | 
13 | # Requirements
14 | 
15 | 
16 | * Python3
17 | * Works on Linux, Mac OSX
18 | 
19 | # Installation
20 | 
21 | 
22 | Quick install
23 | ```
24 | pip install crwy
25 | ```
26 | 
27 | or
28 | download from: https://pypi.python.org/pypi/Crwy/
29 | 
30 | 
31 | # TODO
32 | 
33 | - improve scrapy_plugs
34 | - improve selenium_api
35 | - python3 compatibility
-------------------------------------------------------------------------------- /README.rst: --------------------------------------------------------------------------------
1 | Crwy
2 | ====
3 | 
4 | |PyPI Version| |Download Status| |Build Status| |License Status|
5 | 
6 | Introduction
7 | ============
8 | 
9 | Crwy is a lightweight crawling framework whose structure is modeled on Scrapy. It provides practical spider templates intended to help you implement crawling tasks quickly and develop efficiently, and it offers reusable building blocks for scrapy users ^.^. gevent support has been added so that spiders run asynchronously and therefore faster.
10 | 
11 | Requirements
12 | ============
13 | 
14 | - Python3
15 | - Works on Linux, Mac OSX
16 | 
17 | 
18 | Installation
19 | ============
20 | 
21 | Quick install
22 | 
23 | ::
24 | 
25 |     pip install crwy
26 | 
27 | or download from: https://pypi.python.org/pypi/Crwy/
28 | 
29 | TODO
30 | ====
31 | 
32 | - improve scrapy_plugs
33 | - improve selenium_api
34 | - python3 compatibility
35 | 
36 | .. |PyPI Version| image:: https://img.shields.io/pypi/v/Crwy.svg
37 |    :target: https://pypi.python.org/pypi/Crwy
38 | .. |Download Status| image:: https://img.shields.io/pypi/dm/Crwy.svg
39 |    :target: https://pypi.python.org/pypi/Crwy
40 | .. |Build Status| image:: https://travis-ci.org/wuyue92tree/crwy.svg
41 |    :target: https://travis-ci.org/wuyue92tree/crwy
42 | .. |License Status| image:: https://img.shields.io/github/license/wuyue92tree/crwy
43 |    :target: https://raw.githubusercontent.com/wuyue92tree/crwy/master/LICENSE
-------------------------------------------------------------------------------- /crwy/__init__.py: --------------------------------------------------------------------------------
1 | version = '1.7.1'
2 | 
-------------------------------------------------------------------------------- /crwy/changetmpl.py: --------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | # author: wuyue92tree@163.com
4 | 
5 | from string import Template
6 | 
7 | try:
8 |     from configparser import ConfigParser
9 | except ImportError:
10 |     from ConfigParser import ConfigParser  # Python 2
11 | 
12 | 
13 | def get_project_name():
14 |     conf = ConfigParser()
15 |     try:
16 |         conf.read('crwy.cfg', encoding='utf-8')
17 |     except TypeError:
18 |         # Python 2's ConfigParser.read() has no encoding argument
19 |         conf.read('crwy.cfg')
20 |     # return text, not bytes: Template.substitute() expects a str
21 |     return conf.get('project', 'project_name')
22 | 
23 | 
24 | def change_project_name(name, path):
25 |     with open(path, 'r') as f:
26 |         return Template(f.read()).substitute(project_name=name)
27 | 
28 | 
29 | def change_spider_name(name, path):
30 |     class_name = name.capitalize()
31 |     spider_name = name
32 |     project_name = get_project_name()
33 |     with open(path, 'r') as f:
34 |         return Template(f.read()).substitute(class_name=class_name,
35 |                                              spider_name=spider_name,
36 |                                              project_name=project_name)
-------------------------------------------------------------------------------- /crwy/cmdline.py: --------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | # author: wuyue92tree@163.com
4 | 
5 | from __future__ import print_function
6 | import os
7 | import shutil
8 | import scrapy
9 | from optparse import OptionParser
10 | from crwy import version
11 | from crwy.settings.default_settings import TEMPLATE_DIR
12 | 
13 | CRWY_SPIDER_TEMPLATE_DIR = os.path.join(TEMPLATE_DIR, 'spiders')
14 | SCRAPY_SPIDER_TEMPLATE_DIR = os.path.join(scrapy.__path__[0],
15 |                                           'templates/spiders')
16 | 
17 | 
18 | def install():
19 |     scrapy_tmpl = os.listdir(SCRAPY_SPIDER_TEMPLATE_DIR)
20 |     for tmpl in os.listdir(CRWY_SPIDER_TEMPLATE_DIR):
21 |         if tmpl in scrapy_tmpl:
22 |             print('{} exists.'.format(tmpl))
23 |             continue
24 |         shutil.copy(os.path.join(CRWY_SPIDER_TEMPLATE_DIR, tmpl),
25 |                     os.path.join(SCRAPY_SPIDER_TEMPLATE_DIR, tmpl))
26 |         print('{} installed.'.format(tmpl))
27 | 
28 | 
29 | def uninstall():
30 |     crwy_tmpl = os.listdir(CRWY_SPIDER_TEMPLATE_DIR)
31 |     for tmpl in os.listdir(SCRAPY_SPIDER_TEMPLATE_DIR):
32 |         if tmpl not in crwy_tmpl:
33 |             print('{} not match, skip.'.format(tmpl))
34 |             continue
35 |         os.remove(os.path.join(SCRAPY_SPIDER_TEMPLATE_DIR, tmpl))
36 |         print('{} uninstalled.'.format(tmpl))
37 | 
38 | 
39 | def execute():
40 |     parser = OptionParser(usage="Usage: crwy [options] arg1 arg2")
41 |     parser.add_option('-i', '--install', action="store_true",
42 |                       help='install crwy tmpl for scrapy')
43 |     parser.add_option('-u', '--uninstall', action="store_true",
44 |                       help='uninstall crwy tmpl for scrapy')
45 |     parser.add_option('-v', '--version', action="store_true",
46 |                       help='print version')
47 |     options, args = parser.parse_args()
48 |     if options.version:
49 |         print(version)
50 |     elif options.install:
51 |         install()
52 |     elif options.uninstall:
53 |         uninstall()
54 |     else:
55 |         parser.print_help()
56 | 
57 | 
58 | if __name__ == '__main__':
59 |     execute()
60 | 
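A minimal usage sketch (assuming crwy and scrapy are installed in the same environment): `crwy -i` copies the bundled templates into scrapy's template directory, after which scrapy's stock `genspider -t` command can generate spiders from them. The printed lines follow the `install()` output format above:

    $ crwy -i
    crwybasic.tmpl installed.
    crwycrawl.tmpl installed.
    crwyredis.tmpl installed.
    $ scrapy genspider -t crwybasic demo example.com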
-------------------------------------------------------------------------------- /crwy/decorates.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | """ 4 | @author: wuyue 5 | @contact: wuyue92tree@163.com 6 | @software: PyCharm 7 | @file: decorates.py 8 | @create at: 2017-12-07 09:47 9 | 10 | 这一行开始写关于本文件的说明与解释 11 | """ 12 | 13 | import functools 14 | from crwy.exceptions import CrwyCookieValidException 15 | 16 | 17 | def cls2singleton(cls, *args, **kwargs): 18 | """ 19 | 将类转换为单例模式 20 | :param cls: 21 | :param args: 22 | :param kwargs: 23 | :return: 24 | """ 25 | instances = {} 26 | 27 | def _singleton(*args, **kwargs): 28 | if kwargs.pop('cls_singleton', True) is False: 29 | return cls(*args, **kwargs) 30 | if cls not in instances: 31 | instances[cls] = cls(*args, **kwargs) 32 | return instances[cls] 33 | 34 | return _singleton 35 | 36 | 37 | def cls_catch_exception(func): 38 | """ 39 | 该装饰器用于捕捉类方法异常 40 | 1. 未出现异常,直接return方法执行结果 41 | 2. 出现异常,则先将异常记入日志,再抛出异常 42 | :param func: 43 | :return: 44 | """ 45 | 46 | @functools.wraps(func) 47 | def wrapper(self, *args, **kwargs): 48 | try: 49 | return func(self, *args, **kwargs) 50 | except Exception as e: 51 | self.logger.exception(e) 52 | raise e 53 | 54 | return wrapper 55 | 56 | 57 | def cls_refresh_cookie(func): 58 | """ 59 | 该装饰器用于捕捉类方法异常 CrwyCookieValidException 60 | 1. 未出现异常,直接return方法执行结果 61 | 2. 出现异常,则先调用self.get_cookie()进行cookie刷新,若cookie刷新成功, 62 | 直接return返回 63 | :param func: 64 | :return: 65 | """ 66 | 67 | @functools.wraps(func) 68 | def wrapper(self, *args, **kwargs): 69 | try: 70 | return func(self, *args, **kwargs) 71 | except CrwyCookieValidException as e: 72 | if not self.get_cookie(): 73 | self.logger.warning("Func[%s]: cookie更新失败." % func.__name__) 74 | raise e 75 | self.logger.info("Func[%s]: cookie更新成功." % func.__name__) 76 | return func(self, *args, **kwargs) 77 | 78 | return wrapper 79 | -------------------------------------------------------------------------------- /crwy/exceptions.py: -------------------------------------------------------------------------------- 1 | #! 
/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | """ 4 | @author: wuyue 5 | @contact: wuyue92tree@163.com 6 | @software: PyCharm 7 | @file: exceptions.py 8 | @create at: 2017-12-13 14:14 9 | 10 | 这一行开始写关于本文件的说明与解释 11 | """ 12 | 13 | 14 | class CrwyException(Exception): 15 | def __init__(self, value): 16 | self.value = value 17 | 18 | def __str__(self): 19 | return repr(self.value) 20 | 21 | 22 | class CrwyImportException(CrwyException): 23 | pass 24 | 25 | 26 | class CrwyKafkaException(CrwyException): 27 | pass 28 | 29 | 30 | class CrwyMnsException(CrwyException): 31 | pass 32 | 33 | 34 | class CrwyDbException(CrwyException): 35 | pass 36 | 37 | 38 | class CrwyExtendException(CrwyException): 39 | pass 40 | 41 | 42 | class CrwyCookieValidException(CrwyException): 43 | pass 44 | 45 | 46 | class CrwyScrapyPlugsException(CrwyException): 47 | pass 48 | -------------------------------------------------------------------------------- /crwy/settings/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wuyue92tree/crwy/da19e3a7615990612e41d0d871345075150b88f6/crwy/settings/__init__.py -------------------------------------------------------------------------------- /crwy/settings/default_settings.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # author: wuyue92tree@163.com 4 | 5 | import os 6 | 7 | BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) 8 | 9 | DATEBASE_DIR = os.path.join(BASE_DIR, 'data') 10 | 11 | TEMPLATE_DIR = os.path.join(BASE_DIR, 'templates') 12 | 13 | CONF_DIR = os.path.join(BASE_DIR, 'crwy') -------------------------------------------------------------------------------- /crwy/spider.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # author: wuyue92tree@163.com 4 | 5 | import inspect 6 | from crwy.utils.html.html_downloader import HtmlDownloader 7 | from crwy.utils.html.html_parser import HtmlParser 8 | 9 | 10 | class BaseSpider(object): 11 | """ Spider基础类 """ 12 | def __init__(self): 13 | """ 14 | 初始化下载器/解析器及日志接口 15 | """ 16 | self.html_downloader = HtmlDownloader() 17 | self.html_parser = HtmlParser() 18 | 19 | 20 | class Spider(BaseSpider): 21 | """ Spider类 提供基本方法 """ 22 | def __init__(self, logger=None): 23 | super(Spider, self).__init__() 24 | self.login_kwargs = None # 用于存放登录时所需的参数 25 | self.proxies = None 26 | if logger: 27 | self.logger = logger 28 | else: 29 | from crwy.utils.logger import Logger 30 | self.logger = Logger.timed_rt_logger() 31 | 32 | def login(self, *args, **kwargs): 33 | pass 34 | 35 | def clean(self, *args, **kwargs): 36 | pass 37 | 38 | def save(self, *args, **kwargs): 39 | pass 40 | 41 | def get_cookie(self): 42 | pass 43 | 44 | @staticmethod 45 | def func_name(): 46 | """ 返回函数名称 """ 47 | return inspect.stack()[1][3] 48 | -------------------------------------------------------------------------------- /crwy/templates/project/logger_py2.conf.tmpl: -------------------------------------------------------------------------------- 1 | #logger.conf 2 | ############################################### 3 | [loggers] 4 | keys=root,fileLogger,rtLogger,timedRtLogger 5 | 6 | [logger_root] 7 | level=INFO 8 | handlers=consoleHandler 9 | 10 | [logger_fileLogger] 11 | handlers=consoleHandler,fileHandler 12 | qualname=fileLogger 13 | propagate=0 14 | 15 | [logger_rtLogger] 16 | 
handlers=consoleHandler,rtHandler 17 | qualname=rtLogger 18 | propagate=0 19 | 20 | [logger_timedRtLogger] 21 | handlers=consoleHandler,timedRtHandler 22 | qualname=timedRtLogger 23 | propagate=0 24 | 25 | ############################################### 26 | [handlers] 27 | keys=consoleHandler,fileHandler,rtHandler,timedRtHandler 28 | 29 | [handler_consoleHandler] 30 | class=StreamHandler 31 | level=DEBUG 32 | formatter=simpleFmt 33 | args=(sys.stderr,) 34 | 35 | [handler_fileHandler] 36 | class=FileHandler 37 | level=DEBUG 38 | formatter=defaultFmt 39 | args=('./log/default.log', 'a') 40 | 41 | [handler_rtHandler] 42 | class=handlers.RotatingFileHandler 43 | level=DEBUG 44 | formatter=defaultFmt 45 | args=('./log/default.log', 'a', 100*1024*1024, 10) 46 | 47 | [handler_timedRtHandler] 48 | class=handlers.TimedRotatingFileHandler 49 | level=DEBUG 50 | formatter=defaultFmt 51 | args=('./log/default.log', 'midnight', 1, 0) 52 | 53 | 54 | ############################################### 55 | 56 | [formatters] 57 | keys=defaultFmt,simpleFmt 58 | 59 | [formatter_defaultFmt] 60 | format=%(asctime)s %(filename)s %(funcName)s %(processName)s %(threadName)s [line:%(lineno)d] %(levelname)s %(message)s 61 | datefmt=%Y-%m-%d %H:%M:%S 62 | 63 | [formatter_simpleFmt] 64 | format=%(asctime)s %(levelname)s %(message)s 65 | datefmt=%Y-%m-%d %H:%M:%S -------------------------------------------------------------------------------- /crwy/templates/project/logger_py3.conf.tmpl: -------------------------------------------------------------------------------- 1 | #logger.conf 2 | ############################################### 3 | [loggers] 4 | keys=root,fileLogger,rtLogger,timedRtLogger 5 | 6 | [logger_root] 7 | level=INFO 8 | handlers=consoleHandler 9 | 10 | [logger_fileLogger] 11 | handlers=consoleHandler,fileHandler 12 | qualname=fileLogger 13 | propagate=0 14 | 15 | [logger_rtLogger] 16 | handlers=consoleHandler,rtHandler 17 | qualname=rtLogger 18 | propagate=0 19 | 20 | [logger_timedRtLogger] 21 | handlers=consoleHandler,timedRtHandler 22 | qualname=timedRtLogger 23 | propagate=0 24 | 25 | ############################################### 26 | [handlers] 27 | keys=consoleHandler,fileHandler,rtHandler,timedRtHandler 28 | 29 | [handler_consoleHandler] 30 | class=StreamHandler 31 | level=DEBUG 32 | formatter=simpleFmt 33 | args=(sys.stderr,) 34 | 35 | [handler_fileHandler] 36 | class=FileHandler 37 | level=DEBUG 38 | formatter=defaultFmt 39 | args=('./log/default.log', 'a', 'utf-8') 40 | 41 | [handler_rtHandler] 42 | class=handlers.RotatingFileHandler 43 | level=DEBUG 44 | formatter=defaultFmt 45 | args=('./log/default.log', 'a', 100*1024*1024, 10, 'utf-8') 46 | 47 | [handler_timedRtHandler] 48 | class=handlers.TimedRotatingFileHandler 49 | level=DEBUG 50 | formatter=defaultFmt 51 | args=('./log/default.log', 'midnight', 1, 0, 'utf-8') 52 | 53 | 54 | ############################################### 55 | 56 | [formatters] 57 | keys=defaultFmt,simpleFmt 58 | 59 | [formatter_defaultFmt] 60 | format=%(asctime)s %(filename)s %(funcName)s %(processName)s %(threadName)s [line:%(lineno)d] %(levelname)s: %(message)s 61 | datefmt=%Y-%m-%d %H:%M:%S 62 | 63 | [formatter_simpleFmt] 64 | format=%(asctime)s %(levelname)s: %(message)s 65 | datefmt=%Y-%m-%d %H:%M:%S -------------------------------------------------------------------------------- /crwy/templates/spiders/crwybasic.tmpl: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from __future__ import 
print_function 4 | import logging 5 | import scrapy 6 | from crwy.spider import BaseSpider 7 | 8 | 9 | class $classname(scrapy.Spider, BaseSpider): 10 | name = '$name' 11 | allowed_domains = ['$domain'] 12 | start_urls = ['http://$domain/'] 13 | 14 | custom_settings = { 15 | 'LOG_LEVEL': logging.INFO, 16 | 'LOG_ENCODING': 'utf-8', 17 | 'LOG_FORMAT': '%(asctime)s %(filename)s %(funcName)s %(processName)s ' 18 | '%(threadName)s [line:%(lineno)d] ' 19 | '%(levelname)s: %(message)s' 20 | } 21 | 22 | def __init__(self, *args, **kwargs): 23 | super($classname, self).__init__(*args, **kwargs) 24 | BaseSpider.__init__(self) 25 | 26 | def parse(self, response): 27 | pass 28 | -------------------------------------------------------------------------------- /crwy/templates/spiders/crwycrawl.tmpl: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from __future__ import print_function 4 | import logging 5 | import scrapy 6 | from scrapy.linkextractors import LinkExtractor 7 | from scrapy.spiders import CrawlSpider, Rule 8 | from crwy.spider import BaseSpider 9 | 10 | 11 | class $classname(CrawlSpider, BaseSpider): 12 | name = '$name' 13 | allowed_domains = ['$domain'] 14 | start_urls = ['http://$domain/'] 15 | 16 | custom_settings = { 17 | 'LOG_LEVEL': logging.INFO, 18 | 'LOG_ENCODING': 'utf-8', 19 | 'LOG_FORMAT': '%(asctime)s %(filename)s %(funcName)s %(processName)s ' 20 | '%(threadName)s [line:%(lineno)d] ' 21 | '%(levelname)s: %(message)s' 22 | } 23 | 24 | rules = ( 25 | Rule(LinkExtractor(allow=r'Items/'), callback='parse_item', follow=True), 26 | ) 27 | 28 | def __init__(self, *args, **kwargs): 29 | super($classname, self).__init__(*args, **kwargs) 30 | BaseSpider.__init__(self) 31 | 32 | def parse_item(self, response): 33 | item = {} 34 | #item['domain_id'] = response.xpath('//input[@id="sid"]/@value').get() 35 | #item['name'] = response.xpath('//div[@id="name"]').get() 36 | #item['description'] = response.xpath('//div[@id="description"]').get() 37 | return item 38 | -------------------------------------------------------------------------------- /crwy/templates/spiders/crwyredis.tmpl: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from __future__ import print_function 4 | import logging 5 | import scrapy 6 | from scrapy_redis.spiders import RedisSpider 7 | from crwy.spider import BaseSpider 8 | 9 | 10 | class $classname(RedisSpider, BaseSpider): 11 | name = '$name' 12 | allowed_domains = ['$domain'] 13 | redis_key = 'crawl_task:$name:start_urls' 14 | 15 | custom_settings = { 16 | 'SPIDER_NAME': '$name', 17 | 'DUPEFILTER_DO_HASH': False, 18 | # 'DUPEFILTER_DELAY_DAY': 2, 19 | 'DUPEFILTER_CLASS': 20 | 'crwy.utils.scrapy_plugs.dupefilters.RedisRFPDupeFilter', 21 | 'REDIS_URL': 'redis://root:password@host:port/db', 22 | 'LOG_LEVEL': logging.INFO, 23 | 'LOG_ENCODING': 'utf-8', 24 | 'LOG_FORMAT': '%(asctime)s %(filename)s %(funcName)s %(processName)s ' 25 | '%(threadName)s [line:%(lineno)d] ' 26 | '%(levelname)s: %(message)s' 27 | } 28 | 29 | def __init__(self, *args, **kwargs): 30 | super($classname, self).__init__(*args, **kwargs) 31 | BaseSpider.__init__(self) 32 | 33 | def parse(self, response): 34 | # use dupefilter_key filter with redis set or sorted set 35 | # 1. add a dupefilter_key, meta['dupefilter_key'] = url.encode('utf-8') 36 | # 2. 
rm a dupefilter_key, release_dupefilter_key.call(spider, request.meta.get('dupefilter_key')) 37 | pass 38 | -------------------------------------------------------------------------------- /crwy/utils/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # author: wuyue92tree@163.com 4 | 5 | -------------------------------------------------------------------------------- /crwy/utils/common.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | """ 4 | @author: wuyue 5 | @contact: wuyue92tree@163.com 6 | @software: PyCharm 7 | @file: common.py 8 | @create at: 2017-12-12 18:01 9 | 10 | 这一行开始写关于本文件的说明与解释 11 | """ 12 | 13 | import os 14 | import re 15 | import datetime 16 | 17 | try: 18 | import ConfigParser as configparser 19 | except ImportError: 20 | import configparser 21 | 22 | __all__ = [ 23 | 'cookie2str', 'cookie2dict', 'config_handle', 24 | 'file_handle', 'datetime2str', 'str2datetime', 25 | 'dict2obj', 'obj2dict', 'remove_emoji', 'change_kv', 26 | 'remove_item_from_dict', 'splice_list' 27 | ] 28 | 29 | 30 | def cookie2str(cookie_dict): 31 | """ 32 | 将requests 字典类型cookie转换成字符串 33 | :param cookie_dict: dict 34 | :return: string 35 | """ 36 | return '; '.join( 37 | [name + '=' + cookie_dict.get(name) for name in cookie_dict]) 38 | 39 | 40 | def cookie2dict(cookie_str): 41 | """ 42 | 将cookie_str转换成requests可用的dict类型 43 | :param cookie_str: string 44 | :return: dict 45 | """ 46 | cookie_dict = dict() 47 | for item in cookie_str.strip().replace(' ', '').split(';'): 48 | if not item: 49 | continue 50 | name, value = item.split('=', 1) 51 | cookie_dict[name] = value 52 | return cookie_dict 53 | 54 | 55 | def datetime2str(target, fmt='%Y-%m-%d %H:%M:%S'): 56 | """ 57 | 将datetime对象转换成字符串 58 | :param target: datetime 59 | :param fmt: string 60 | :return: string 61 | """ 62 | return datetime.datetime.strftime(target, fmt) 63 | 64 | 65 | def str2datetime(target, fmt='%Y-%m-%d %H:%M:%S'): 66 | """ 67 | 将string转换成datetime对象 68 | :param target: string 69 | :param fmt: string 70 | :return: datetime 71 | """ 72 | return datetime.datetime.strptime(target, fmt) 73 | 74 | 75 | def dict2obj(target, change_dict=True): 76 | """ 77 | 将dict转换成obj对象 78 | change_dict 用于控制是否转换target内部dict为obj 79 | 80 | :param target: dict 81 | :param change_dict: bool 82 | :return: obj 83 | """ 84 | 85 | class Obj(object): 86 | def __init__(self, d, change_dict): 87 | for a, b in d.items(): 88 | if change_dict is True: 89 | if isinstance(b, (list, tuple)): 90 | setattr(self, a, 91 | [Obj(x, change_dict) if isinstance(x, dict) else x 92 | for x in b]) 93 | else: 94 | setattr(self, a, Obj(b, change_dict) if isinstance( 95 | b, dict) else b) 96 | else: 97 | setattr(self, a, b) 98 | 99 | return Obj(target, change_dict=change_dict) 100 | 101 | 102 | def obj2dict(target): 103 | """ 104 | 将obj对象转换成dict 105 | :param target: obj 106 | :return: dict 107 | """ 108 | return target.__dict__ 109 | 110 | 111 | def config_handle(path): 112 | """ 113 | 用于对Config配置文件进行操作,初始化config_path 114 | :param path: config文件路径 115 | :return: 返回config对象 116 | """ 117 | config = configparser.ConfigParser() 118 | config.read(path) 119 | return config 120 | 121 | 122 | def file_handle(path, file_name, mode='r'): 123 | """ 124 | 用于对普通文件进行操作 125 | :param path: 文件路径 126 | :param file_name: 文件名称 127 | :param mode: 加载模式,默认'r' 128 | :return: file对象 129 | """ 130 | if path[-1] == '/': 
131 | real_path = path + file_name 132 | else: 133 | real_path = path + '/' + file_name 134 | 135 | if not os.path.exists(path): 136 | os.makedirs(path) 137 | 138 | return open(real_path, mode=mode) 139 | 140 | 141 | def remove_emoji(content): 142 | """ 143 | 表情符去除 144 | :param content: unicode 145 | :return: unicode 146 | """ 147 | pattern = re.compile( 148 | u"(\ud83d[\ude00-\ude4f])|" # emoticons 149 | u"(\ud83c[\udf00-\uffff])|" # symbols & pictographs (1 of 2) 150 | u"(\ud83d[\u0000-\uddff])|" # symbols & pictographs (2 of 2) 151 | u"(\ud83d[\ude80-\udeff])|" # transport & map symbols 152 | u"(\ud83c[\udde0-\uddff])" # flags (iOS) 153 | "+", flags=re.UNICODE) 154 | return pattern.sub(r'', content) 155 | 156 | 157 | def change_kv(dict_ori): 158 | """ 159 | 字典kv调换 160 | :param dict_ori: 原字典 161 | :return: 新字典 162 | """ 163 | return dict(zip(dict_ori.values(), dict_ori.keys())) 164 | 165 | 166 | def remove_item_from_dict(obj, keys_to_remove): 167 | """ 168 | 移除字典中某些item 169 | :param obj: 170 | :param keys_to_remove: 171 | :return: 172 | """ 173 | for key in keys_to_remove: 174 | if obj.get(key, ''): 175 | obj.pop(key) 176 | return obj 177 | 178 | 179 | def splice_list(obj_list, group_number=3): 180 | """ 181 | 分割列表 182 | :param obj_list: 183 | :param group_number: 184 | :return: 185 | """ 186 | if len(obj_list) < group_number: 187 | raise Exception('obj_list length must greater than group_number.') 188 | 189 | distance = int(len(obj_list) / group_number) 190 | new_list = [] 191 | for group in range(group_number): 192 | if group == group_number - 1: 193 | # 若有超出部分并入最后一组 194 | new_list.append(obj_list[distance*group:len(obj_list)]) 195 | else: 196 | new_list.append(obj_list[distance*group:distance*(group+1)]) 197 | return new_list 198 | -------------------------------------------------------------------------------- /crwy/utils/data/RedisHash.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # author: wuyue92tree@163.com 4 | 5 | from crwy.utils.no_sql.redis_m import get_redis_client 6 | 7 | 8 | class RedisHash(object): 9 | """Simple Hash with Redis Backend""" 10 | 11 | def __init__(self, name, server=None, **redis_kwargs): 12 | """ 13 | The default connection parameters are: 14 | host='localhost', port=6379, db=0 15 | """ 16 | if server: 17 | self.__db = server 18 | else: 19 | self.__db = get_redis_client(**redis_kwargs) 20 | self.key = name 21 | 22 | def hget(self, item): 23 | """Get item value.""" 24 | return self.__db.hget(self.key, item) 25 | 26 | def hset(self, item, value): 27 | """Set item value.""" 28 | return self.__db.hset(self.key, item, value) 29 | 30 | def hexists(self, item): 31 | """Is item exist.""" 32 | return self.__db.hexists(self.key, item) 33 | 34 | def hlen(self): 35 | """Return total count.""" 36 | return self.__db.hlen(self.key) 37 | 38 | def hkeys(self): 39 | return self.__db.hkeys(self.key) 40 | 41 | def clean(self): 42 | """Empty key""" 43 | return self.__db.delete(self.key) 44 | 45 | def db(self): 46 | return self.__db 47 | -------------------------------------------------------------------------------- /crwy/utils/data/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wuyue92tree/crwy/da19e3a7615990612e41d0d871345075150b88f6/crwy/utils/data/__init__.py -------------------------------------------------------------------------------- /crwy/utils/extend/__init__.py: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/wuyue92tree/crwy/da19e3a7615990612e41d0d871345075150b88f6/crwy/utils/extend/__init__.py -------------------------------------------------------------------------------- /crwy/utils/extend/chaojiying.py: --------------------------------------------------------------------------------
1 | #! /usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | """
4 | @author: wuyue
5 | @contact: wuyue92tree@163.com
6 | @software: PyCharm
7 | @file: chaojiying.py
8 | @create at: 2018-05-11 16:33
9 | 
10 | Client for the chaojiying.com captcha-recognition service.
11 | """
12 | 
13 | import requests
14 | from hashlib import md5
15 | 
16 | 
17 | class ChaoJiYingApi(object):
18 |     def __init__(self, username, password, soft_id):
19 |         self.username = username
20 |         # md5() requires bytes on Python 3, so encode the password first
21 |         self.password = md5(password.encode('utf-8')).hexdigest()
22 |         self.soft_id = soft_id
23 |         self.base_params = {
24 |             'user': self.username,
25 |             'pass2': self.password,
26 |             'softid': self.soft_id,
27 |         }
28 |         self.headers = {
29 |             'Connection': 'Keep-Alive',
30 |             'User-Agent': 'Mozilla/4.0 (compatible; MSIE 8.0;'
31 |                           ' Windows NT 5.1; Trident/4.0)',
32 |         }
33 | 
34 |     def post_pic(self, im, code_type):
35 |         """
36 |         im: image bytes
37 |         code_type: captcha type, see http://www.chaojiying.com/price.html
38 |         """
39 |         params = {
40 |             'codetype': code_type,
41 |         }
42 |         params.update(self.base_params)
43 |         files = {'userfile': ('ccc.jpg', im)}
44 |         r = requests.post('http://upload.chaojiying.net/Upload/Processing.php',
45 |                           data=params, files=files, headers=self.headers)
46 |         return r.json()
47 | 
48 |     def report_error(self, im_id):
49 |         """
50 |         im_id: picture ID of a misrecognized captcha
51 |         """
52 |         params = {
53 |             'id': im_id,
54 |         }
55 |         params.update(self.base_params)
56 |         r = requests.post(
57 |             'http://upload.chaojiying.net/Upload/ReportError.php', data=params,
58 |             headers=self.headers)
59 |         return r.json()
60 | 
61 |     def decode(self, img_path, code_type):
62 |         with open(img_path, 'rb') as f:
63 |             im = f.read()
64 |         res = self.post_pic(im, code_type)
65 |         # {u'err_str': u'OK', u'err_no': 0,
66 |         #  u'md5': u'a11171f1f444e8d1992926f4ca16c7d8',
67 |         #  u'pic_id': u'6031116291508600001',
68 |         #  u'pic_str': u'113,72|220,81|138,101'}
69 |         if res.get('err_no') == 0 and res.get('err_str') == u'OK':
70 |             return res.get('pic_str')
71 |         return
72 | 
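A minimal usage sketch for ChaoJiYingApi (hypothetical credentials and soft_id; the response fields follow the sample dict shown in decode() above):

    client = ChaoJiYingApi('user', 'password', '96001')
    res = client.post_pic(open('./captcha.jpg', 'rb').read(), 9004)
    if res.get('err_no') == 0:
        print(res.get('pic_str'))               # e.g. '113,72|220,81'
    else:
        client.report_error(res.get('pic_id'))  # flag a bad recognition

-------------------------------------------------------------------------------- /crwy/utils/extend/dingding_robot.py: -------------------------------------------------------------------------------- 1 | #! 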
/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | """
4 | @author: wuyue
5 | @contact: wuyue92tree@163.com
6 | @software: PyCharm
7 | @file: dingding_robot.py
8 | @create at: 2017-10-24 10:57
9 | 
10 | Client for the DingTalk (dingding) group-robot webhook API.
11 | """
12 | 
13 | import json
14 | 
15 | from crwy.spider import BaseSpider
16 | from crwy.exceptions import CrwyExtendException
17 | 
18 | 
19 | class DingDingRobot(BaseSpider):
20 |     def __init__(self, access_token=None,
21 |                  api_url="https://oapi.dingtalk.com/robot/send?access_token="):
22 |         super(DingDingRobot, self).__init__()
23 |         # the token, not the default api_url, is what the caller must supply
24 |         if not access_token:
25 |             raise CrwyExtendException('access_token unset.')
26 |         self.api_url = api_url
27 |         self.header = {'Content-Type': 'application/json'}
28 |         self.access_token = access_token
29 |         self.html_downloader.session.headers = self.header
30 | 
31 |     def send_text(self, content, at_mobiles=list(), is_at_all=False):
32 |         try:
33 |             data = {
34 |                 "text": {
35 |                     "content": content
36 |                 },
37 |                 "msgtype": "text",
38 |                 "at": {
39 |                     "isAtAll": is_at_all,
40 |                     "atMobiles": at_mobiles
41 |                 }
42 |             }
43 | 
44 |             res = self.html_downloader.download(
45 |                 self.api_url + self.access_token,
46 |                 method='POST',
47 |                 data=json.dumps(data))
48 |             return res
49 |         except Exception as e:
50 |             raise CrwyExtendException(e)
51 | 
52 |     def send_markdown(self, title, content, at_mobiles=list(),
53 |                       is_at_all=False):
54 |         try:
55 |             data = {
56 |                 "msgtype": "markdown",
57 |                 "markdown": {
58 |                     "title": title,
59 |                     "text": content
60 |                 },
61 |                 "at": {
62 |                     "atMobiles": at_mobiles,
63 |                     "isAtAll": is_at_all
64 |                 }
65 |             }
66 | 
67 |             res = self.html_downloader.download(
68 |                 self.api_url + self.access_token,
69 |                 method='POST',
70 |                 data=json.dumps(data))
71 |             return res
72 |         except Exception as e:
73 |             raise CrwyExtendException(e)
74 | 
75 |     def send_action_card(self, title, content, hide_avatar="0",
76 |                          btn_orientation="0", single_title="阅读全文",
77 |                          single_url="#"):
78 |         try:
79 |             data = {
80 |                 "actionCard": {
81 |                     "title": title,
82 |                     "text": content,
83 |                     "hideAvatar": hide_avatar,
84 |                     "btnOrientation": btn_orientation,
85 |                     "singleTitle": single_title,
86 |                     "singleURL": single_url
87 |                 },
88 |                 "msgtype": "actionCard"
89 |             }
90 |             res = self.html_downloader.download(
91 |                 self.api_url + self.access_token,
92 |                 method='POST',
93 |                 data=json.dumps(data))
94 |             return res
95 |         except Exception as e:
96 |             raise CrwyExtendException(e)
97 | 
98 |     def send_feed_card(self, links):
99 |         """
100 | 
101 |         :param links: array[{'title':'', 'messageURL':'', 'picURL':''}]
102 |         :return:
103 |         """
104 |         try:
105 |             data = {
106 |                 "feedCard": {
107 |                     "links": links
108 |                 },
109 |                 "msgtype": "feedCard"
110 |             }
111 |             res = self.html_downloader.download(
112 |                 self.api_url + self.access_token,
113 |                 method='POST',
114 |                 data=json.dumps(data))
115 |             return res
116 |         except Exception as e:
117 |             raise CrwyExtendException(e)
118 | 
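A minimal usage sketch (hypothetical webhook token, obtained from the DingTalk group-robot settings):

    robot = DingDingRobot(access_token='your-webhook-token')
    robot.send_text('crawl finished', at_mobiles=['13800000000'])
    robot.send_markdown('report', '# daily crawl\n- ok: 100\n- failed: 2')

-------------------------------------------------------------------------------- /crwy/utils/extend/xunma.py: -------------------------------------------------------------------------------- 1 | #! 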
/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | """ 4 | @author: wuyue 5 | @contact: wuyue92tree@163.com 6 | @software: IntelliJ IDEA 7 | @file: xunma.py 8 | @create at: 2018-09-14 11:41 9 | 10 | 这一行开始写关于本文件的说明与解释 11 | """ 12 | 13 | 14 | from __future__ import print_function, unicode_literals 15 | 16 | from crwy.spider import Spider 17 | from crwy.exceptions import CrwyExtendException 18 | 19 | 20 | class XunMa(Spider): 21 | def __init__(self, username, password, item_id): 22 | super(XunMa, self).__init__() 23 | if username and password and item_id: 24 | self.username = username 25 | self.password = password 26 | self.item_id = item_id 27 | else: 28 | raise CrwyExtendException("[XunMa] params not valid.") 29 | 30 | def login(self): 31 | """ 32 | XunMa 登录 33 | :return: 登录token 34 | """ 35 | try: 36 | url = "http://xapi.xunma.net/Login?uName={username}" \ 37 | "&pWord={password}&Code=UTF8".format(username=self.username, 38 | password=self.password) 39 | res = self.html_downloader.download(url) 40 | 41 | return res.text.strip().split("&")[0] 42 | except Exception as e: 43 | raise CrwyExtendException(e) 44 | 45 | def get_phone(self, token, phone_type='', phone=''): 46 | """ 47 | 获取手机号 48 | :param token: 登录token 49 | :param phone_type: 运营商 1 [移动] 2 [联通] 3 [电信] 50 | :param phone: 指定号码 51 | :return: 手机号码 52 | """ 53 | try: 54 | url = "http://xapi.xunma.net/getPhone?ItemId=" \ 55 | "{item_id}&token={token}&" \ 56 | "PhoneType={phone_type}&Code=UTF8&" \ 57 | "Phone={phone}".format(token=token, item_id=self.item_id, 58 | phone_type=phone_type, phone=phone) 59 | 60 | res = self.html_downloader.download(url) 61 | return res.text.strip().split(';')[0] 62 | 63 | except Exception as e: 64 | raise CrwyExtendException(e) 65 | 66 | def get_message(self, token, phone): 67 | """ 68 | 获取短信消息 69 | :param token: 登录token 70 | :param phone: 手机号 71 | :return: 72 | """ 73 | try: 74 | # http://xapi.xunma.net/getMessage?token=登陆token&itemId=项目ID&phone=手机号码 75 | url = "http://xapi.xunma.net/getMessage?" \ 76 | "token={token}&itemId={item_id}&phone={phone}" \ 77 | "&Code=UTF8".format(token=token, 78 | item_id=self.item_id, phone=phone) 79 | res = self.html_downloader.download(url) 80 | 81 | return res.text.strip().split('&')[-1] 82 | 83 | except Exception as e: 84 | raise CrwyExtendException(e) 85 | 86 | def release_phone(self, token, phone): 87 | try: 88 | # http://xapi.xunma.net/releasePhone?token=登陆token&phoneList=phone-itemId;phone-itemId; 89 | url = "http://xapi.xunma.net/releasePhone?" \ 90 | "token={token}&phoneList={phone};" \ 91 | "&Code=UTF8".format(token=token, phone=phone) 92 | self.html_downloader.download(url) 93 | 94 | except Exception as e: 95 | raise CrwyExtendException(e) 96 | 97 | def add_black(self, token, phone): 98 | try: 99 | url = "http://xapi.xunma.net/addBlack?" \ 100 | "token={token}&phoneList={phone};" \ 101 | "&Code=UTF8".format(token=token, phone=phone) 102 | self.html_downloader.download(url) 103 | 104 | except Exception as e: 105 | raise CrwyExtendException(e) 106 | -------------------------------------------------------------------------------- /crwy/utils/extend/yima.py: -------------------------------------------------------------------------------- 1 | #! 
/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | """ 4 | @author: wuyue 5 | @contact: wuyue92tree@163.com 6 | @software: PyCharm 7 | @file: yima.py 8 | @create at: 2017-10-27 09:57 9 | 10 | 这一行开始写关于本文件的说明与解释 11 | """ 12 | 13 | from __future__ import print_function, unicode_literals 14 | 15 | from crwy.spider import Spider 16 | from crwy.exceptions import CrwyException 17 | 18 | 19 | class YiMa(Spider): 20 | def __init__(self, username, password, item_id): 21 | super(YiMa, self).__init__() 22 | if username and password and item_id: 23 | self.username = username 24 | self.password = password 25 | self.item_id = item_id 26 | else: 27 | raise CrwyException("[YiMa] params not valid.") 28 | 29 | def login(self): 30 | """ 31 | YiMa 登录 32 | :return: 登录token 33 | """ 34 | 35 | try: 36 | url = "http://api.fxhyd.cn/UserInterface.aspx?" \ 37 | "action=login&username={username}" \ 38 | "&password={password}".format(username=self.username, 39 | password=self.password) 40 | res = self.html_downloader.download(url) 41 | 42 | if 'success' not in res.text: 43 | raise CrwyException("[YiMa] Login failed.") 44 | 45 | return res.text.strip().split("|")[-1] 46 | except Exception as e: 47 | raise CrwyException(e) 48 | 49 | def get_phone(self, token, phone_type='', 50 | phone='', not_prefix=''): 51 | """ 52 | 获取手机号 53 | :param token: 登录token 54 | :param phone_type: 运营商 1 [移动] 2 [联通] 3 [电信] 55 | :param phone: 指定号码 56 | :param not_prefix: 不要号段 (例子:notPrefix=170.177 ,代表不获取170和177的号段) 57 | :return: 手机号码 58 | """ 59 | try: 60 | url = "http://api.fxhyd.cn/UserInterface.aspx?action=getmobile&" \ 61 | "token={token}&itemid={item_id}&excludeno=" \ 62 | "{not_prefix}&isp={phone_type}&" \ 63 | "mobile={phone}".format(token=token, item_id=self.item_id, 64 | not_prefix=not_prefix, 65 | phone_type=phone_type, phone=phone) 66 | 67 | res = self.html_downloader.download(url) 68 | if 'success' not in res.text: 69 | raise CrwyException("[YiMa] get phone failed.") 70 | 71 | # print(res.text) 72 | return res.text.strip().split('|')[-1] 73 | 74 | except Exception as e: 75 | raise CrwyException(e) 76 | 77 | def get_message(self, token, phone): 78 | """ 79 | 获取短信消息 80 | :param token: 登录token 81 | :param phone: 手机号 82 | :return: 83 | """ 84 | try: 85 | url = "http://api.fxhyd.cn/UserInterface.aspx?action=getsms&" \ 86 | "token={token}&itemid={item_id}&mobile={phone}" \ 87 | "&release=0".format(token=token, item_id=self.item_id, 88 | phone=phone) 89 | res = self.html_downloader.download(url) 90 | 91 | if 'success' not in res.text: 92 | raise CrwyException("[YiMa] get message failed.") 93 | 94 | else: 95 | return res.text.strip().split('|')[-1] 96 | 97 | except Exception as e: 98 | raise CrwyException(e) 99 | 100 | def release_phone(self, token, phone): 101 | try: 102 | url = "http://api.fxhyd.cn/UserInterface.aspx?action=release&" \ 103 | "token={token}&itemid={item_id}&mobile={phone}" \ 104 | "&release=0".format(token=token, item_id=self.item_id, 105 | phone=phone) 106 | res = self.html_downloader.download(url) 107 | 108 | if 'success' not in res.text: 109 | raise CrwyException("[YiMa] release phone failed.") 110 | 111 | except Exception as e: 112 | raise CrwyException(e) 113 | 114 | def add_black(self, token, phone): 115 | try: 116 | url = "http://api.fxhyd.cn/UserInterface.aspx?action=addignore&" \ 117 | "token={token}&itemid={item_id}&mobile={phone}" \ 118 | "&release=0".format(token=token, item_id=self.item_id, 119 | phone=phone) 120 | res = self.html_downloader.download(url) 121 | 122 | if 'success' not in res.text: 123 | raise 
CrwyException("[YiMa] black phone failed.") 124 | 125 | except Exception as e: 126 | raise CrwyException(e) 127 | -------------------------------------------------------------------------------- /crwy/utils/filter/RedisSet.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # author: wuyue92tree@163.com 4 | 5 | from crwy.utils.no_sql.redis_m import get_redis_client 6 | 7 | 8 | class RedisSet(object): 9 | """Simple Deduplicate with Redis Backend""" 10 | 11 | def __init__(self, name, namespace='deduplicate', server=None, 12 | **redis_kwargs): 13 | """ 14 | The default connection parameters are: 15 | host='localhost', port=6379, db=0 16 | """ 17 | if server: 18 | self.__db = server 19 | else: 20 | self.__db = get_redis_client(**redis_kwargs) 21 | self.key = '%s:%s' % (namespace, name) 22 | 23 | def sadd(self, item): 24 | """Add item.""" 25 | if self.__db.sadd(self.key, item) == 0: 26 | return False 27 | else: 28 | return True 29 | 30 | def srem(self, item): 31 | """Del item.""" 32 | if self.__db.srem(self.key, item) == 0: 33 | return False 34 | else: 35 | return True 36 | 37 | def scard(self): 38 | """Return total count.""" 39 | return self.__db.scard(self.key) 40 | 41 | def smembers(self): 42 | """Return all item.""" 43 | return self.__db.smembers(self.key) 44 | 45 | def clean(self): 46 | """Empty key""" 47 | return self.__db.delete(self.key) 48 | 49 | def db(self): 50 | return self.__db 51 | -------------------------------------------------------------------------------- /crwy/utils/filter/RedisSortedSet.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # author: wuyue92tree@163.com 4 | 5 | from crwy.utils.no_sql.redis_m import get_redis_client 6 | 7 | 8 | class RedisSortedSet(object): 9 | """Simple Sorted Deduplicate with Redis Backend""" 10 | 11 | def __init__(self, name, namespace='deduplicate_sorted', server=None, 12 | **redis_kwargs): 13 | """ 14 | The default connection parameters are: 15 | host='localhost', port=6379, db=0 16 | """ 17 | if server: 18 | self.__db = server 19 | else: 20 | self.__db = get_redis_client(**redis_kwargs) 21 | self.key = '%s:%s' % (namespace, name) 22 | 23 | def zadd(self, score, item): 24 | """Add item.""" 25 | if self.__db.zadd(self.key, score, item) == 0: 26 | return False 27 | else: 28 | return True 29 | 30 | def zrem(self, item): 31 | """Del item.""" 32 | if self.__db.zrem(self.key, item) == 0: 33 | return False 34 | else: 35 | return True 36 | 37 | def zcard(self): 38 | """Return total count.""" 39 | return self.__db.zcard(self.key) 40 | 41 | def zscore(self, item): 42 | """Return item score.""" 43 | return self.__db.zscore(self.key, item) 44 | 45 | def zmembers(self): 46 | """Return all item.""" 47 | return self.__db.zmembers(self.key) 48 | 49 | def clean(self): 50 | """Empty key""" 51 | return self.__db.delete(self.key) 52 | 53 | def db(self): 54 | return self.__db 55 | -------------------------------------------------------------------------------- /crwy/utils/filter/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wuyue92tree/crwy/da19e3a7615990612e41d0d871345075150b88f6/crwy/utils/filter/__init__.py -------------------------------------------------------------------------------- /crwy/utils/html/__init__.py: -------------------------------------------------------------------------------- 1 
| 2 | -------------------------------------------------------------------------------- /crwy/utils/html/font_analysis.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | """ 4 | @author: wuyue 5 | @contact: wuyue92tree@163.com 6 | @software: IntelliJ IDEA 7 | @file: font_analysis.py 8 | @create at: 2018-08-22 19:42 9 | 10 | 这一行开始写关于本文件的说明与解释 11 | 12 | 本工具类适用于58同城,其他站点可将该类作为基础类进行扩展 13 | 14 | 思路: 15 | 16 | 1. 获取web页面内容; 17 | 2. 获取字体文件; 18 | 3. 获取字体xml文件,解析出经过自定义的文字; 19 | 4. 获取文字value的hash值(经测试发现,文字的key、value对应关系每次请求都是变化的,而不可能 20 | 改变的是字的value,所以这里通过文字value的hash值来确定,是哪一个字,反推确定页面上的字符对应的 21 | 字是什么。) 22 | 23 | like: 24 | 25 | '77880914931fb6dda97269a9156404745f609d35': '黄' 26 | 27 | hash值与文字对应mapping需要人工,通过字体软件对应 推荐 fontforge 28 | 29 | 5. 通过人工确认的mapping,找到页面上字符与真实字体的对应关系; 30 | 6. 替换原始页面中的字符 31 | 32 | """ 33 | 34 | import base64 35 | import hashlib 36 | # import json 37 | import os 38 | import re 39 | import uuid 40 | from fontTools.ttLib import TTFont 41 | from crwy.spider import BaseSpider 42 | 43 | 44 | class FontAnalysis(BaseSpider): 45 | def __init__(self, html=None): 46 | super(FontAnalysis, self).__init__() 47 | uid = str(uuid.uuid1()) 48 | self.font_path = './data/font/font-{}.woff'.format(uid) 49 | self.xml_path = './data/xml/font-{}.xml'.format(uid) 50 | self.html = html if html else self.get_test_html() 51 | 52 | def get_test_html(self): 53 | # 58简历页 54 | url = 'https://bj.58.com/qzyewu/pn2/?PGTID=0d303353-0000-1188-7c8a-829b2b71d0e8&ClickID=2' 55 | 56 | res = self.html_downloader.download(url) 57 | 58 | return res.text 59 | 60 | def save_font(self): 61 | """ 62 | 保存字体 63 | :return: 64 | """ 65 | base64string = re.search('(?<=base64,).*?(?=\))', self.html).group() 66 | bin_data = base64.b64decode(base64string) 67 | with open(self.font_path, 'wb') as f: 68 | f.write(bin_data) 69 | 70 | def get_font_xml(self): 71 | """ 72 | 获取字体 xml 73 | :return: 74 | """ 75 | font = TTFont(self.font_path) 76 | font.saveXML(self.xml_path) 77 | 78 | def analysis(self, is_clean=True, debug=False): 79 | """ 80 | 解析xml,获取web页面字符与文字key及文字value hash值的对应关系 81 | :param is_clean: 是否清楚字体文件及字体xml文件 82 | :param debug: 是否终端输出字体对照关系 83 | :return: 84 | """ 85 | 86 | self.save_font() 87 | self.get_font_xml() 88 | with open(self.xml_path, 'rb') as xml: 89 | soups = self.html_parser.parser(xml.read()) 90 | ttglyph_lst = soups.find('glyf').find_all('ttglyph')[1:] 91 | map_lst = soups.find('cmap').find_all('map') 92 | map_dict = {} 93 | for map in map_lst: 94 | map_dict[map.get('name')] = map.get('code') 95 | # print(map_dict, len(map_dict)) 96 | analysis_res = [] 97 | for ttglyph in ttglyph_lst: 98 | analysis_dict = dict() 99 | analysis_dict['ttglyph_name'] = ttglyph.get('name') 100 | # analysis_dict['html_name'] = '&#x{};'.format( 101 | # analysis_dict['ttglyph_name'][3:].lower()) 102 | # x_distance = str(int(ttglyph.get('xmax')) - int(ttglyph.get('xmin'))) 103 | # y_distance = str(int(ttglyph.get('ymax')) - int(ttglyph.get('ymin'))) 104 | analysis_dict['html_name'] = '&#x{};'.format( 105 | map_dict.get(ttglyph.get('name'))[2:].upper()) 106 | ttglyph_value = [] 107 | contour_lst = ttglyph.find_all('contour') 108 | for contour in contour_lst: 109 | pt_lst = contour.find_all('pt') 110 | for pt in pt_lst[:1]: 111 | tmp = str(int(pt.get('x')) - int(pt.get('y'))) 112 | # pt['y'] = str(int(y_distance) - int(pt.get('y'))) 113 | ttglyph_value.append(tmp) 114 | analysis_dict['ttglyph_value'] = str(sorted(ttglyph_value)) 115 | 
analysis_dict['font_hash'] = hashlib.sha1( 116 | analysis_dict['ttglyph_value'].encode('utf-8') 117 | ).hexdigest() 118 | analysis_res.append(analysis_dict) 119 | 120 | if debug: 121 | print(analysis_dict['ttglyph_name'], # 字体key 122 | analysis_dict['html_name'], # web页面显示值 123 | analysis_dict['font_hash'], # 字体内容哈希值 124 | analysis_dict['ttglyph_value']) 125 | 126 | if is_clean: 127 | os.remove(self.font_path) 128 | os.remove(self.xml_path) 129 | 130 | return analysis_res 131 | 132 | @staticmethod 133 | def get_real_font_mapping(analysis_res, font_mapping, debug=False): 134 | real_font_mapping = dict() 135 | for item in analysis_res: 136 | real_font_mapping[item['html_name']] = font_mapping[ 137 | item['font_hash']] 138 | if debug is True: 139 | print(item['html_name'], font_mapping[item['font_hash']]) 140 | 141 | return real_font_mapping 142 | 143 | @staticmethod 144 | def recover_html(html, real_font_mapping): 145 | for k, v in real_font_mapping.items(): 146 | html = html.replace(k, v) 147 | return html 148 | 149 | 150 | font_mapping = { 151 | '7fd63556d48347cd5a50007b3151e2735f93bed2': '', 152 | 'ed465eefa32423091781b4cf7136d16d3ebce463': '技', 153 | 'f9e740d4af46806fd75ab69783555c87f6ec7706': '6', 154 | '50365252b61dbe2651e0c83bebc8d00ef763a158': '经', 155 | 'ecc7ed15aa268e5a699eb8ddbe73ad2b27911ee1': '王', 156 | '891090dd6e752593d367a61dc3891f1cb110f0dc': '应', 157 | '6aea2037fa2d83b11da6dcc837443c7b2a9be22e': '专', 158 | '7138fba5e0f9093c696c20ac994857385f257c1e': '赵', 159 | '2609770c8afd922eb37758dd8828db1b566c7fd6': '李', 160 | '02c3341d2d8085eded8233ece3f54b6540322eac': '以', 161 | '75859289bdf9b78f1842ed692daf445f403e2b88': '吴', 162 | 'f9dcd3c88958fc85f6fbf770532929d3b3891a53': '女', 163 | 'e83dc230a3f59d5361bbcdc82973529c6fcbf443': '杨', 164 | '8371e4d560301720541aa2b18c92d7624ff11082': '7', 165 | '19abe86dc73d03989b6e2c9ba3e86d05f187a3c6': '5', 166 | 'fdb060ded208610d1923ff00a5cee237a021be83': '张', 167 | 'b032add2a6287c5d6ab051bef17e37c45d714f40': 'B', 168 | '5b262e3ff34a8ec29d9b5b271e0de2396de260ce': '本', 169 | '512adeb01f06bca832fd2a6dea974f09029edeeb': '男', 170 | '6f15d67b48b66a140ca1858aae74897c4a5a79f6': '博', 171 | 'b2f5589957afac462f1620e6647e7c33f05dce96': '3', 172 | 'e24ed92db7331e919161e6d2961d35ccaae0593a': '无', 173 | '3cd650e51bbe8268ffb4ec2ab9537937eddac0dd': '9', 174 | 'e1eb5abd0f77e1c2627a50eda8a4765c4be7a606': '生', 175 | '04783616ad232d7ad4d886876f4eaaf5a0bbb580': '验', 176 | '374446ca738266d7c1da2a551f15c54cdd12e460': '8', 177 | 'e2af36b0e0add44124e65c78e0bb388a91da5373': '下', 178 | '679f2fd459c646e6e1668938a57f7c1a248f806d': '科', 179 | '6924fddd64c781721e91d3797742761616b58532': '1', 180 | 'c81c0fe74c783fdc90d8ff7bcb801ac73646f5bf': '4', 181 | '07aa1ed8e3c9b283fd20a9942fd81d925fb49de2': 'M', 182 | '968f86f893ff4e01575e3acc1e61c940b424d479': '中', 183 | '7b9ef322a4ea16ddd46ce3afbf9ae7265b638947': '硕', 184 | '5ac2c4226b898a1cb133c1cd012506399c291253': '届', 185 | 'e51700563d6f6c7c82def8c408b972ae28870a7e': '2', 186 | '44b8aeb98a4556a2d9e0121848d1ce5ddd1cf820': '刘', 187 | 'b1df1706be1e25ce632329d0967bc4d87de7a0ce': '士', 188 | 'b412e5268df2f25d418a5547c902e0794ed33a7c': '陈', 189 | 'c0fdac37111eee42dde8f1481eddf803c1c6eafd': '高', 190 | '859502b5860c99ec88061eb60dfc0c2af03e1778': '大', 191 | 'afd6a61616dbb4215b9327194e177e6caa71ace9': '0', 192 | 'f4d6418264b302bc9d70693b5cc3d4a7da01a445': 'E', 193 | 'b74117c62c7ff7ae79a1d01dfe0baa21874d3ae8': '周', 194 | '9372bfdc75a272eceb0f4fbf8bd5c86bc21b4b1d': 'A', 195 | '9e2c166141627880782747ba69b9972885ca798a': '校', 196 | 
'f38c4fe982d55a9bcb4460b82db43f995bc5a992': '黄' 197 | } 198 | 199 | # def main(): 200 | # runner = FontAnalysis() 201 | # analysis_res = runner.analysis(debug=True) 202 | # real_font_mapping = runner.get_real_font_mapping(analysis_res, font_mapping) 203 | # print(real_font_mapping) 204 | # 205 | # real_html = runner.recover_html(html=runner.html, 206 | # real_font_mapping=real_font_mapping) 207 | # print(real_html) 208 | # 209 | # 210 | # if __name__ == '__main__': 211 | # main() 212 | -------------------------------------------------------------------------------- /crwy/utils/html/html_downloader.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # author: wuyue92tree@163.com 4 | 5 | import requests 6 | 7 | 8 | class HtmlDownloader(object): 9 | """ 下载器 """ 10 | 11 | def __init__(self): 12 | self.session = requests.session() 13 | 14 | def download(self, url, method='GET', single=False, timeout=60, 15 | **kwargs): 16 | """ 17 | 请求页面 18 | :param url: 目标地址 19 | :param method: 请求方式 20 | :param single: 为True时,不使用session 21 | :param timeout: 初始化超时时间 22 | :param kwargs: 更多requests参数 23 | :return: 返回requests session对象 24 | """ 25 | 26 | if single is False: 27 | if method.upper() == 'GET': 28 | return self.session.get(url, timeout=timeout, **kwargs) 29 | return self.session.post(url, timeout=timeout, **kwargs) 30 | else: 31 | if method.upper() == 'GET': 32 | return requests.get(url, timeout=timeout, **kwargs) 33 | return requests.post(url, timeout=timeout, **kwargs) 34 | 35 | def download_file(self, url, method='GET', single=False, timeout=180, 36 | save_path='./data/', file_name=None, **kwargs): 37 | """ 38 | 请求文件 39 | :param url: 目标地址 40 | :param method: 请求方式 41 | :param single: 为True时,不使用session 42 | :param timeout: 初始化超时时间 43 | :param save_path: 保存路径 44 | :param file_name: 文件名称,默认为空 45 | :param kwargs: 更多requests参数 46 | :return: 返回保存路径 47 | """ 48 | if not file_name: 49 | file_name = url.split('/')[-1] 50 | tmp = self.download(url, method=method, single=single, 51 | timeout=timeout, 52 | stream=True, **kwargs) 53 | with open(save_path + file_name, 'wb') as f: 54 | for chunk in tmp.iter_content(chunk_size=1024): 55 | if chunk: # filter out keep-alive new chunks 56 | f.write(chunk) 57 | f.flush() 58 | f.close() 59 | return save_path + file_name 60 | -------------------------------------------------------------------------------- /crwy/utils/html/html_parser.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import sys 4 | from bs4 import BeautifulSoup 5 | try: 6 | import PyV8 7 | except ImportError: 8 | pass 9 | 10 | 11 | class HtmlParser(object): 12 | """ 解析器 """ 13 | @staticmethod 14 | def parser(response): 15 | """ 16 | utf-8字符处理 17 | :param response: 待处理字符串 18 | :return: 返回bs对象 19 | """ 20 | if response is None: 21 | return 22 | 23 | if sys.version_info < (3, ): 24 | soup = BeautifulSoup(str(response), 'html.parser', 25 | from_encoding='utf-8') 26 | else: 27 | soup = BeautifulSoup(str(response), 'html.parser') 28 | 29 | return soup 30 | 31 | @staticmethod 32 | def gbk_parser(response): 33 | """ 34 | gbk字符处理 35 | :param response: 待处理字符串 36 | :return: 返回bs对象 37 | """ 38 | if response is None: 39 | return 40 | 41 | if sys.version_info < (3, ): 42 | soup = BeautifulSoup(str(response), 'html.parser', 43 | from_encoding='gb18030') 44 | else: 45 | soup = BeautifulSoup(str(response), 'html.parser') 46 | 47 | return soup 48 | 49 | 
@staticmethod
50 |     def jsonp_parser(data):
51 |         """
52 |         Normalize non-standard JSON such as {a:1, b:1},
53 |         where the keys are not quoted strings.
54 |         :param data: string to normalize
55 |         :return: standard JSON string
56 |         """
57 |         # NOTE: requires PyV8; the import at the top of this module is
58 |         # wrapped in try/except, so this raises NameError if PyV8 is missing
59 |         ctx = PyV8.JSContext()
60 |         ctx.enter()
61 |         ctx.eval("""
62 |         function func() {
63 |             var data = """ + data + """;
64 |             var json_data = JSON.stringify(data);
65 |             return json_data;
66 |         }
67 |         """)
68 |         return ctx.locals.func()
69 | 
-------------------------------------------------------------------------------- /crwy/utils/load_settings.py: --------------------------------------------------------------------------------
1 | #! /usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | """
4 | @author: wuyue
5 | @contact: wuyue92tree@163.com
6 | @software: PyCharm
7 | @file: load_settings.py
8 | @create at: 2018-06-20 19:32
9 | 
10 | Load crawler settings from a Consul KV store.
11 | """
12 | 
13 | import consul
14 | from crwy.exceptions import CrwyException
15 | 
16 | 
17 | class LoadSettingsFromConsul(object):
18 |     def __init__(self, **kwargs):
19 |         self.c = consul.Consul(**kwargs)
20 |         self.main_key = None
21 | 
22 |     def init_main_key(self, key=None):
23 |         if not key:
24 |             raise CrwyException('Please set key first.')
25 |         self.main_key = key
26 | 
27 |     def _get_settings(self, key=None):
28 |         self.init_main_key(key=key)
29 |         index, data = self.c.kv.get(self.main_key, recurse=True)
30 |         if not data:
31 |             raise CrwyException('Please make sure the key: <%s> exists.' %
32 |                                 self.main_key)
33 | 
34 |         # NOTE: values are eval()'d as Python literals, so the KV store
35 |         # contents must be trusted
36 |         new_data = {
37 |             item.get('Key').split('/')[-1]: eval(item.get('Value'))
38 |             for item in data
39 |         }
40 | 
41 |         return new_data
42 | 
43 |     @classmethod
44 |     def get_settings(cls, key=None, **kwargs):
45 |         load_settings = cls(**kwargs)
46 |         return load_settings._get_settings(key=key)
47 | 
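A minimal usage sketch (assumes a reachable Consul agent and values stored under the key as Python literals, since they are eval()'d; host/port are forwarded to consul.Consul()):

    settings = LoadSettingsFromConsul.get_settings(
        key='crawler/settings', host='127.0.0.1', port=8500)
    print(settings.get('REDIS_URL'))

-------------------------------------------------------------------------------- /crwy/utils/logger.py: -------------------------------------------------------------------------------- 1 | #! 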
/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # Author: wuyue 4 | # Email: wuyue92tree@163.com 5 | 6 | 7 | import os 8 | import sys 9 | import logging 10 | import logging.config 11 | import logging.handlers 12 | from crwy.exceptions import CrwyException 13 | from crwy.settings.default_settings import TEMPLATE_DIR 14 | 15 | try: 16 | import ConfigParser as configparser 17 | except ImportError: 18 | import configparser 19 | 20 | DEFAULT_LOGGER_CONF = './conf/logger.conf' 21 | 22 | if sys.version_info[0] == 2: 23 | BASE_LOGGER_CONF = os.path.join( 24 | TEMPLATE_DIR, 'project/logger_py2.conf.tmpl') 25 | else: 26 | BASE_LOGGER_CONF = os.path.join( 27 | TEMPLATE_DIR, 'project/logger_py3.conf.tmpl') 28 | 29 | try: 30 | try: 31 | logging.config.fileConfig(DEFAULT_LOGGER_CONF) 32 | except KeyError: 33 | logging.config.fileConfig(BASE_LOGGER_CONF) 34 | except: 35 | pass 36 | 37 | 38 | def _install_handlers_custom(cp, formatters, log_path): 39 | """Install and return handlers""" 40 | hlist = cp.get("handlers", "keys") 41 | if not len(hlist): 42 | return {} 43 | hlist = hlist.split(",") 44 | hlist = logging.config._strip_spaces(hlist) 45 | handlers = {} 46 | fixups = [] # for inter-handler references 47 | for hand in hlist: 48 | sectname = "handler_%s" % hand 49 | klass = cp.get(sectname, "class") 50 | opts = cp.options(sectname) 51 | if "formatter" in opts: 52 | fmt = cp.get(sectname, "formatter") 53 | else: 54 | fmt = "" 55 | try: 56 | klass = eval(klass, vars(logging)) 57 | except (AttributeError, NameError): 58 | klass = logging.config._resolve(klass) 59 | args = cp.get(sectname, "args") 60 | args = eval(args, vars(logging)) 61 | 62 | # 修改args中的path参数 63 | if isinstance(args[0], str): 64 | args = tuple([log_path] + list(args)[1:]) 65 | 66 | h = klass(*args) 67 | 68 | if "level" in opts: 69 | level = cp.get(sectname, "level") 70 | try: 71 | h.setLevel(logging._levelNames[level]) 72 | except AttributeError: 73 | h.setLevel(logging._nameToLevel[level]) 74 | if len(fmt): 75 | h.setFormatter(formatters[fmt]) 76 | if issubclass(klass, logging.handlers.MemoryHandler): 77 | if "target" in opts: 78 | target = cp.get(sectname, "target") 79 | else: 80 | target = "" 81 | if len(target): 82 | fixups.append((h, target)) 83 | handlers[hand] = h 84 | 85 | for h, t in fixups: 86 | h.setTarget(handlers[t]) 87 | return handlers 88 | 89 | 90 | def fileConfigWithLogPath(fname=BASE_LOGGER_CONF, 91 | log_path=None, 92 | defaults=None, 93 | disable_existing_loggers=True): 94 | """ 95 | 通过拦截重写handler的方式传入log_path,实现日志位置修改 96 | """ 97 | if not log_path: 98 | raise CrwyException('Please setup first!') 99 | 100 | cp = configparser.ConfigParser(defaults) 101 | if hasattr(fname, 'readline'): 102 | cp.read_file(fname) 103 | else: 104 | cp.read(fname) 105 | try: 106 | formatters = logging.config._create_formatters(cp) 107 | except configparser.NoSectionError: 108 | raise CrwyException('Please make sure fname: "%s" is exist.' 
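# (Editor's note: _install_handlers_custom() above mirrors the stdlib
# logging.config._install_handlers(), with one deliberate change --
# when a handler's first positional arg is a string it is replaced by
# log_path, so a single conf template can write per-project log files;
# fileConfigWithLogPath() is the entry point that passes log_path in.)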
% fname) 109 | 110 | logging._acquireLock() 111 | try: 112 | logging._handlers.clear() 113 | del logging._handlerList[:] 114 | # Handlers add themselves to logging._handlers 115 | handlers = _install_handlers_custom(cp, formatters, log_path) 116 | logging.config._install_loggers(cp, handlers, disable_existing_loggers) 117 | finally: 118 | logging._releaseLock() 119 | 120 | 121 | class Logger(object): 122 | @staticmethod 123 | def file_logger(): 124 | return logging.getLogger('fileLogger') 125 | 126 | @staticmethod 127 | def rt_logger(): 128 | return logging.getLogger('rtLogger') 129 | 130 | @staticmethod 131 | def timed_rt_logger(): 132 | return logging.getLogger('timedRtLogger') 133 | 134 | @staticmethod 135 | def extra_logger(name=None): 136 | return logging.getLogger(name) 137 | 138 | -------------------------------------------------------------------------------- /crwy/utils/mail.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # Author: wuyue 4 | # Email: wuyue92tree@163.com 5 | 6 | 7 | from __future__ import print_function, unicode_literals 8 | 9 | import email 10 | import re 11 | import traceback 12 | import imaplib 13 | 14 | from imapclient import IMAPClient 15 | from email.header import decode_header 16 | 17 | imaplib._MAXLINE = 10000000 18 | 19 | SEEN = br'\Seen' 20 | 21 | 22 | class MailReceiver(IMAPClient): 23 | def __init__(self, host, timeout=60, **kwargs): 24 | super(MailReceiver, self).__init__(host, timeout=timeout, **kwargs) 25 | 26 | def get_folder_list(self): 27 | """ 28 | 获取邮箱文件夹 29 | :return: list 30 | """ 31 | folders = self.list_folders() 32 | res_list = [] 33 | for folder in folders: 34 | if folder: 35 | # print(folder[2].encode("utf-8")) 36 | res_list.append(folder[2]) 37 | return res_list 38 | 39 | def get_message_id_list(self, mailbox='INBOX', search_='all'): 40 | """ 41 | 获取邮件ID列表 42 | :param mailbox: 邮箱文件夹 43 | :param search_: 搜索规则 44 | :return: list 45 | """ 46 | self.select_folder(mailbox) 47 | # message_list = self.server.search('ON 21-Dec-2017') 48 | message_list = self.search(search_) 49 | return message_list 50 | 51 | def get_message_list(self, message_id_list): 52 | """ 53 | 获取邮件列表 54 | :param message_id_list: 邮件ID列表 55 | :return: dict id:email 56 | """ 57 | message_list = self.fetch( 58 | message_id_list, ['INTERNALDATE', 'FLAGS', 'BODY.PEEK[]']) 59 | if not message_list: 60 | return 61 | 62 | return message_list 63 | 64 | @staticmethod 65 | def parse_email(m, flag=None): 66 | """ 67 | 解析邮件header内容 68 | :param m: 原内容 69 | :param flag: 解析类型标识 70 | :return: 编码转换后内容 71 | """ 72 | res = [] 73 | try: 74 | for s, c in decode_header(m): 75 | if c: 76 | res.append(s.decode(c, 'ignore')) 77 | else: 78 | res.append(s.decode('utf-8') if isinstance(s, bytes) else s) 79 | 80 | if not res: 81 | return 82 | 83 | # 处理邮件发送方 返回邮箱地址 84 | if flag == 'from': 85 | res = re.findall( 86 | '[0-9a-zA-Z_\.]{0,19}@[0-9a-zA-Z\.]{1,100}', res[1]) 87 | return res[0] 88 | 89 | # 处理邮件接收方 返回邮箱地址列表 90 | if flag == 'to': 91 | new_res = [] 92 | for e in res[0].split(','): 93 | em = re.findall('[0-9a-zA-Z_\.]{0,19}@[0-9a-zA-Z\.]{1,100}', e) 94 | if em: 95 | new_res.append(em[0]) 96 | 97 | return new_res 98 | 99 | return res[0] 100 | except Exception as e: 101 | traceback.print_exc() 102 | return res 103 | 104 | def get_message_content(self, message): 105 | """ 106 | 获取邮件内容 107 | :param message: 108 | :return: 109 | """ 110 | try: 111 | while True: 112 | res = {} 113 | msg = 
email.message_from_bytes(message[b'BODY[]'])
114 |                 res['subject'] = self.parse_email(msg['Subject'])
115 |                 res['from'] = self.parse_email(msg['From'], flag='from')
116 |                 res['to'] = self.parse_email(msg['To'], flag='to')
117 |                 res['date'] = self.parse_email(msg['Date'])
118 | 
119 |                 for par in msg.walk():
120 |                     if not par.is_multipart():
121 |                         name = par.get_param("name")
122 |                         if name:
123 |                             # print(name)
124 |                             pass
125 |                         else:
126 |                             body = par.get_payload(decode=True)
127 |                             if not body:
128 |                                 continue
129 |                             try:
130 |                                 code = par.get_content_charset()
131 |                                 res['body'] = body.decode(code, 'ignore')
132 |                             except TypeError:
133 |                                 res['body'] = body
134 |                 return res
135 | 
136 |         except Exception as e:
137 |             traceback.print_exc()
138 |             return
139 | 
140 |     def delete_message(self, messages, deleted_folder="Deleted Messages"):
141 |         """
142 |         Delete messages.
143 |         :param messages:
144 |         :param deleted_folder:
145 |         :return:
146 |         """
147 |         try:
148 |             self.add_flags(messages, SEEN)
149 |             if deleted_folder:
150 |                 # move the messages into the deleted folder first
151 |                 self.copy(messages, deleted_folder)
152 |             self.delete_messages(messages)
153 |             self.expunge()
154 |             return True
155 |         except Exception:
156 |             traceback.print_exc()
157 |             return False
--------------------------------------------------------------------------------
/crwy/utils/no_sql/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wuyue92tree/crwy/da19e3a7615990612e41d0d871345075150b88f6/crwy/utils/no_sql/__init__.py
--------------------------------------------------------------------------------
/crwy/utils/no_sql/redis_m.py:
--------------------------------------------------------------------------------
1 | #! /usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | """
4 | @author: wuyue
5 | @contact: wuyue92tree@163.com
6 | @software: PyCharm
7 | @file: redis_m.py
8 | @create at: 2017-12-26 14:50
9 | 
10 | Singleton wrapper around a pooled redis connection.
11 | """
12 | 
13 | from crwy.exceptions import CrwyImportException
14 | from crwy.decorates import cls2singleton
15 | 
16 | try:
17 |     import redis
18 | except ImportError:
19 |     raise CrwyImportException(
20 |         "You should install redis plugin first! try: pip install redis")
21 | 
22 | 
23 | @cls2singleton
24 | class RedisDb(object):
25 |     def __init__(self, **kwargs):
26 |         if 'url' in kwargs.keys():
27 |             url = kwargs.pop('url')
28 |             self.pool = redis.ConnectionPool.from_url(url, **kwargs)
29 |         else:
30 |             self.pool = redis.ConnectionPool(**kwargs)
31 |         self.db = redis.StrictRedis(connection_pool=self.pool)
32 | 
33 | 
34 | def get_redis_client(**kwargs):
35 |     r = RedisDb(**kwargs)
36 |     return r.db
--------------------------------------------------------------------------------
/crwy/utils/pyppeteer_api.py:
--------------------------------------------------------------------------------
1 | #!
/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | """ 4 | @author: wuyue 5 | @contact: wuyue92tree@163.com 6 | @software: IntelliJ IDEA 7 | @file: pyppeteer_api.py 8 | @create at: 2019-03-24 17:04 9 | 10 | 这一行开始写关于本文件的说明与解释 11 | """ 12 | 13 | import asyncio 14 | from crwy.spider import Spider 15 | 16 | try: 17 | from pyppeteer import launch 18 | except ImportError: 19 | pass 20 | 21 | 22 | class PyppeteerApi(Spider): 23 | def __init__(self, logger=None, proxy=None, **kwargs): 24 | super(PyppeteerApi, self).__init__(logger=logger) 25 | 26 | 27 | def main(): 28 | executor = PyppeteerApi() 29 | # TODO 30 | 31 | 32 | if __name__ == '__main__': 33 | main() 34 | -------------------------------------------------------------------------------- /crwy/utils/queue/RedisQueue.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # author: wuyue92tree@163.com 4 | 5 | from crwy.utils.no_sql.redis_m import get_redis_client 6 | 7 | 8 | class RedisQueue(object): 9 | """Simple Queue with Redis Backend""" 10 | 11 | def __init__(self, name, namespace='queue', server=None, **redis_kwargs): 12 | """The default connection parameters are: 13 | host='localhost', port=6379, db=0""" 14 | if server: 15 | self.__db = server 16 | else: 17 | self.__db = get_redis_client(**redis_kwargs) 18 | self.key = '%s:%s' % (namespace, name) 19 | 20 | def qsize(self): 21 | """Return the approximate size of the queue.""" 22 | return self.__db.llen(self.key) 23 | 24 | def empty(self): 25 | """Return True if the queue is empty, False otherwise.""" 26 | return self.qsize() == 0 27 | 28 | def put(self, item): 29 | """Put item into the queue.""" 30 | self.__db.rpush(self.key, item) 31 | 32 | def get(self, block=True, timeout=None): 33 | """Remove and return an item from the queue. 34 | 35 | If optional args block is true and timeout is None (the default), block 36 | if necessary until an item is available.""" 37 | if block: 38 | item = self.__db.blpop(self.key, timeout=timeout) 39 | else: 40 | item = self.__db.lpop(self.key) 41 | 42 | if item: 43 | item = item[1] 44 | return item 45 | 46 | def get_nowait(self): 47 | """Equivalent to get(False).""" 48 | return self.get(False) 49 | 50 | def clean(self): 51 | """Empty key""" 52 | return self.__db.delete(self.key) 53 | 54 | def db(self): 55 | return self.__db 56 | -------------------------------------------------------------------------------- /crwy/utils/queue/SsdbQueue.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # author: wuyue92tree@163.com 4 | 5 | import pyssdb 6 | 7 | 8 | class SsdbQueue(object): 9 | """Simple Queue with SSDB Backend""" 10 | 11 | def __init__(self, name, **ssdb_kwargs): 12 | """The default connection parameters are: 13 | host='localhost', port=8888""" 14 | self.__db = pyssdb.Client(**ssdb_kwargs) 15 | self.key = name 16 | 17 | def qsize(self): 18 | """Return the approximate size of the queue.""" 19 | return self.__db.qsize(self.key) 20 | 21 | def empty(self): 22 | """Return True if the queue is empty, False otherwise.""" 23 | return self.qsize() == 0 24 | 25 | def put(self, item): 26 | """Put item into the queue.""" 27 | self.__db.qpush(self.key, item) 28 | 29 | def get(self): 30 | """Remove and return an item from the queue. 
31 | 32 | If optional args block is true and timeout is None (the default), block 33 | if necessary until an item is available.""" 34 | 35 | item = self.__db.qpop(self.key) 36 | 37 | return item 38 | 39 | def clean(self): 40 | """Empty key""" 41 | return self.__db.qclear(self.key) 42 | 43 | def db(self): 44 | return self.__db 45 | -------------------------------------------------------------------------------- /crwy/utils/queue/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wuyue92tree/crwy/da19e3a7615990612e41d0d871345075150b88f6/crwy/utils/queue/__init__.py -------------------------------------------------------------------------------- /crwy/utils/scrapy_plugs/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wuyue92tree/crwy/da19e3a7615990612e41d0d871345075150b88f6/crwy/utils/scrapy_plugs/__init__.py -------------------------------------------------------------------------------- /crwy/utils/scrapy_plugs/dupefilters.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | """ 4 | @author: wuyue 5 | @contact: wuyue92tree@163.com 6 | @software: PyCharm 7 | @file: dupefilters.py 8 | @create at: 2018-06-14 14:52 9 | 10 | 这一行开始写关于本文件的说明与解释 11 | """ 12 | 13 | import logging 14 | import time 15 | import datetime 16 | import hashlib 17 | from crwy.utils.filter.RedisSet import RedisSet 18 | from crwy.utils.filter.RedisSortedSet import RedisSortedSet 19 | from scrapy.dupefilters import BaseDupeFilter 20 | from scrapy.exceptions import NotConfigured 21 | from scrapy_redis.connection import get_redis_from_settings 22 | 23 | logger = logging.getLogger(__name__) 24 | 25 | 26 | class RedisRFPDupeFilter(BaseDupeFilter): 27 | """ 28 | dupefilter by redis, redis connect base on scrapy-redis connect 29 | 30 | warning: 31 | config SPIDER_NAME in settings before use 32 | default: 33 | DUPEFILTER_DEBUG = False 34 | DUPEFILTER_DELAY_DAY = 0 35 | """ 36 | logger = logger 37 | 38 | def __init__(self, debug=False, 39 | server=None, 40 | bot_name=None, 41 | spider_name=None, 42 | duperliter_delay_day=None, 43 | do_hash=None): 44 | self.debug = debug 45 | self.logdupes = True 46 | self.server = server 47 | self.bot_name = bot_name 48 | self.spider_name = spider_name 49 | self.duperliter_delay_day = duperliter_delay_day 50 | self.do_hash = do_hash 51 | 52 | @classmethod 53 | def from_settings(cls, settings): 54 | server = get_redis_from_settings(settings) 55 | debug = settings.getbool('DUPEFILTER_DEBUG') 56 | bot_name = settings.get('BOT_NAME') 57 | spider_name = settings.get('SPIDER_NAME') 58 | duperliter_delay_day = settings.getint('DUPEFILTER_DELAY_DAY', 0) 59 | do_hash = settings.getbool('DUPEFILTER_DO_HASH', True) 60 | if not spider_name: 61 | raise NotConfigured('%s - "SPIDER_NAME" is not found.' 
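# (Editor's note: from_settings() reads BOT_NAME, SPIDER_NAME,
# DUPEFILTER_DEBUG, DUPEFILTER_DELAY_DAY (0 means a seen key is
# filtered forever) and DUPEFILTER_DO_HASH; SPIDER_NAME is the only
# one with no usable default, hence this NotConfigured.)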
% 62 | cls.__name__) 63 | return cls(debug=debug, server=server, bot_name=bot_name, 64 | spider_name=spider_name, 65 | duperliter_delay_day=duperliter_delay_day, 66 | do_hash=do_hash) 67 | 68 | def request_seen(self, request): 69 | if not request.meta.get('dupefilter_key', None): 70 | return False 71 | 72 | if len(request.meta.get('redirect_urls', [])) > 0: 73 | # skip url from redirect 74 | return False 75 | 76 | dupefilter_key = request.meta.get('dupefilter_key') 77 | dupefilter_key = hashlib.sha1(dupefilter_key).hexdigest() if \ 78 | self.do_hash else dupefilter_key 79 | 80 | # SPIDER_NAME for dupefilter 81 | key = '{bot_name}:{spider_name}'.format( 82 | bot_name=self.bot_name, 83 | spider_name=self.spider_name) 84 | 85 | if request.meta.get('duperliter_delay_day', ''): 86 | self.duperliter_delay_day = int(request.meta.get( 87 | 'duperliter_delay_day')) 88 | 89 | if self.duperliter_delay_day == 0: 90 | s = RedisSet(key, server=self.server) 91 | if s.sadd(dupefilter_key) is True: 92 | return False 93 | self.logger.info('Filtered dupefilter_key: %s' % 94 | dupefilter_key) 95 | return True 96 | else: 97 | z = RedisSortedSet(key, server=self.server) 98 | now = time.time() 99 | last_time = z.zscore(dupefilter_key) 100 | 101 | if not last_time: 102 | z.zadd(now, dupefilter_key) 103 | return False 104 | 105 | if (datetime.datetime.utcfromtimestamp(now) - 106 | datetime.datetime.utcfromtimestamp(last_time)).days >= \ 107 | self.duperliter_delay_day: 108 | z.zadd(now, dupefilter_key) 109 | return False 110 | self.logger.info('Filtered dupefilter_key within %s day(s): %s' % 111 | (self.duperliter_delay_day, 112 | request.meta.get('dupefilter_key'))) 113 | return True 114 | 115 | def log(self, request, spider): # log that a request has been filtered 116 | if self.debug: 117 | msg = "Filtered duplicate request: %(request)s" 118 | self.logger.debug(msg, { 119 | 'request': request.meta.get('dupefilter_key')}, extra={ 120 | 'spider': spider}) 121 | elif self.logdupes: 122 | msg = ("Filtered duplicate request: %(request)s" 123 | " - no more duplicates will be shown" 124 | " (see DUPEFILTER_DEBUG to show all duplicates)") 125 | self.logger.debug(msg, {'request': request}, 126 | extra={'spider': spider}) 127 | self.logdupes = False 128 | 129 | spider.crawler.stats.inc_value('dupefilter/filtered', spider=spider) 130 | 131 | 132 | class ReleaseDupefilterKey(object): 133 | """ 134 | rm dupefilter_key from redis, when call response 135 | """ 136 | 137 | def call(self, spider, dupefilter_key): 138 | 139 | if not dupefilter_key: 140 | return 141 | 142 | obj = RedisRFPDupeFilter().from_settings(spider.settings) 143 | 144 | dupefilter_key = hashlib.sha1(dupefilter_key).hexdigest() if \ 145 | obj.do_hash else dupefilter_key 146 | 147 | # SPIDER_NAME for dupefilter 148 | key = '{bot_name}:{spider_name}'.format( 149 | bot_name=obj.bot_name, 150 | spider_name=obj.spider_name) 151 | 152 | if obj.duperliter_delay_day == 0: 153 | s = RedisSet(key, server=obj.server) 154 | s.srem(dupefilter_key) 155 | else: 156 | z = RedisSortedSet(key, server=obj.server) 157 | z.zrem(dupefilter_key) 158 | obj.logger.info('dupefilter_key: {} released.'.format( 159 | dupefilter_key)) 160 | 161 | 162 | release_dupefilter_key = ReleaseDupefilterKey() 163 | -------------------------------------------------------------------------------- /crwy/utils/scrapy_plugs/middlewares.py: -------------------------------------------------------------------------------- 1 | #! 
/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | """ 4 | @author: wuyue 5 | @contact: wuyue92tree@163.com 6 | @software: PyCharm 7 | @file: middlewares.py 8 | @create at: 2018-06-26 18:21 9 | 10 | 这一行开始写关于本文件的说明与解释 11 | 12 | 通过redis hash表记录站点cookie 13 | 14 | key为 cookie_pool:SITE (SITE需要在settings中指定) 15 | field为 账号cookie的唯一标识,可以是username,id等,具体自行约定 16 | value为 cookie值,必须为json格式 17 | 18 | """ 19 | import datetime 20 | import random 21 | from scrapy.downloadermiddlewares.retry import RetryMiddleware 22 | from scrapy.exceptions import IgnoreRequest 23 | from scrapy_redis.connection import get_redis_from_settings 24 | 25 | from crwy.utils.common import datetime2str 26 | from crwy.utils.data.RedisHash import RedisHash 27 | from crwy.exceptions import CrwyScrapyPlugsException, CrwyCookieValidException 28 | from crwy.utils.scrapy_plugs.dupefilters import release_dupefilter_key 29 | 30 | 31 | class CookieMiddleware(RetryMiddleware): 32 | """ 33 | cookie_pool 34 | eg: '{"a": 1, "b": "aaa"}' 35 | """ 36 | 37 | def __init__(self, settings): 38 | super(CookieMiddleware, self).__init__(settings) 39 | self.site = settings.get('SITE', None) 40 | if not self.site: 41 | raise CrwyScrapyPlugsException('SITE_NOT_SET') 42 | 43 | self.server = get_redis_from_settings(settings) 44 | 45 | self.h = RedisHash( 46 | 'cookie_pool:{}'.format(self.site), 47 | server=self.server 48 | ) 49 | 50 | def process_request(self, request, spider): 51 | """ 52 | cookie_user不为空时,获取cookie_user对应的cookie 53 | keep_cookie_user为True时,将设置cookie_user,并传递到子请求 54 | :param request: 55 | :param spider: 56 | :return: 57 | """ 58 | if request.meta.get('cookie_user', ''): 59 | user = request.meta.get('cookie_user') 60 | else: 61 | users = self.h.hkeys() 62 | if len(users) > 0: 63 | user = random.choice(users) 64 | if request.meta.get('keep_cookie_user', False) is True: 65 | request.meta['cookie_user'] = user 66 | else: 67 | raise CrwyScrapyPlugsException( 68 | 'no user in cookie_pool:{}'.format(self.site)) 69 | cookie = self.h.hget(user) 70 | if cookie: 71 | # 字典存入redis,取出时未string,通过eval进行还原 72 | request.cookies = eval(cookie) 73 | spider.logger.debug('get_cookie_success: {}'.format( 74 | user.decode('utf-8'))) 75 | else: 76 | spider.logger.warning('get_cookie_failed: {}'.format( 77 | user.decode('utf-8'))) 78 | 79 | 80 | class LimitCookieMiddleware(CookieMiddleware): 81 | """ 82 | 在cookieMiddleware基础上限制账号 83 | 84 | 1. 每日搜索上限 85 | 通过 update_requests_count method 更新有效请求次数 86 | 2. 
cookie失效标识 87 | 捕捉 CrwyCookieValidException 异常更改标识状态 88 | """ 89 | def __init__(self, settings): 90 | super(LimitCookieMiddleware, self).__init__(settings) 91 | 92 | # 每日搜索上限 93 | self.search_limit = RedisHash( 94 | 'search_limit:{}'.format(self.site), server=self.server) 95 | # cookie失效标识, 1为cookie失效 96 | self.account_limit = RedisHash( 97 | 'account_limit:{}'.format(self.site), server=self.server) 98 | 99 | def get_requests_count(self, request): 100 | user = request.meta.get('cookie_user') 101 | today = datetime2str(datetime.datetime.now(), fmt='%Y-%m-%d') 102 | if not self.search_limit.hget(user): 103 | count = 1 104 | else: 105 | date, count = self.search_limit.hget( 106 | user).decode('utf-8').split('|') 107 | if date == today: 108 | count = int(count) 109 | else: 110 | count = 1 111 | return user, count 112 | 113 | def update_requests_count(self, request, spider): 114 | """ 115 | 请求完毕后添加详情页计数 116 | :param request: 117 | :param spider: 118 | :return: 119 | """ 120 | user, count = self.get_requests_count(request) 121 | today = datetime2str(datetime.datetime.now(), fmt='%Y-%m-%d') 122 | count += 1 123 | self.search_limit.hset(user, '{}|{}'.format(today, count)) 124 | spider.logger.debug('update search_limit: {} {}'.format( 125 | user.decode('utf-8'), count)) 126 | 127 | def _retry(self, request, reason, spider): 128 | callback = super(LimitCookieMiddleware, self)._retry( 129 | request, reason, spider 130 | ) 131 | if not callback: 132 | if isinstance(reason, CrwyCookieValidException): 133 | user = request.meta.get('cookie_user') 134 | self.account_limit.hset(user, 1) 135 | spider.logger.warning('account limit: {} after retry'.format( 136 | user.decode('utf-8'))) 137 | raise IgnoreRequest 138 | else: 139 | return callback 140 | 141 | def process_request(self, request, spider): 142 | super(LimitCookieMiddleware, self).process_request(request, spider) 143 | 144 | user, count = self.get_requests_count(request) 145 | 146 | dupefilter_key = request.meta.get('dupefilter_key') 147 | 148 | # 判断account_limit, 若为1则表示账号禁用中 149 | if self.account_limit.hget(user) == b'1': 150 | spider.logger.warning( 151 | 'account_limit: {}'.format(user.decode('utf-8'))) 152 | release_dupefilter_key.call(spider, dupefilter_key) 153 | raise IgnoreRequest 154 | 155 | # 判断是否为受保护搜索账号 156 | if user.decode('utf-8') in spider.custom_settings.get( 157 | 'SAFE_SEARCH_ACCOUNT'): 158 | if count >= spider.custom_settings.get('SAFE_SEARCH_LIMIT'): 159 | spider.logger.warning( 160 | '{} safe_search_limit: {}'.format( 161 | user.decode('utf-8'), count)) 162 | release_dupefilter_key.call(spider, dupefilter_key) 163 | raise IgnoreRequest 164 | 165 | # 判断search_limit,若大于上限则跳过 166 | if count >= spider.custom_settings.get('SEARCH_LIMIT'): 167 | spider.logger.warning( 168 | '{} search_limit: {}'.format(user.decode('utf-8'), count)) 169 | release_dupefilter_key.call(spider, dupefilter_key) 170 | raise IgnoreRequest 171 | 172 | if not request.cookies: 173 | spider.logger.warning('cookie is empty: {}'.format( 174 | user.decode('utf-8'))) 175 | release_dupefilter_key.call(spider, dupefilter_key) 176 | raise IgnoreRequest 177 | -------------------------------------------------------------------------------- /crwy/utils/scrapy_plugs/pipelines.py: -------------------------------------------------------------------------------- 1 | #! 
/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | """ 4 | @author: wuyue 5 | @contact: wuyue92tree@163.com 6 | @software: PyCharm 7 | @file: pipelines.py 8 | @create at: 2018-06-15 15:26 9 | 10 | 这一行开始写关于本文件的说明与解释 11 | """ 12 | 13 | import logging 14 | from pymysql.cursors import DictCursor 15 | from sqlalchemy.ext.compiler import compiles 16 | from sqlalchemy.sql.expression import Insert 17 | from crwy.utils.sql.mysql import MysqlHandle 18 | from crwy.utils.sql.sqlalchemy_m import SqlalchemyHandle 19 | from crwy.exceptions import CrwyScrapyPlugsException 20 | 21 | 22 | class MysqlSavePipeline(object): 23 | def __init__(self, db_name=None, db_host=None, db_port=None, 24 | db_username=None, db_password=None, db_charset=None, 25 | db_cursorclass=None): 26 | self.db_name = db_name 27 | self.db_host = db_host 28 | self.db_port = db_port 29 | self.db_username = db_username 30 | self.db_password = db_password 31 | self.db_charset = db_charset 32 | self.db_cursorclass = db_cursorclass 33 | self.logger = logging.getLogger(__name__) 34 | self.mysql_handle = None 35 | 36 | @classmethod 37 | def from_crawler(cls, crawler): 38 | """ 39 | loading mysql settings 40 | :param crawler: 41 | :return: 42 | """ 43 | settings = crawler.settings 44 | db_name = settings.get('MYSQL_DB_NAME', '') 45 | db_host = settings.get('MYSQL_DB_HOST', '127.0.0.1') 46 | db_port = settings.getint('MYSQL_DB_PORT', 3306) 47 | db_username = settings.get('MYSQL_DB_USERNAME', 'root') 48 | db_password = settings.get('MYSQL_DB_PASSWORD', '123456') 49 | db_charset = settings.get('MYSQL_DB_CHARSET', 'utf8') 50 | db_cursorclass = settings.get('MYSQL_DB_CURSORCLASS', DictCursor) 51 | return cls(db_name=db_name, db_host=db_host, db_port=db_port, 52 | db_username=db_username, 53 | db_password=db_password, 54 | db_charset=db_charset, 55 | db_cursorclass=db_cursorclass) 56 | 57 | def process_item(self, item, spider): 58 | self.insert_db(item) 59 | return item 60 | 61 | def open_spider(self, spider): 62 | self.mysql_handle = MysqlHandle( 63 | host=self.db_host, 64 | port=self.db_port, 65 | user=self.db_username, 66 | password=self.db_password, 67 | db=self.db_name, 68 | charset=self.db_charset, 69 | cursorclass=self.db_cursorclass 70 | ) 71 | 72 | def insert_db(self, item): 73 | """ 74 | ----------------------------------- 75 | Do something here with mysql_handle 76 | ----------------------------------- 77 | 78 | eg: 79 | sql = None 80 | data = None 81 | last_insert_id = self.mysql_handle.save( 82 | sql, data, get_last_insert_id=True) 83 | self.logger.info('item saved succcess to mysql: %s' % last_insert_id) 84 | """ 85 | pass 86 | 87 | 88 | @compiles(Insert) 89 | def append_string(insert, compiler, **kw): 90 | s = compiler.visit_insert(insert, **kw) 91 | if 'append_string' in insert.kwargs: 92 | return s + " " + insert.kwargs['append_string'] 93 | return s 94 | 95 | 96 | class SqlalchemySavePipeline(object): 97 | def __init__(self, db_url, echo=True): 98 | self.db_url = db_url 99 | self.echo = echo 100 | self.sqlalchemy_handle = None 101 | self.logger = logging.getLogger(__name__) 102 | 103 | @classmethod 104 | def from_crawler(cls, crawler): 105 | """ 106 | loading sqlalchemy settings 107 | :param crawler: 108 | :return: 109 | """ 110 | settings = crawler.settings 111 | db_url = settings.get('SQLALCHEMY_URI') 112 | echo = settings.getbool('SQLALCHEMY_ECHO') 113 | if not db_url: 114 | raise CrwyScrapyPlugsException('SQLALCHEMY_URI must be setup.') 115 | return cls(db_url, echo) 116 | 117 | def process_item(self, item, spider): 118 | 
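# (Editor's note: every scraped item is funneled through insert_db();
# subclass SqlalchemySavePipeline and override insert_db() with your
# own table logic -- its docstring below sketches an example.)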
self.insert_db(item) 119 | return item 120 | 121 | def open_spider(self, spider): 122 | self.sqlalchemy_handle = SqlalchemyHandle( 123 | db_url=self.db_url, echo=self.echo) 124 | self.sqlalchemy_handle.init_table() 125 | 126 | def insert_db(self, item): 127 | """ 128 | ----------------------------------- 129 | Do something here with sqlalchemy_handle 130 | ----------------------------------- 131 | 132 | eg: 133 | self.sqlalchemy_handle.session.execute( 134 | Test.__table__.insert(), item 135 | ) 136 | self.sqlalchemy_handle.session.commit() 137 | self.logger.info('sqlachemy inserted success.') 138 | """ 139 | pass 140 | 141 | def close_spider(self, spider): 142 | self.sqlalchemy_handle.session.close() 143 | -------------------------------------------------------------------------------- /crwy/utils/scrapy_plugs/settings.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | """ 4 | @author: wuyue 5 | @contact: wuyue92tree@163.com 6 | @software: PyCharm 7 | @file: settings.py 8 | @create at: 2018-06-20 19:33 9 | 10 | 这一行开始写关于本文件的说明与解释 11 | """ 12 | 13 | from crwy.utils.load_settings import LoadSettingsFromConsul 14 | from crwy.exceptions import CrwyException 15 | 16 | 17 | class ScrapySettingsFromConsul(LoadSettingsFromConsul): 18 | def __init__(self, spider_name, bot_name, prefix='scrapy', **kwargs): 19 | super(ScrapySettingsFromConsul, self).__init__(**kwargs) 20 | self.spider_name = spider_name 21 | self.bot_name = bot_name 22 | self.prefix = prefix 23 | 24 | def init_main_key(self, key=None): 25 | if not key: 26 | self.main_key = '{prefix}/{bot_name}/{spider_name}'.format( 27 | prefix=self.prefix, bot_name=self.bot_name, 28 | spider_name=self.spider_name 29 | ) 30 | else: 31 | self.main_key = key 32 | 33 | def _get_settings(self, key=None): 34 | self.init_main_key(key=key) 35 | index, data = self.c.kv.get(self.main_key, recurse=True) 36 | if not data: 37 | raise CrwyException('Please make sure the key: <%s> is exist.' % 38 | self.main_key) 39 | 40 | new_data = { 41 | item.get('Key').split('/')[-1]: eval(item.get('Value')) 42 | for item in data 43 | } 44 | new_data['SPIDER_NAME'] = self.spider_name 45 | 46 | return new_data 47 | 48 | @classmethod 49 | def get_settings(cls, spider_name, bot_name, key=None, prefix='scrapy', 50 | **kwargs): 51 | load_settings = cls(spider_name, bot_name, prefix=prefix, **kwargs) 52 | return load_settings._get_settings(key=key) 53 | -------------------------------------------------------------------------------- /crwy/utils/selenium_api.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | """ 4 | @author: wuyue 5 | @contact: wuyue92tree@163.com 6 | @software: IntelliJ IDEA 7 | @file: selenium_api.py 8 | @create at: 2018-10-15 11:51 9 | 10 | 这一行开始写关于本文件的说明与解释 11 | """ 12 | 13 | import os 14 | import re 15 | import time 16 | import uuid 17 | 18 | from PIL import Image 19 | from crwy.spider import Spider 20 | from crwy.exceptions import CrwyImportException 21 | 22 | try: 23 | from selenium import webdriver 24 | except ImportError: 25 | raise CrwyImportException( 26 | "You should install selenium first! 
suggestion: pip install " 27 | "selenium==3.6.0") 28 | from selenium.common.exceptions import TimeoutException, NoSuchElementException 29 | from selenium.webdriver import DesiredCapabilities, ActionChains 30 | from selenium.webdriver.support import expected_conditions as EC 31 | from selenium.webdriver.support.wait import WebDriverWait 32 | 33 | 34 | class SeleniumApi(Spider): 35 | def __init__(self, driver_type='chrome', 36 | hub_url='http://127.0.0.1:4444/wd/hub', 37 | proxy=None, user_agent=None, use_hub=True, 38 | logger=None): 39 | super(SeleniumApi, self).__init__(logger=logger) 40 | """ 41 | :param driver_type: driver类型 42 | :param hub_url: hub server地址 43 | :param proxy: 代理地址 44 | :param user_agent: user_agent 45 | :param use_hub: 是否启用hub,为False时使用本地driver 46 | """ 47 | self.driver_type = driver_type 48 | self.hub_url = hub_url 49 | self.proxy = proxy 50 | self.use_hub = use_hub 51 | self.user_agent = user_agent 52 | # device_pixel_ratio 用于处理高分屏dpi抠图 53 | self.device_pixel_ratio = 1 54 | self.driver = self.init_driver() 55 | self.driver.set_window_size(1280, 960) 56 | 57 | def _init_chrome_driver(self): 58 | chrome_options = webdriver.ChromeOptions() 59 | if self.proxy: 60 | chrome_options.add_argument('--proxy-server=%s' % self.proxy) 61 | if self.user_agent: 62 | chrome_options.add_argument('--user-agent=%s' % self.user_agent) 63 | desired_capabilities = chrome_options.to_capabilities() 64 | if self.use_hub: 65 | driver = webdriver.Remote( 66 | command_executor=self.hub_url, 67 | desired_capabilities=desired_capabilities 68 | ) 69 | else: 70 | driver = webdriver.Chrome( 71 | chrome_options=chrome_options 72 | ) 73 | return driver 74 | 75 | def _init_firefox_driver(self): 76 | firefox_profile = webdriver.FirefoxProfile() 77 | if self.proxy: 78 | ip, port = self.proxy.split(':') 79 | firefox_profile.set_preference('network.proxy.type', 1) 80 | firefox_profile.set_preference('network.proxy.http', ip) 81 | firefox_profile.set_preference('network.proxy.http_port', int(port)) 82 | firefox_profile.set_preference('network.proxy.ssl', ip) 83 | firefox_profile.set_preference('network.proxy.ssl_port', int(port)) 84 | if self.user_agent: 85 | firefox_profile.set_preference( 86 | 'general.useragent.override', self.user_agent) 87 | firefox_profile.update_preferences() 88 | desired_capabilities = DesiredCapabilities.FIREFOX 89 | if self.use_hub: 90 | driver = webdriver.Remote( 91 | command_executor=self.hub_url, 92 | desired_capabilities=desired_capabilities, 93 | browser_profile=firefox_profile 94 | ) 95 | else: 96 | driver = webdriver.Firefox( 97 | firefox_profile=firefox_profile, 98 | capabilities=desired_capabilities 99 | ) 100 | return driver 101 | 102 | def init_driver(self): 103 | if self.driver_type.upper() == 'CHROME': 104 | return self._init_chrome_driver() 105 | elif self.driver_type.upper() == 'FIREFOX': 106 | return self._init_firefox_driver() 107 | raise Exception('No supported driver: %s' % self.driver_type) 108 | 109 | @staticmethod 110 | def is_similar(image1, image2, x, y, distance=25): 111 | """ 112 | 对比RGB值 113 | :param image1: 待对比的图片1 114 | :param image2: 待对比的图片2 115 | :param x: x坐标 116 | :param y: y坐标 117 | :param distance: 色差 118 | :return: 119 | """ 120 | # 获取指定位置的RGB值 121 | pixel1 = image1.getpixel((x, y)) 122 | pixel2 = image2.getpixel((x, y)) 123 | for i in range(0, 3): 124 | # 如果相差超过50则就认为找到了缺口的位置 125 | # print(x, y, pixel1, pixel2) 126 | if abs(pixel1[i] - pixel2[i]) >= distance: 127 | return False 128 | return True 129 | 130 | def get_diff_location(self, image1, 
image2): 131 | """ 132 | 计算缺口的位置 133 | :param image1: 134 | :param image2: 135 | :return: 136 | """ 137 | i = 0 138 | # 两张原始图的大小都是相同的260*160 139 | # 那就通过两个for循环依次对比每个像素点的RGB值 140 | # 如果相差超过50则就认为找到了缺口的位置 141 | for i in range(0, image1.width): 142 | for j in range(0, image1.height): 143 | if self.is_similar(image1, image2, i, j) is False: 144 | return i 145 | return i 146 | 147 | def get_img(self, screenshot, xpath): 148 | """ 149 | 获取验证码图片 150 | 151 | :param screenshot: 页面截图 152 | :param xpath: 验证码图片xpath 153 | :return: img对象 154 | """ 155 | self.device_pixel_ratio = self.driver.execute_script( 156 | "return window.devicePixelRatio;") 157 | element = self.driver.find_element_by_xpath(xpath) 158 | left = int(element.location['x']) * self.device_pixel_ratio 159 | top = int(element.location['y']) * self.device_pixel_ratio 160 | right = int(element.location['x'] + 161 | element.size['width']) * self.device_pixel_ratio 162 | bottom = int(element.location['y'] + 163 | element.size['height']) * self.device_pixel_ratio 164 | img = Image.open(screenshot) 165 | img = img.crop((left, top, right, bottom)) 166 | return img 167 | 168 | def click_img(self, answer, height, identify_img_xpath1=None, 169 | identify_button_xpath=None): 170 | """ 171 | 根据打码返回的坐标进行点击操作 172 | 仅适用与点击型验证码 173 | 174 | :param answer: 打码返回结果 175 | :param height: 答案高度 176 | :param identify_img_xpath1: 题目xpath 177 | :param identify_button_xpath: 验证按钮 178 | :return: 179 | """ 180 | actions = ActionChains(self.driver) 181 | img = self.driver.find_element_by_xpath(identify_img_xpath1) 182 | points = answer.split('|') 183 | for point in points: 184 | x, y = eval(point) 185 | actions.move_to_element_with_offset( 186 | img, x, y - int((height / self.device_pixel_ratio))) 187 | actions.click() 188 | actions.perform() 189 | time.sleep(2) 190 | if not identify_button_xpath: 191 | return 192 | self.driver.find_element_by_xpath(identify_button_xpath).click() 193 | 194 | def deal_normal_verification_code( 195 | self, captcha_obj, captcha_code, identify_img_xpath): 196 | uuid_str = str(uuid.uuid1()) 197 | screenshot_path = './data/img/screenshot_%s.png' % uuid_str 198 | check_img_path = './data/img/check_image-%s.png' % uuid_str 199 | self.driver.save_screenshot(screenshot_path) 200 | img = self.get_img(screenshot_path, xpath=identify_img_xpath) 201 | img.save(check_img_path) 202 | answer = captcha_obj.decode(check_img_path, captcha_code) 203 | self.logger.info('get normal captcha code : %s' % answer) 204 | os.remove(screenshot_path) 205 | os.remove(check_img_path) 206 | return answer 207 | 208 | def deal_click_verification_code( 209 | self, captcha_obj, captcha_code, 210 | identify_img_xpath, identify_img_xpath1, 211 | identify_button_xpath): 212 | """ 213 | 处理点击型验证码 214 | 215 | :param captcha_obj: 216 | :param captcha_code: 217 | :param identify_img_xpath: 218 | :param identify_img_xpath1: 219 | :param identify_button_xpath: 220 | :return: 221 | """ 222 | uuid_str = str(uuid.uuid1()) 223 | screenshot_path = './data/img/screenshot_%s.png' % uuid_str 224 | check_img_path = './data/img/check_image-%s.png' % uuid_str 225 | self.driver.save_screenshot(screenshot_path) 226 | img1 = self.get_img(screenshot_path, xpath=identify_img_xpath) 227 | img2 = self.get_img(screenshot_path, xpath=identify_img_xpath1) 228 | to_image = Image.new('RGBA', (img2.width, img1.height + img2.height)) 229 | to_image.paste(img1, (0, 0)) 230 | to_image.paste(img2, (0, img1.height)) 231 | if self.device_pixel_ratio > 1: 232 | to_image = to_image.resize( 233 | 
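# (Editor's note: on HiDPI displays the screenshot is
# device_pixel_ratio times larger than the CSS-pixel geometry used
# for clicking, so the stitched captcha image is scaled back down
# here; click_img() above divides click offsets by the same ratio.)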
(int(to_image.width / self.device_pixel_ratio), 234 | int(to_image.height / self.device_pixel_ratio)) 235 | ) 236 | to_image.save(check_img_path) 237 | answer = captcha_obj.decode(check_img_path, captcha_code) 238 | self.logger.info('get click captcha code : %s' % answer) 239 | self.click_img(answer, img1.height, 240 | identify_img_xpath1=identify_img_xpath1, 241 | identify_button_xpath=identify_button_xpath) 242 | os.remove(screenshot_path) 243 | os.remove(check_img_path) 244 | 245 | def get_mobile_code(self, phone_obj, phone, phone_token, 246 | check_str='智联招聘', regexp='\d+', 247 | retry_times=20, sleep_time=5): 248 | while retry_times > 0: 249 | msg = phone_obj.get_message(token=phone_token, phone=phone) 250 | if check_str in msg: 251 | code = re.findall(regexp, msg)[0] 252 | self.logger.info( 253 | '{}: get mobile code success. code is: {}'.format( 254 | phone, code)) 255 | return code 256 | 257 | self.logger.info( 258 | '{}: no more message received. sleep {}s'.format( 259 | phone, sleep_time)) 260 | time.sleep(sleep_time) 261 | retry_times -= 1 262 | 263 | def is_element_visible(self, element): 264 | """ 265 | 判断元素是否存在 266 | :param element: 267 | :return: 268 | """ 269 | driver = self.driver 270 | try: 271 | the_element = EC.visibility_of_element_located(element) 272 | assert the_element(driver) 273 | flag = True 274 | except (AssertionError, NoSuchElementException): 275 | self.logger.warning('the element is not visible.') 276 | flag = False 277 | except Exception as e: 278 | self.logger.exception(e) 279 | flag = False 280 | return flag 281 | 282 | def wait_element(self, by, by_value, timeout=5): 283 | try: 284 | WebDriverWait( 285 | self.driver, timeout).until( 286 | EC.presence_of_element_located((by, by_value))) 287 | except TimeoutException as e: 288 | self.logger.exception(e) 289 | 290 | @staticmethod 291 | def cookies2dict(cookies): 292 | """ 293 | trans cookies 294 | :param cookies: driver.get_cookies() 295 | :return: 296 | """ 297 | cookie_dict = {} 298 | for item in cookies: 299 | cookie_dict[item['name']] = item['value'] 300 | return cookie_dict 301 | 302 | def release(self): 303 | try: 304 | self.driver.quit() 305 | self.driver.close() 306 | except: 307 | pass 308 | -------------------------------------------------------------------------------- /crwy/utils/sql/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wuyue92tree/crwy/da19e3a7615990612e41d0d871345075150b88f6/crwy/utils/sql/__init__.py -------------------------------------------------------------------------------- /crwy/utils/sql/mysql.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | """ 4 | @author: wuyue 5 | @contact: wuyue92tree@163.com 6 | @software: PyCharm 7 | @file: mysql.py 8 | @create at: 2017-12-15 14:20 9 | 10 | 这一行开始写关于本文件的说明与解释 11 | """ 12 | 13 | from crwy.exceptions import CrwyImportException, CrwyDbException 14 | from crwy.decorates import cls2singleton 15 | 16 | try: 17 | import pymysql 18 | except ImportError: 19 | raise CrwyImportException( 20 | "You should install pymysql first! try: pip install " 21 | "pymysql") 22 | try: 23 | from dbutils.persistent_db import PersistentDB 24 | except ImportError: 25 | raise CrwyImportException( 26 | "You should install DBUtils first! 
try: pip install " 27 | "dbutils>=2.0") 28 | 29 | 30 | @cls2singleton 31 | class MysqlHandle(object): 32 | def __init__(self, **kwargs): 33 | self._mysql_pool = PersistentDB(pymysql, **kwargs) 34 | 35 | def query_by_sql(self, sql): 36 | conn = self._mysql_pool.connection() 37 | cur = conn.cursor() 38 | try: 39 | cur.execute(sql) 40 | result = cur.fetchall() 41 | return result 42 | except Exception as e: 43 | raise CrwyDbException(e) 44 | finally: 45 | cur.close() 46 | conn.close() 47 | 48 | def save(self, sql, data, many=False, get_last_insert_id=False): 49 | conn = self._mysql_pool.connection() 50 | cur = conn.cursor() 51 | try: 52 | if many is False: 53 | cur.execute(sql, data) 54 | else: 55 | cur.executemany(sql, data) 56 | conn.commit() 57 | 58 | if get_last_insert_id is False: 59 | return 60 | 61 | cur.execute("select last_insert_id() as id") 62 | res = cur.fetchone() 63 | if isinstance(res, tuple): 64 | return res[0] 65 | elif isinstance(res, dict): 66 | return res.get('id') 67 | else: 68 | return res 69 | 70 | except Exception as e: 71 | raise CrwyDbException(e) 72 | finally: 73 | cur.close() 74 | conn.close() 75 | -------------------------------------------------------------------------------- /crwy/utils/sql/pg.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | """ 4 | @author: wuyue 5 | @contact: wuyue92tree@163.com 6 | @software: PyCharm 7 | @file: pg.py 8 | @create at: 2017-12-15 14:28 9 | 10 | 这一行开始写关于本文件的说明与解释 11 | """ 12 | 13 | from crwy.exceptions import CrwyImportException, CrwyDbException 14 | from crwy.decorates import cls2singleton 15 | 16 | try: 17 | import pgdb 18 | except ImportError: 19 | raise CrwyImportException("You should install PyGreSQL first! try: pip " 20 | "install PyGreSQL") 21 | try: 22 | from dbutils.persistent_db import PersistentDB 23 | except ImportError: 24 | raise CrwyImportException( 25 | "You should install DBUtils first! 
try: pip install " 26 | "dbutils>=2.0") 27 | 28 | 29 | @cls2singleton 30 | class PgHandle(object): 31 | def __init__(self, **kwargs): 32 | self._pg_pool = PersistentDB(pgdb, **kwargs) 33 | 34 | def query_by_sql(self, sql): 35 | conn = self._pg_pool.connection() 36 | cur = conn.cursor() 37 | try: 38 | cur.execute(sql) 39 | result = cur.fetchall() 40 | return result 41 | except Exception as e: 42 | raise CrwyDbException(e) 43 | finally: 44 | cur.close() 45 | conn.close() 46 | 47 | def save(self, sql, data, many=False, get_last_insert_id=False): 48 | conn = self._pg_pool.connection() 49 | cur = conn.cursor() 50 | try: 51 | if get_last_insert_id is True: 52 | sql = sql.strip(';') 53 | sql = sql + ' returning id' 54 | 55 | if many is False: 56 | cur.execute(sql, data) 57 | else: 58 | cur.executemany(sql, data) 59 | 60 | conn.commit() 61 | 62 | if get_last_insert_id is True: 63 | res = cur.fetchone() 64 | return res.id 65 | 66 | except Exception as e: 67 | raise CrwyDbException(e) 68 | finally: 69 | cur.close() 70 | conn.close() 71 | -------------------------------------------------------------------------------- /crwy/utils/sql/sqlalchemy_m.py: -------------------------------------------------------------------------------- 1 | #!/usr/local/bin/python 2 | # -*- coding: utf-8 -*- 3 | # author: wuyue92tree@163.com 4 | 5 | from sqlalchemy import create_engine 6 | from sqlalchemy.orm import sessionmaker 7 | from sqlalchemy.ext.declarative import declarative_base 8 | from crwy.decorates import cls2singleton 9 | 10 | Base = declarative_base() 11 | 12 | 13 | @cls2singleton 14 | class SqlalchemyHandle(object): 15 | """ 16 | 以ORM的方式连接数据库 17 | """ 18 | 19 | def __init__(self, db_url, **kwargs): 20 | self.engine = create_engine(db_url, **kwargs) 21 | DBSession = sessionmaker(bind=self.engine) 22 | self.session = DBSession() 23 | 24 | def init_table(self): 25 | return Base.metadata.create_all(self.engine) 26 | 27 | def delete_table(self): 28 | return Base.metadata.drop_all(self.engine) 29 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | beautifulsoup4>=4.5.1 2 | configparser>=3.5.0 3 | requests>=2.20.0 4 | gevent>=1.2.1 5 | retrying>=1.3.3 6 | imapclient>=2.0.0 7 | DBUtils>=2.0 8 | redis 9 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | from setuptools import setup, find_packages 5 | from crwy import version 6 | 7 | install_requires = [] 8 | 9 | with open('requirements.txt', 'r') as f: 10 | for req in f.readlines(): 11 | install_requires.append(req.strip('\n')) 12 | 13 | 14 | setup( 15 | name='Crwy', 16 | version=version, 17 | url='https://github.com/wuyue92tree/crwy', 18 | description='A Simple Web Crawling and Web Scraping framework', 19 | long_description=open('README.rst', encoding='utf-8').read(), 20 | author='wuyue', 21 | author_email='wuyue92tree@163.com', 22 | maintainer='wuyue', 23 | maintainer_email='wuyue92tree@163.com', 24 | license='MIT', 25 | packages=find_packages(exclude=('tests', 'tests.*')), 26 | include_package_data=True, 27 | zip_safe=False, 28 | entry_points={ 29 | 'console_scripts': ['crwy = crwy.cmdline:execute'] 30 | }, 31 | install_requires=install_requires, 32 | ) 33 | --------------------------------------------------------------------------------
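Editor's addition: to show how the pieces above fit together, here is a minimal, hypothetical Scrapy `settings.py` sketch for the `scrapy_plugs` utilities. Only the setting names come from `dupefilters.py` and `pipelines.py`; the bot name, Redis URL, database URI and pipeline path are illustrative placeholders:

    # hypothetical settings.py of a Scrapy project using crwy's plugins
    BOT_NAME = 'demo_bot'
    SPIDER_NAME = 'demo_spider'  # required, or RedisRFPDupeFilter raises NotConfigured

    # scrapy-redis connection shared by the dupefilter and cookie middlewares
    REDIS_URL = 'redis://127.0.0.1:6379/0'

    DUPEFILTER_CLASS = 'crwy.utils.scrapy_plugs.dupefilters.RedisRFPDupeFilter'
    DUPEFILTER_DEBUG = False
    DUPEFILTER_DELAY_DAY = 7   # re-crawl a seen dupefilter_key after 7 days (0 = never)
    DUPEFILTER_DO_HASH = True  # store sha1(dupefilter_key) instead of the raw key

    ITEM_PIPELINES = {
        # a project-defined subclass that overrides insert_db()
        'demo_bot.pipelines.DemoSqlalchemyPipeline': 300,
    }
    SQLALCHEMY_URI = 'mysql+pymysql://root:123456@127.0.0.1:3306/demo'
    SQLALCHEMY_ECHO = False

Note that only requests carrying a `dupefilter_key` in `meta` take part in deduplication (see `request_seen()` above), e.g. `yield scrapy.Request(url, meta={'dupefilter_key': item_id})`; requests without it are never filtered.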