├── .gitignore ├── README.md ├── coolscrapy ├── __init__.py ├── items.py ├── middlewares.py ├── models.py ├── pipelines.py ├── run.py ├── settings.py ├── spiders │ ├── __init__.py │ ├── article_spider.py │ ├── drug_spider.py │ ├── huxiu_spider.py │ ├── joke_spider.py │ ├── js_spider.py │ ├── link_spider.py │ ├── login1_spider.py │ ├── login2_spider.py │ ├── test_spider.py │ ├── tobacco_spider.py │ └── xml_spider.py └── utils.py ├── doc ├── LICENSE ├── README.md ├── SUMMARY.md ├── assets │ ├── favicon.png │ └── logo.png ├── book.json ├── fonts │ ├── fontawesome-webfont.woff │ └── fontawesome-webfont.woff2 ├── source │ ├── images │ │ ├── scrapy.png │ │ ├── scrapy01.png │ │ ├── scrapy02.png │ │ ├── scrapy03.png │ │ └── weixin1.png │ ├── other │ │ └── about.md │ ├── part1 │ │ ├── README.md │ │ ├── scrapy-01.md │ │ └── scrapy-02.md │ ├── part2 │ │ ├── README.md │ │ ├── scrapy-03.md │ │ ├── scrapy-04.md │ │ ├── scrapy-05.md │ │ ├── scrapy-06.md │ │ ├── scrapy-07.md │ │ └── scrapy-08.md │ ├── part3 │ │ ├── README.md │ │ └── scrapy-09.md │ ├── part4 │ │ ├── README.md │ │ ├── scrapy-10.md │ │ ├── scrapy-11.md │ │ └── scrapy-12.md │ └── part5 │ │ └── README.md └── styles │ └── website.scss ├── publish_gitbook.sh └── scrapy.cfg /.gitignore: -------------------------------------------------------------------------------- 1 | .idea/ 2 | *.pyc 3 | *.log 4 | node_modules/ 5 | _book/ 6 | .project 7 | *~ 8 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ## Python网络爬虫Scrapy框架研究 2 | 3 | ### Scrapy1.0教程 4 | 5 | * [Scrapy笔记(1)- 入门篇](https://www.xncoding.com/2016/03/08/scrapy-01.html) 6 | * [Scrapy笔记(2)- 完整示例](https://www.xncoding.com/2016/03/10/scrapy-02.html) 7 | * [Scrapy笔记(3)- Spider详解](https://www.xncoding.com/2016/03/12/scrapy-03.html) 8 | * [Scrapy笔记(4)- Selector详解](https://www.xncoding.com/2016/03/14/scrapy-04.html) 9 | * [Scrapy笔记(5)- Item详解](https://www.xncoding.com/2016/03/16/scrapy-05.html) 10 | * [Scrapy笔记(6)- Item Pipeline](https://www.xncoding.com/2016/03/18/scrapy-06.html) 11 | * [Scrapy笔记(7)- 内置服务](https://www.xncoding.com/2016/03/19/scrapy-07.html) 12 | * [Scrapy笔记(8)- 文件与图片](https://www.xncoding.com/2016/03/20/scrapy-08.html) 13 | * [Scrapy笔记(9)- 部署](https://www.xncoding.com/2016/03/21/scrapy-09.html) 14 | * [Scrapy笔记(10)- 动态配置爬虫](https://www.xncoding.com/2016/04/10/scrapy-10.html) 15 | * [Scrapy笔记(11)- 模拟登录](https://www.xncoding.com/2016/04/12/scrapy-11.html) 16 | * [Scrapy笔记(12)- 抓取动态网站](https://www.xncoding.com/2016/04/15/scrapy-12.html) 17 | 18 | ### Wiki 19 | Scrapy是Python开发的一个快速,高层次的屏幕抓取和web抓取框架,用于抓取web站点并从页面中提取结构化的数据。 20 | Scrapy用途广泛,可以用于数据挖掘、监测和自动化测试。 21 | 22 | Scrapy吸引人的地方在于它是一个框架,任何人都可以根据需求方便的修改。它也提供了多种类型爬虫的基类, 23 | 如BaseSpider、sitemap爬虫等,还有对web2.0爬虫的支持。 24 | 25 | Scrach是抓取的意思,这个Python的爬虫框架叫Scrapy,大概也是这个意思吧,就叫它:小刮刮吧。 26 | 27 | 基于最新的Scrapy 1.0编写,已更新至Python3.6 28 | 29 | ------------------------------------------ 30 | 31 | ### 对多个内容网站的采集,主要功能实现如下: 32 | 33 | * 最新文章列表的爬取 34 | * 采集的数据放入MySQL数据库中,并且包含标题,发布日期,文章来源,链接地址等等信息 35 | * URL去重复,程序保证对于同一个链接不会爬取两次 36 | * 防止封IP策略,如果抓取太频繁了,就被被封IP,目前采用三种策略保证不会被封: 37 | 38 | * 策略1:设置download_delay下载延迟,数字设置为5秒,越大越安全 39 | * 策略2:禁止Cookie,某些网站会通过Cookie识别用户身份,禁用后使得服务器无法识别爬虫轨迹 40 | * 策略3:使用user agent池。也就是每次发送的时候随机从池中选择不一样的浏览器头信息,防止暴露爬虫身份 41 | * 策略4:使用IP池,这个需要大量的IP资源,貌似还达不到这个要求 42 | * 策略5:分布式爬取,这个是针对大型爬虫系统的,对目前而言我们还用不到。 43 | 44 | * 模拟登录后的爬取 45 | * 针对RSS源的爬取 46 | * 对于每个新的爬取目标网站,或者原来的网站格式有变动的时候,需要做到可配置, 47 | 
只修改配置文件即可,而不是修改源文件,增加一段爬虫代码,主要是用xpath配置爬取规则 48 | * 定时爬取,设置定时任务周期性爬取 49 | * 与微信公共平台的结合,给大量的订阅号随机分配最新的订阅文章。 50 | * 利用scrapy-splash执行页面javascript后的内容爬取 51 | 52 | ------------------------------------------ 53 | 54 | ## 贡献代码 55 | 56 | 1. Fork 57 | 1. 创建您的特性分支 git checkout -b my-new-feature 58 | 1. 提交您的改动 git commit -am 'Added some feature' 59 | 1. 将您的修改记录提交到远程 git 仓库 git push origin my-new-feature 60 | 1. 然后到 github 网站的该 git 远程仓库的 my-new-feature 分支下发起 Pull Request 61 | 62 | ## 许可证 63 | Copyright (c) 2014-2016 [Xiong Neng](https://www.xncoding.com/) 64 | 65 | 基于 MIT 协议发布: 66 | 67 | -------------------------------------------------------------------------------- /coolscrapy/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yidao620c/core-scrapy/3c671c6ed3ba0fdc222d43048b083f18626ed117/coolscrapy/__init__.py -------------------------------------------------------------------------------- /coolscrapy/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | 10 | 11 | class Article(scrapy.Item): 12 | title = scrapy.Field() 13 | url = scrapy.Field() 14 | body = scrapy.Field() 15 | publish_time = scrapy.Field() 16 | source_site = scrapy.Field() 17 | 18 | 19 | class NewsItem(scrapy.Item): 20 | """医药网新闻Item""" 21 | crawlkey = scrapy.Field() # 关键字 22 | title = scrapy.Field() # 标题 23 | link = scrapy.Field() # 链接 24 | desc = scrapy.Field() # 简述 25 | pubdate = scrapy.Field() # 发布时间 26 | category = scrapy.Field() # 分类 27 | location = scrapy.Field() # 来源 28 | content = scrapy.Field() # 内容 29 | htmlcontent = scrapy.Field() # html内容 30 | 31 | 32 | class HuxiuItem(scrapy.Item): 33 | """虎嗅网新闻Item""" 34 | title = scrapy.Field() # 标题 35 | link = scrapy.Field() # 链接 36 | desc = scrapy.Field() # 简述 37 | published = scrapy.Field() # 发布时间 38 | 39 | 40 | class BlogItem(scrapy.Item): 41 | """博客Item""" 42 | title = scrapy.Field() # 标题 43 | link = scrapy.Field() # 链接 44 | id = scrapy.Field() # ID号 45 | published = scrapy.Field() # 发布时间 46 | updated = scrapy.Field() # 更新时间 47 | 48 | 49 | class JokeItem(scrapy.Item): 50 | """糗事百科笑话Item""" 51 | content = scrapy.Field() 52 | image_urls = scrapy.Field() 53 | images = scrapy.Field() 54 | 55 | 56 | class TobaccoItem(scrapy.Item): 57 | """烟草条形码Item""" 58 | pics = scrapy.Field() # 图片 59 | product = scrapy.Field() # 产品 60 | product_type = scrapy.Field() # 产品类型 61 | package_spec = scrapy.Field() # 包装规格 62 | reference_price = scrapy.Field() # 参考零售价格 63 | manufacturer = scrapy.Field() # 生产厂家 64 | -------------------------------------------------------------------------------- /coolscrapy/middlewares.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- encoding: utf-8 -*- 3 | """ 4 | Topic: 中间件集合 5 | Desc : 6 | """ 7 | import redis 8 | import random 9 | import logging 10 | from scrapy import signals 11 | from scrapy.http import Request 12 | from scrapy.downloadermiddlewares.useragent import UserAgentMiddleware 13 | 14 | logger = logging.getLogger(__name__) 15 | 16 | 17 | class RotateUserAgentMiddleware(UserAgentMiddleware): 18 | """避免被ban策略之一:使用useragent池。 19 | 使用注意:需在settings.py中进行相应的设置。 20 | 更好的方式是使用: 21 | pip install scrapy-fake-useragent 22 | DOWNLOADER_MIDDLEWARES = { 23 | 
'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None, 24 | 'scrapy_fake_useragent.middleware.RandomUserAgentMiddleware': 400, 25 | } 26 | """ 27 | def __init__(self, user_agent=''): 28 | super(RotateUserAgentMiddleware, self).__init__() 29 | self.user_agent = user_agent 30 | 31 | def process_request(self, request, spider): 32 | ua = random.choice(self.user_agent_list) 33 | if ua: 34 | # 记录当前使用的useragent 35 | logger.debug('Current UserAgent: ' + ua) 36 | request.headers.setdefault('User-Agent', ua) 37 | 38 | # the default user_agent_list composes chrome,I E,firefox,Mozilla,opera,netscape 39 | # for more visit http://www.useragentstring.com/pages/useragentstring.php 40 | user_agent_list = [ 41 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2062.124 Safari/537.36", 42 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.67 Safari/537.36", 43 | "Mozilla/5.0 (X11; OpenBSD i386) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.125 Safari/537.36", 44 | "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.3319.102 Safari/537.36", 45 | "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1847.137 Safari/4E423F", 46 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1623.0 Safari/537.36", 47 | "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.17 Safari/537.36", 48 | "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2224.3 Safari/537.36", 49 | "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2226.0 Safari/537.36", 50 | "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2227.0 Safari/537.36", 51 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2227.1 Safari/537.36", 52 | "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36", 53 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1", 54 | "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11", 55 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6", 56 | "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6", 57 | "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1", 58 | "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5", 59 | "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5", 60 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3", 61 | "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3", 62 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3", 63 | "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3", 64 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3", 65 | "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 
Safari/536.3", 66 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3", 67 | "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3", 68 | "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3", 69 | "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24", 70 | "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24" 71 | ] -------------------------------------------------------------------------------- /coolscrapy/models.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- encoding: utf-8 -*- 3 | """ 4 | Topic: 定义数据库模型实体 5 | Desc : 6 | """ 7 | import datetime 8 | 9 | from sqlalchemy.engine.url import URL 10 | from sqlalchemy.ext.declarative import declarative_base 11 | from sqlalchemy import create_engine, Column, Integer, String, Text, DateTime 12 | from coolscrapy.settings import DATABASE 13 | 14 | 15 | def db_connect(): 16 | """ 17 | Performs database connection using database settings from settings.py. 18 | Returns sqlalchemy engine instance 19 | """ 20 | return create_engine(URL(**DATABASE)) 21 | 22 | 23 | def create_news_table(engine): 24 | """""" 25 | Base.metadata.create_all(engine) 26 | 27 | 28 | def _get_date(): 29 | return datetime.datetime.now() 30 | 31 | Base = declarative_base() 32 | 33 | 34 | class ArticleRule(Base): 35 | """自定义文章爬取规则""" 36 | __tablename__ = 'article_rule' 37 | 38 | id = Column(Integer, primary_key=True) 39 | # 规则名称 40 | name = Column(String(30)) 41 | # 运行的域名列表,逗号隔开 42 | allow_domains = Column(String(100)) 43 | # 开始URL列表,逗号隔开 44 | start_urls = Column(String(100)) 45 | # 下一页的xpath 46 | next_page = Column(String(100)) 47 | # 文章链接正则表达式(子串) 48 | allow_url = Column(String(200)) 49 | # 文章链接提取区域xpath 50 | extract_from = Column(String(200)) 51 | # 文章标题xpath 52 | title_xpath = Column(String(100)) 53 | # 文章内容xpath 54 | body_xpath = Column(Text) 55 | # 发布时间xpath 56 | publish_time_xpath = Column(String(30)) 57 | # 文章来源 58 | source_site = Column(String(30)) 59 | # 规则是否生效 60 | enable = Column(Integer) 61 | 62 | 63 | class Article(Base): 64 | """文章类""" 65 | __tablename__ = 'articles' 66 | 67 | id = Column(Integer, primary_key=True) 68 | url = Column(String(100)) 69 | title = Column(String(100)) 70 | body = Column(Text) 71 | publish_time = Column(String(30)) 72 | source_site = Column(String(30)) 73 | 74 | 75 | class News(Base): 76 | """定义新闻实体""" 77 | __tablename__ = "wqy_push_essay" 78 | # 主键 79 | id = Column(Integer, primary_key=True) 80 | # 爬虫key 81 | crawlkey = Column('crawlkey', String(30), nullable=True) 82 | # 新闻分类 83 | category = Column('category', String(40), nullable=True) 84 | # 新闻链接地址 85 | link = Column('link', String(120), nullable=True) 86 | # 新闻来源 87 | location = Column('location', String(60), nullable=True) 88 | # 发布时间 89 | pubdate = Column('pubdate', DateTime, default=_get_date) 90 | # 新闻标题 91 | title = Column('title', String(120), nullable=True) 92 | # 正文 93 | content = Column('content', Text, nullable=True) 94 | # 带html标签的正文 95 | htmlcontent = Column('htmlcontent', Text, nullable=True) 96 | 97 | 98 | class Tobacco(Base): 99 | """烟草""" 100 | __tablename__ = 't_tobacco' 101 | 102 | id = Column(Integer, primary_key=True) 103 | product_name = Column(String(32)) 104 | brand = Column(String(32)) 105 | product_type = Column(String(16)) 106 | 
package_spec = Column(String(32)) 107 | reference_price = Column(String(32)) 108 | manufacturer = Column(String(32)) 109 | pics = Column(String(255)) 110 | created_time = Column(DateTime, default=_get_date) 111 | updated_time = Column(DateTime, default=_get_date) 112 | 113 | 114 | class Barcode(Base): 115 | """烟草条形码""" 116 | __tablename__ = 't_barcode' 117 | 118 | id = Column(Integer, primary_key=True) 119 | tobacco_id = Column(Integer) 120 | barcode = Column(String(32)) 121 | btype = Column(String(32)) 122 | created_time = Column(DateTime, default=_get_date) 123 | updated_time = Column(DateTime, default=_get_date) 124 | -------------------------------------------------------------------------------- /coolscrapy/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # centos安装MySQL-python,root用户下 5 | # yum install mysql-devel 6 | # pip install MySQL-python 7 | # 8 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 9 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html 10 | 11 | import datetime 12 | import redis 13 | import json 14 | import logging 15 | from contextlib import contextmanager 16 | 17 | from scrapy import signals, Request 18 | from scrapy.exporters import JsonItemExporter 19 | from scrapy.pipelines.images import ImagesPipeline 20 | from scrapy.exceptions import DropItem 21 | from sqlalchemy.orm import sessionmaker 22 | from coolscrapy.models import News, db_connect, create_news_table, Article, Tobacco, Barcode 23 | 24 | Redis = redis.StrictRedis(host='localhost', port=6379, db=0) 25 | _log = logging.getLogger(__name__) 26 | 27 | 28 | class DuplicatesPipeline(object): 29 | """Item去重复""" 30 | 31 | def process_item(self, item, spider): 32 | if Redis.exists('url:%s' % item['url']): 33 | raise DropItem("Duplicate item found: %s" % item) 34 | else: 35 | Redis.set('url:%s' % item['url'], 1) 36 | return item 37 | 38 | 39 | class FilterWordsPipeline(object): 40 | """A pipeline for filtering out items which contain certain words in their 41 | description""" 42 | 43 | # put all words in lowercase 44 | words_to_filter = ['pilgrim'] 45 | 46 | def process_item(self, item, spider): 47 | for word in self.words_to_filter: 48 | if False: 49 | raise DropItem("Contains forbidden word: %s" % word) 50 | else: 51 | return item 52 | 53 | 54 | class JsonWriterPipeline(object): 55 | """ 56 | The purpose of JsonWriterPipeline is just to introduce how to write item pipelines. 57 | If you really want to store all scraped items into a JSON file you should use the Feed exports. 
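A feed export needs no pipeline code at all; a rough sketch using standard
Scrapy options (the spider name below is only a placeholder):

    scrapy crawl some_spider -o items.json

or set FEED_URI / FEED_FORMAT in settings.py.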
58 | """ 59 | 60 | def __init__(self): 61 | pass 62 | self.file = open('items.json', 'wb') 63 | 64 | def open_spider(self, spider): 65 | """This method is called when the spider is opened.""" 66 | _log.info('open_spider....') 67 | 68 | def process_item(self, item, spider): 69 | _log.info('process_item....') 70 | line = json.dumps(dict(item)) + "\n" 71 | self.file.write(line) 72 | return item 73 | 74 | def close_spider(self, spider): 75 | """This method is called when the spider is closed.""" 76 | _log.info('close_spider....') 77 | self.file.close() 78 | 79 | 80 | class JsonExportPipeline(object): 81 | def __init__(self): 82 | _log.info('JsonExportPipeline.init....') 83 | self.files = {} 84 | 85 | @classmethod 86 | def from_crawler(cls, crawler): 87 | _log.info('JsonExportPipeline.from_crawler....') 88 | pipeline = cls() 89 | crawler.signals.connect(pipeline.spider_opened, signals.spider_opened) 90 | crawler.signals.connect(pipeline.spider_closed, signals.spider_closed) 91 | return pipeline 92 | 93 | def spider_opened(self, spider): 94 | _log.info('JsonExportPipeline.spider_opened....') 95 | file = open('%s.json' % spider.name, 'w+b') 96 | self.files[spider] = file 97 | self.exporter = JsonItemExporter(file) 98 | self.exporter.start_exporting() 99 | 100 | def spider_closed(self, spider): 101 | _log.info('JsonExportPipeline.spider_closed....') 102 | self.exporter.finish_exporting() 103 | file = self.files.pop(spider) 104 | file.close() 105 | 106 | def process_item(self, item, spider): 107 | _log.info('JsonExportPipeline.process_item....') 108 | self.exporter.export_item(item) 109 | return item 110 | 111 | 112 | @contextmanager 113 | def session_scope(Session): 114 | """Provide a transactional scope around a series of operations.""" 115 | session = Session() 116 | session.expire_on_commit = False 117 | try: 118 | yield session 119 | session.commit() 120 | except: 121 | session.rollback() 122 | raise 123 | finally: 124 | session.close() 125 | 126 | 127 | class ArticleDataBasePipeline(object): 128 | """保存文章到数据库""" 129 | 130 | def __init__(self): 131 | engine = db_connect() 132 | create_news_table(engine) 133 | self.Session = sessionmaker(bind=engine) 134 | 135 | def open_spider(self, spider): 136 | """This method is called when the spider is opened.""" 137 | pass 138 | 139 | def process_item(self, item, spider): 140 | a = Article(url=item["url"], 141 | title=item["title"].encode("utf-8"), 142 | publish_time=item["publish_time"].encode("utf-8"), 143 | body=item["body"].encode("utf-8"), 144 | source_site=item["source_site"].encode("utf-8")) 145 | with session_scope(self.Session) as session: 146 | session.add(a) 147 | 148 | def close_spider(self, spider): 149 | pass 150 | 151 | 152 | class NewsDatabasePipeline(object): 153 | """保存新闻到数据库""" 154 | 155 | def __init__(self): 156 | """ 157 | Initializes database connection and sessionmaker. 158 | Creates deals table. 159 | """ 160 | engine = db_connect() 161 | create_news_table(engine) 162 | # 初始化对象属性Session为可调用对象 163 | self.Session = sessionmaker(bind=engine) 164 | self.recent_links = None 165 | self.nowtime = datetime.datetime.now() 166 | 167 | def open_spider(self, spider): 168 | """This method is called when the spider is opened.""" 169 | _log.info('open_spider[%s]....' 
% spider.name) 170 | session = self.Session() 171 | recent_news = session.query(News).filter( 172 | News.crawlkey == spider.name, 173 | self.nowtime - News.pubdate <= datetime.timedelta(days=30)).all() 174 | self.recent_links = [t.link for t in recent_news] 175 | _log.info(self.recent_links) 176 | 177 | def process_item(self, item, spider): 178 | """Save deals in the database. 179 | This method is called for every item pipeline component. 180 | """ 181 | # 每次获取到Item调用这个callable,获得一个新的session 182 | _log.info('mysql->%s' % item['link']) 183 | if item['link'] not in self.recent_links: 184 | with session_scope(self.Session) as session: 185 | news = News(**item) 186 | session.add(news) 187 | self.recent_links.append(item['link']) 188 | return item 189 | 190 | def close_spider(self, spider): 191 | pass 192 | 193 | 194 | class MyImagesPipeline(ImagesPipeline): 195 | """先安装:pip install Pillow""" 196 | 197 | def item_completed(self, results, item, info): 198 | image_paths = [x['path'] for ok, x in results if ok] 199 | if not image_paths: 200 | raise DropItem("Item contains no images") 201 | return item 202 | 203 | 204 | class TobaccoImagePipeline(ImagesPipeline): 205 | """先安装:pip install Pillow""" 206 | 207 | def get_media_requests(self, item, info): 208 | yield Request(item['pics']) 209 | 210 | def item_completed(self, results, item, info): 211 | image_paths = [x['path'] for ok, x in results if ok] 212 | if not image_paths: 213 | raise DropItem("Item contains no images") 214 | # 设置tobacco的pics字段 215 | item['pics'] = image_paths[0] 216 | return item 217 | 218 | 219 | class TobaccoDatabasePipeline(object): 220 | """将烟草记录保存到数据库""" 221 | 222 | def __init__(self): 223 | engine = db_connect() 224 | self.Session = sessionmaker(bind=engine) 225 | 226 | def open_spider(self, spider): 227 | """This method is called when the spider is opened.""" 228 | pass 229 | 230 | def process_item(self, item, spider): 231 | logging.info("将烟草记录保存到数据库 start....") 232 | product_vals = item['product'].split('/') 233 | # 先插入一条烟的记录 234 | tobacco = Tobacco(product_name=product_vals[0], 235 | brand=product_vals[1], 236 | product_type=item['product_type'], 237 | package_spec=item['package_spec'], 238 | reference_price=item['reference_price'], 239 | manufacturer=item['manufacturer'], 240 | pics=item['pics']) 241 | with session_scope(self.Session) as session: 242 | session.add(tobacco) 243 | logging.info("tobacco.iiiiiiiiiiiiiiiiiiiiiiiiidddddd=, {}".format(tobacco.id)) 244 | # 然后再插入二维码记录 245 | if product_vals[2]: 246 | code_vals = product_vals[2].split(':') 247 | barcode = Barcode(tobacco_id=tobacco.id, 248 | btype=code_vals[0], 249 | barcode=code_vals[1]) 250 | with session_scope(self.Session) as session: 251 | session.add(barcode) 252 | if product_vals[3]: 253 | code_vals = product_vals[3].split(':') 254 | barcode = Barcode(tobacco_id=tobacco.id, 255 | btype=code_vals[0], 256 | barcode=code_vals[1]) 257 | with session_scope(self.Session) as session: 258 | # if barcode not in session: 259 | # session.merge(barcode) 260 | session.add(barcode) 261 | logging.info("将烟草记录保存到数据库 end....") 262 | 263 | def close_spider(self, spider): 264 | pass 265 | -------------------------------------------------------------------------------- /coolscrapy/run.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- encoding: utf-8 -*- 3 | """ 4 | Topic: sample 5 | Desc : 6 | """ 7 | 8 | import logging 9 | from twisted.internet import reactor 10 | from scrapy.crawler import CrawlerRunner 
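# CrawlerRunner, unlike CrawlerProcess, does not start the Twisted reactor by
# itself; that is why the __main__ block below schedules one crawl per enabled
# ArticleRule and then calls reactor.run() explicitly.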
11 | from scrapy.utils.project import get_project_settings 12 | from scrapy.utils.log import configure_logging 13 | from coolscrapy.models import db_connect 14 | from coolscrapy.models import ArticleRule 15 | from sqlalchemy.orm import sessionmaker 16 | from coolscrapy.spiders.article_spider import ArticleSpider 17 | 18 | if __name__ == '__main__': 19 | settings = get_project_settings() 20 | configure_logging(settings) 21 | db = db_connect() 22 | Session = sessionmaker(bind=db) 23 | session = Session() 24 | rules = session.query(ArticleRule).filter(ArticleRule.enable == 1).all() 25 | session.close() 26 | runner = CrawlerRunner(settings) 27 | 28 | for rule in rules: 29 | # spider = ArticleSpider(rule) # instantiate every spider using rule 30 | # stop reactor when spider closes 31 | # runner.signals.connect(spider_closing, signal=signals.spider_closed) 32 | runner.crawl(ArticleSpider, rule=rule) 33 | 34 | d = runner.join() 35 | d.addBoth(lambda _: reactor.stop()) 36 | 37 | # blocks process so always keep as the last statement 38 | reactor.run() 39 | logging.info('all finished.') 40 | -------------------------------------------------------------------------------- /coolscrapy/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for coolscrapy project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. You can find more settings consulting the documentation: 7 | # 8 | # http://doc.scrapy.org/en/latest/topics/settings.html 9 | # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 10 | # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 11 | import logging 12 | 13 | BOT_NAME = 'coolscrapy' 14 | 15 | SPIDER_MODULES = ['coolscrapy.spiders'] 16 | NEWSPIDER_MODULE = 'coolscrapy.spiders' 17 | 18 | ITEM_PIPELINES = { 19 | # 'coolscrapy.pipelines.DuplicatesPipeline': 1, 20 | # 'coolscrapy.pipelines.FilterWordsPipeline': 2, 21 | # 'coolscrapy.pipelines.JsonWriterPipeline': 3, 22 | # 'coolscrapy.pipelines.JsonExportPipeline': 4, 23 | # 'coolscrapy.pipelines.ArticleDataBasePipeline': 5, 24 | 'coolscrapy.pipelines.TobaccoImagePipeline': 6, 25 | 'coolscrapy.pipelines.TobaccoDatabasePipeline': 7, 26 | } 27 | DOWNLOADER_MIDDLEWARES = { 28 | # 这里是下载中间件 29 | 'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None, 30 | 'coolscrapy.middlewares.RotateUserAgentMiddleware': 400, 31 | 'scrapy_splash.SplashCookiesMiddleware': 723, 32 | 'scrapy_splash.SplashMiddleware': 725, 33 | 'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 810, 34 | } 35 | SPIDER_MIDDLEWARES = { 36 | # 这是爬虫中间件, 543是运行的优先级 37 | # 'coolscrapy.middlewares.UrlUniqueMiddleware': 543, 38 | } 39 | 40 | # 几个反正被Ban的策略设置 41 | DOWNLOAD_TIMEOUT = 20 42 | DOWNLOAD_DELAY = 5 43 | # 禁用Cookie 44 | COOKIES_ENABLES = True 45 | #COOKIES_DEBUG = True 46 | 47 | LOG_LEVEL = logging.INFO 48 | LOG_STDOUT = True 49 | LOG_FILE = "E:/logs/spider.log" 50 | LOG_FORMAT = "%(asctime)s [%(name)s] %(levelname)s: %(message)s" 51 | 52 | 53 | # windows pip install mysqlclient 54 | # linux pip install MySQL-python 55 | DATABASE = {'drivername': 'mysql', 56 | 'host': '123.207.66.156', 57 | 'port': '3306', 58 | 'username': 'root', 59 | 'password': '******', 60 | 'database': 'test', 61 | 'query': {'charset': 'utf8'}} 62 | 63 | # 图片下载设置 64 | IMAGES_STORE = 'E:/logs/' 65 | IMAGES_EXPIRES = 30 # 30天内抓取的都不会被重抓 66 | # 图片链接前缀 67 | URL_PREFIX = 
'http://enzhico.net/pics/' 68 | 69 | # js异步加载支持 70 | SPLASH_URL = 'http://192.168.203.91:8050' 71 | DUPEFILTER_CLASS = 'scrapy_splash.SplashAwareDupeFilter' 72 | HTTPCACHE_STORAGE = 'scrapy_splash.SplashAwareFSCacheStorage' 73 | 74 | # 扩展-定义爬取数量 75 | # CLOSESPIDER_ITEMCOUNT = 10 76 | 77 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 78 | # USER_AGENT = 'coolscrapy (+http://www.yourdomain.com)' 79 | 80 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 81 | # CONCURRENT_REQUESTS=32 82 | 83 | # Configure a delay for requests for the same website (default: 0) 84 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay 85 | # See also autothrottle settings and docs 86 | # DOWNLOAD_DELAY=3 87 | # The download delay setting will honor only one of: 88 | # CONCURRENT_REQUESTS_PER_DOMAIN=16 89 | # CONCURRENT_REQUESTS_PER_IP=16 90 | 91 | # Disable cookies (enabled by default) 92 | # COOKIES_ENABLED=False 93 | 94 | # Disable Telnet Console (enabled by default) 95 | # TELNETCONSOLE_ENABLED=False 96 | 97 | # Override the default request headers: 98 | # DEFAULT_REQUEST_HEADERS = { 99 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 100 | # 'Accept-Language': 'en', 101 | # } 102 | 103 | # Enable or disable spider middlewares 104 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 105 | # SPIDER_MIDDLEWARES = { 106 | # 'coolscrapy.middlewares.MyCustomSpiderMiddleware': 543, 107 | # } 108 | 109 | # Enable or disable downloader middlewares 110 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 111 | # DOWNLOADER_MIDDLEWARES = { 112 | # 'coolscrapy.middlewares.MyCustomDownloaderMiddleware': 543, 113 | # } 114 | 115 | # Enable or disable extensions 116 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html 117 | # EXTENSIONS = { 118 | # 'scrapy.telnet.TelnetConsole': None, 119 | # } 120 | 121 | # Configure item pipelines 122 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html 123 | # ITEM_PIPELINES = { 124 | # 'coolscrapy.pipelines.SomePipeline': 300, 125 | # } 126 | 127 | # Enable and configure the AutoThrottle extension (disabled by default) 128 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html 129 | # NOTE: AutoThrottle will honour the standard settings for concurrency and delay 130 | # AUTOTHROTTLE_ENABLED=True 131 | # The initial download delay 132 | # AUTOTHROTTLE_START_DELAY=5 133 | # The maximum download delay to be set in case of high latencies 134 | # AUTOTHROTTLE_MAX_DELAY=60 135 | # Enable showing throttling stats for every response received: 136 | # AUTOTHROTTLE_DEBUG=False 137 | 138 | # Enable and configure HTTP caching (disabled by default) 139 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 140 | # HTTPCACHE_ENABLED=True 141 | # HTTPCACHE_EXPIRATION_SECS=0 142 | # HTTPCACHE_DIR='httpcache' 143 | # HTTPCACHE_IGNORE_HTTP_CODES=[] 144 | # HTTPCACHE_STORAGE='scrapy.extensions.httpcache.FilesystemCacheStorage' 145 | -------------------------------------------------------------------------------- /coolscrapy/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 
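# For reference, a minimal spider placed in this package looks roughly like the
# commented sketch below (a generic example, not one of this project's spiders):
#
#     import scrapy
#
#     class ExampleSpider(scrapy.Spider):
#         name = 'example'
#         start_urls = ['http://example.com/']
#
#         def parse(self, response):
#             yield {'title': response.xpath('//title/text()').extract_first()}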
5 | -------------------------------------------------------------------------------- /coolscrapy/spiders/article_spider.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- encoding: utf-8 -*- 3 | """ 4 | Topic: sample 5 | Desc : 6 | """ 7 | 8 | from coolscrapy.utils import parse_text 9 | from scrapy.spiders import CrawlSpider, Rule 10 | from scrapy.linkextractors import LinkExtractor 11 | from coolscrapy.items import Article 12 | 13 | 14 | class ArticleSpider(CrawlSpider): 15 | name = "article" 16 | 17 | def __init__(self, rule): 18 | self.rule = rule 19 | self.name = rule.name 20 | self.allowed_domains = rule.allow_domains.split(",") 21 | self.start_urls = rule.start_urls.split(",") 22 | rule_list = [] 23 | # 添加`下一页`的规则 24 | if rule.next_page: 25 | rule_list.append(Rule(LinkExtractor(restrict_xpaths=rule.next_page))) 26 | # 添加抽取文章链接的规则 27 | rule_list.append(Rule(LinkExtractor( 28 | allow=[rule.allow_url], 29 | restrict_xpaths=[rule.extract_from]), 30 | callback='parse_item')) 31 | self.rules = tuple(rule_list) 32 | super(ArticleSpider, self).__init__() 33 | 34 | def parse_item(self, response): 35 | self.log('Hi, this is an article page! %s' % response.url) 36 | 37 | article = Article() 38 | article["url"] = response.url 39 | 40 | title = response.xpath(self.rule.title_xpath).extract() 41 | article["title"] = parse_text(title, self.rule.name, 'title') 42 | 43 | body = response.xpath(self.rule.body_xpath).extract() 44 | article["body"] = parse_text(body, self.rule.name, 'body') 45 | 46 | publish_time = response.xpath(self.rule.publish_time_xpath).extract() 47 | article["publish_time"] = parse_text(publish_time, self.rule.name, 'publish_time') 48 | 49 | article["source_site"] = self.rule.source_site 50 | 51 | return article 52 | -------------------------------------------------------------------------------- /coolscrapy/spiders/drug_spider.py: -------------------------------------------------------------------------------- 1 | # #!/usr/bin/env python 2 | # # -*- encoding: utf-8 -*- 3 | # """ 4 | # Topic: 网络爬虫 5 | # Desc : 6 | # """ 7 | from ..items import * 8 | from scrapy.spiders import Spider 9 | from scrapy.spiders import XMLFeedSpider, CrawlSpider, Rule 10 | from scrapy.linkextractors.lxmlhtml import LxmlLinkExtractor 11 | from scrapy.linkextractors import LinkExtractor 12 | from scrapy.selector import Selector, HtmlXPathSelector 13 | from scrapy.loader import ItemLoader 14 | from scrapy import Request 15 | from scrapy.exceptions import DropItem 16 | from coolscrapy.utils import * 17 | from datetime import datetime 18 | import coolscrapy.settings as setting 19 | import re 20 | import uuid 21 | import urllib 22 | import contextlib 23 | import os 24 | import logging 25 | 26 | 27 | class CnyywXMLFeedSpider(CrawlSpider): 28 | """RSS/XML源爬虫,从医药网cn-yyw.cn上面订阅行业资讯""" 29 | name = 'cnyywfeed' 30 | allowed_domains = ['cn-yyw.cn'] 31 | start_urls = [ 32 | 'http://www.cn-yyw.cn/feed/rss.php?mid=21', 33 | ] 34 | 35 | def parse(self, response): 36 | item_xpaths = response.xpath('//channel/item') 37 | for i_xpath in item_xpaths: 38 | xitem = NewsItem() 39 | xitem['crawlkey'] = self.name 40 | xitem['title'] = ltos(i_xpath.xpath('title/text()').extract()) 41 | self.log('title=%s' % xitem['title'].encode('utf-8'), logging.INFO) 42 | xitem['link'] = ltos(i_xpath.xpath('link/text()').extract()) 43 | self.log('link=%s' % xitem['link'], logging.INFO) 44 | pubdate_temp = ltos(i_xpath.xpath('pubDate/text()').extract()) 45 | 
self.log('pubdate=%s' % pubdate_temp, logging.INFO) 46 | xitem['pubdate'] = datetime.strptime(pubdate_temp, '%Y-%m-%d %H:%M:%S') 47 | self.log('((((^_^))))'.center(50, '-'), logging.INFO) 48 | yield Request(url=xitem['link'], meta={'item': xitem}, callback=self.parse_item_page) 49 | 50 | def parse_item_page(self, response): 51 | page_item = response.meta['item'] 52 | try: 53 | self.log('-------------------> link_page url=%s' % page_item['link'], logging.INFO) 54 | page_item['category'] = ltos(response.xpath( 55 | '//div[@class="pos"]/a[last()]/text()').extract()) 56 | page_item['location'] = ltos(response.xpath( 57 | '//div[@class="info"]/a/text()').extract()) 58 | content_temp = "".join([tt.strip() for tt in response.xpath( 59 | '//div[@id="article"]').extract()]) 60 | re_con_strong = re.compile(r'(\s*)') 61 | content_temp = re_con_strong.sub(r'\1', content_temp) 62 | re_start_strong = re.compile(r'', re.I) 63 | content_temp = re_start_strong.sub('

', content_temp) 64 | re_end_strong = re.compile(r'</strong>', re.I) 65 | content_temp = re_end_strong.sub('

', content_temp) 66 | page_item['content'] = filter_tags(content_temp) 67 | page_item['htmlcontent'] = page_item['content'] 68 | return page_item 69 | except: 70 | self.log('ERROR-----%s' % response.url, logging.ERROR) 71 | return None 72 | 73 | 74 | class Drug39Spider(Spider): 75 | name = "drug39" 76 | allowed_domains = ["drug.39.net"] 77 | start_urls = [ 78 | "http://drug.39.net/yjxw/yydt/index.html" 79 | ] 80 | 81 | def parse(self, response): 82 | self.log('-------------------> link_list url=%s' % response.url, logging.INFO) 83 | links = response.xpath('//div[starts-with(@class, "listbox")]/ul/li') 84 | for link in links: 85 | url = link.xpath('span[1]/a/@href').extract()[0] 86 | date_str = link.xpath('span[2]/text()').extract()[0] 87 | date_str = date_str.split(' ')[1] + ':00' 88 | self.log('+++++++++++' + date_str, logging.INFO) 89 | yield Request(url=url, meta={'ds': date_str}, callback=self.parse_item_page) 90 | 91 | def parse_item_page(self, response): 92 | dstr = response.meta['ds'] 93 | try: 94 | self.log('-------------------> link_page url=%s' % response.url, logging.INFO) 95 | item = NewsItem() 96 | item['crawlkey'] = self.name 97 | item['category'] = ltos(response.xpath( 98 | '//span[@class="art_location"]/a[last()]/text()').extract()) 99 | item['link'] = response.url 100 | item['location'] = ltos(response.xpath( 101 | '//div[@class="date"]/em[2]/a/text()' 102 | '|//div[@class="date"]/em[2]/text()').extract()) 103 | pubdate_temp = ltos(response.xpath('//div[@class="date"]/em[1]/text()').extract()) 104 | item['pubdate'] = datetime.strptime(pubdate_temp + ' ' + dstr, '%Y-%m-%d %H:%M:%S') 105 | item['title'] = ltos(response.xpath('//h1/text()').extract()) 106 | content_temp = "".join([tt.strip() for tt in response.xpath( 107 | '//div[@id="contentText"]/p').extract()]) 108 | item['content'] = filter_tags(content_temp) 109 | htmlcontent = ltos(response.xpath('//div[@id="contentText"]').extract()) 110 | item['htmlcontent'] = clean_html(htmlcontent) 111 | # 特殊构造,不作为分组 112 | # (?=...)之后的字符串需要匹配表达式才能成功匹配 113 | # (?<=...)之前的字符串需要匹配表达式才能成功匹配 114 | pat_img = re.compile(r'( link_list url=%s' % response.url, logging.INFO) 159 | links = response.xpath('//div[@class="list"]/ul/li/p/a') 160 | for link in links: 161 | url = link.xpath('@href').extract()[0] 162 | yield Request(url=url, callback=self.parse_page) 163 | 164 | def parse_page(self, response): 165 | try: 166 | self.log('-------------------> link_page url=%s' % response.url, logging.INFO) 167 | item = NewsItem() 168 | item['crawlkey'] = self.name 169 | item['category'] = ltos(response.xpath( 170 | '//div[@class="current"]/a[last()]/text()').extract()) 171 | item['link'] = response.url 172 | head_line = ltos(response.xpath('//div[@class="ct01"]/text()[1]').extract()) 173 | item['location'] = head_line.strip().split()[1] 174 | item['pubdate'] = datetime.strptime(head_line.strip().split()[0], '%Y-%m-%d') 175 | item['title'] = ltos(response.xpath('//h1/text()').extract()) 176 | content_temp = "".join([tt.strip() for tt in response.xpath( 177 | '//div[@class="ct02"]/font/div/div|//div[@class="ct02"]/font/div').extract()]) 178 | item['content'] = filter_tags(content_temp) 179 | hc = ltos(response.xpath('//div[@class="ct02"]').extract()) 180 | htmlcontent = clean_html(hc) 181 | # 特殊构造,不作为分组 182 | # (?=...)之后的字符串需要匹配表达式才能成功匹配 183 | # (?<=...)之前的字符串需要匹配表达式才能成功匹配 184 | pat_img = re.compile(r'( link_list url=%s' % response.url, logging.INFO) 215 | links = response.xpath('//div[@class="list"]') 216 | for link in links: 217 | url = 
link.xpath('div[1]/a/@href').extract()[0] 218 | url = 'http://www.haoyao.net/news/' + url.split('/')[-1] 219 | self.log('+++++++++++url=' + url, logging.INFO) 220 | date_str = (link.xpath('div[2]/text()').extract()[0]).strip() + ' 00:00:00' 221 | self.log('+++++++++++date_str=' + date_str, logging.INFO) 222 | yield Request(url=url, meta={'ds': date_str}, callback=self.parse_item_page) 223 | 224 | def parse_item_page(self, response): 225 | dstr = response.meta['ds'] 226 | try: 227 | self.log('-------------------> link_page url=%s' % response.url, logging.INFO) 228 | item = NewsItem() 229 | item['crawlkey'] = self.name 230 | item['category'] = '医药新闻' 231 | item['link'] = response.url 232 | item['location'] = ltos(response.xpath('//font[@color="#666666"]/a/text()').extract()) 233 | item['pubdate'] = datetime.strptime(dstr, '%Y-%m-%d %H:%M:%S') 234 | item['title'] = ltos(response.xpath('//span[@id="lblTitle"]/text()').extract()) 235 | content_temp = "".join([tt.strip() for tt in response.xpath( 236 | '//span[@id="spContent"]/p').extract()]) 237 | item['content'] = filter_tags(content_temp) 238 | htmlcontent = ltos(response.xpath('//div[@id="divContent"]').extract()) 239 | item['htmlcontent'] = clean_html(htmlcontent) 240 | # 特殊构造,不作为分组 241 | # (?=...)之后的字符串需要匹配表达式才能成功匹配 242 | # (?<=...)之前的字符串需要匹配表达式才能成功匹配 243 | pat_img = re.compile(r'( 6: 40 | break 41 | count += 1 42 | item = JokeItem() 43 | title = jk.xpath('*[1]/text()').extract_first().encode('utf-8') 44 | pic_content = jk.xpath('a[2]/img') 45 | txt_content = jk.xpath('a[2]/p') 46 | img_src = None 47 | if pic_content: 48 | item['image_urls'] = pic_content.xpath('@src').extract() 49 | full_imgurl = item['image_urls'][0] 50 | img_src = full_imgurl 51 | filename = os.path.basename(item['image_urls'][0]) 52 | self.log('-------------' + full_imgurl, logging.INFO) 53 | with contextlib.closing(request.urlopen(full_imgurl)) as f: 54 | with open(os.path.join(IMAGES_STORE, filename), 'wb') as bfile: 55 | bfile.write(f.read()) 56 | item['content'] = '

%s

' % title 57 | else: 58 | content = '<br/>'.join(txt_content.xpath('text()').extract().encode('utf-8')) 59 | strong_txt = txt_content.xpath('strong/text()').extract_first() 60 | if strong_txt: 61 | content = '%s<br/>' % strong_txt.encode('utf-8') + content 62 | item['content'] = '

%s

%s' % (title, content) 63 | items.append(item) 64 | jokelist.append((item['content'], img_src)) 65 | send_mail(jokelist) 66 | return items 67 | -------------------------------------------------------------------------------- /coolscrapy/spiders/js_spider.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- encoding: utf-8 -*- 3 | """ 4 | Topic: 对于js异步加载网页的支持 5 | Desc : 爬取京东网首页,下面内容基本都是异步加载的,我选取“猜你喜欢”这个异步加载内容来测试 6 | """ 7 | import logging 8 | import re 9 | import json 10 | import base64 11 | import scrapy 12 | from scrapy_splash import SplashRequest 13 | 14 | 15 | class JsSpider(scrapy.Spider): 16 | name = "jd" 17 | allowed_domains = ["jd.com"] 18 | start_urls = [ 19 | "http://www.jd.com/" 20 | ] 21 | 22 | def start_requests(self): 23 | splash_args = { 24 | 'wait': 0.5, 25 | # 'http_method': 'GET', 26 | # 'html': 1, 27 | # 'png': 1, 28 | # 'width': 600, 29 | # 'render_all': 1, 30 | } 31 | for url in self.start_urls: 32 | yield SplashRequest(url, self.parse_result, endpoint='render.html', 33 | args=splash_args) 34 | 35 | def parse_result(self, response): 36 | logging.info(u'----------使用splash爬取京东网首页异步加载内容-----------') 37 | guessyou = response.xpath('//div[@id="guessyou"]/div[1]/h2/text()').extract_first() 38 | logging.info(u"find:%s" % guessyou) 39 | logging.info(u'---------------success----------------') 40 | 41 | 42 | if __name__ == '__main__': 43 | body = u'发布于: 2016年04月08日' 44 | pat4 = re.compile(r'\d{4}年\d{2}月\d{2}日') 45 | if (re.search(pat4, body)): 46 | print(re.search(pat4, body).group()) 47 | 48 | -------------------------------------------------------------------------------- /coolscrapy/spiders/link_spider.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- encoding: utf-8 -*- 3 | """ 4 | Topic: 爬取链接的蜘蛛 5 | Desc : 6 | """ 7 | import logging 8 | from coolscrapy.items import HuxiuItem 9 | import scrapy 10 | from scrapy.spiders import CrawlSpider, Rule 11 | from scrapy.linkextractors import LinkExtractor 12 | 13 | 14 | class LinkSpider(CrawlSpider): 15 | name = "link" 16 | allowed_domains = ["huxiu.com"] 17 | start_urls = [ 18 | "http://www.huxiu.com/index.php" 19 | ] 20 | 21 | rules = ( 22 | # 提取匹配正则式'/group?f=index_group'链接 (但是不能匹配'deny.html') 23 | # 并且会递归爬取(如果没有定义callback,默认follow=True). 24 | Rule(LinkExtractor(allow=('/group?f=index_group', ), deny=('deny\.html', ))), 25 | # 提取匹配'/article/\d+/\d+.html'的链接,并使用parse_item来解析它们下载后的内容,不递归 26 | Rule(LinkExtractor(allow=('/article/\d+/\d+\.html', )), callback='parse_item'), 27 | ) 28 | 29 | def parse_item(self, response): 30 | self.logger.info('Hi, this is an item page! 
%s', response.url) 31 | detail = response.xpath('//div[@class="article-wrap"]') 32 | item = HuxiuItem() 33 | item['title'] = detail.xpath('h1/text()')[0].extract() 34 | item['link'] = response.url 35 | item['published'] = detail.xpath( 36 | 'div[@class="article-author"]/span[@class="article-time"]/text()')[0].extract() 37 | logging.info(item['title'],item['link'],item['published']) 38 | yield item 39 | 40 | 41 | -------------------------------------------------------------------------------- /coolscrapy/spiders/login1_spider.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- encoding: utf-8 -*- 3 | """ 4 | Topic: 登录爬虫 5 | Desc : 模拟登录https://github.com后将自己的issue全部爬出来 6 | tips:使用chrome调试post表单的时候勾选Preserve log和Disable cache 7 | """ 8 | import logging 9 | import re 10 | import sys 11 | import scrapy 12 | from scrapy.spiders import CrawlSpider, Rule 13 | from scrapy.linkextractors import LinkExtractor 14 | from scrapy.http import Request, FormRequest, HtmlResponse 15 | 16 | logging.basicConfig(level=logging.INFO, 17 | format='%(asctime)s %(filename)s[line:%(lineno)d] %(levelname)s %(message)s', 18 | datefmt='%Y-%m-%d %H:%M:%S', 19 | handlers=[logging.StreamHandler(sys.stdout)]) 20 | 21 | 22 | class GithubSpider(CrawlSpider): 23 | name = "github" 24 | allowed_domains = ["github.com"] 25 | start_urls = [ 26 | 'https://github.com/issues', 27 | ] 28 | rules = ( 29 | # 消息列表 30 | Rule(LinkExtractor(allow=('/issues/\d+',), 31 | restrict_xpaths='//ul[starts-with(@class, "table-list")]/li/div[2]/a[2]'), 32 | callback='parse_page'), 33 | # 下一页, If callback is None follow defaults to True, otherwise it defaults to False 34 | Rule(LinkExtractor(restrict_xpaths='//a[@class="next_page"]')), 35 | ) 36 | post_headers = { 37 | "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8", 38 | "Accept-Encoding": "gzip, deflate", 39 | "Accept-Language": "zh-CN,zh;q=0.8,en;q=0.6", 40 | "Cache-Control": "no-cache", 41 | "Connection": "keep-alive", 42 | "Content-Type": "application/x-www-form-urlencoded", 43 | "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.75 Safari/537.36", 44 | "Referer": "https://github.com/", 45 | } 46 | 47 | # 重写了爬虫类的方法, 实现了自定义请求, 运行成功后会调用callback回调函数 48 | def start_requests(self): 49 | return [Request("https://github.com/login", 50 | meta={'cookiejar': 1}, callback=self.post_login)] 51 | 52 | # FormRequeset 53 | def post_login(self, response): 54 | # 先去拿隐藏的表单参数authenticity_token 55 | authenticity_token = response.xpath( 56 | '//input[@name="authenticity_token"]/@value').extract_first() 57 | logging.info('authenticity_token=' + authenticity_token) 58 | # FormRequeset.from_response是Scrapy提供的一个函数, 用于post表单 59 | # 登陆成功后, 会调用after_login回调函数,如果url跟Request页面的一样就省略掉 60 | return [FormRequest.from_response(response, 61 | url='https://github.com/session', 62 | meta={'cookiejar': response.meta['cookiejar']}, 63 | headers=self.post_headers, # 注意此处的headers 64 | formdata={ 65 | 'utf8': '✓', 66 | 'login': 'yidao620c', 67 | 'password': '******', 68 | 'authenticity_token': authenticity_token 69 | }, 70 | callback=self.after_login, 71 | dont_filter=True 72 | )] 73 | 74 | def after_login(self, response): 75 | # 登录之后,开始进入我要爬取的私信页面 76 | for url in self.start_urls: 77 | logging.info('letter url=' + url) 78 | # 因为我们上面定义了Rule,所以只需要简单的生成初始爬取Request即可 79 | # yield self.make_requests_from_url(url) 80 | yield Request(url, meta={'cookiejar': 
response.meta['cookiejar']}) 81 | # 如果是普通的Spider,而不是CrawlerSpider,没有定义Rule规则, 82 | # 那么就需要像下面这样定义每个Request的callback 83 | # yield Request(url, dont_filter=True, 84 | # # meta={'dont_redirect': True, 85 | # # 'handle_httpstatus_list': [302]}, 86 | # callback=self.parse_page, ) 87 | 88 | def parse_page(self, response): 89 | """这个是使用LinkExtractor自动处理链接以及`下一页`""" 90 | logging.info(u'--------------消息分割线-----------------') 91 | logging.info(response.url) 92 | issue_title = response.xpath( 93 | '//span[@class="js-issue-title"]/text()').extract_first() 94 | logging.info(u'issue_title:' + issue_title.encode('utf-8')) 95 | 96 | # def parse_page(self, response): 97 | # """这个是不使用LinkExtractor我自己手动处理链接以及下一页""" 98 | # logging.info(response.url) 99 | # for each_msg in response.xpath('//ul[@class="Msgs"]/li'): 100 | # logging.info('--------------消息分割线-----------------') 101 | # logging.info(''.join(each_msg.xpath('.//div[@class="msg"]//*/text()').extract())) 102 | # next_page = response.xpath('//li[@class="page next"]/a') 103 | # if next_page: 104 | # logging.info(u'继续处理下一页') 105 | # yield Request(response.url + next_page.xpath('@href').extract()) 106 | # else: 107 | # logging.info(u"已经处理完成,没有下一页了") 108 | 109 | def _requests_to_follow(self, response): 110 | """重写加入cookiejar的更新""" 111 | if not isinstance(response, HtmlResponse): 112 | return 113 | seen = set() 114 | for n, rule in enumerate(self._rules): 115 | links = [l for l in rule.link_extractor.extract_links(response) if l not in seen] 116 | if links and rule.process_links: 117 | links = rule.process_links(links) 118 | for link in links: 119 | seen.add(link) 120 | r = Request(url=link.url, callback=self._response_downloaded) 121 | # 下面这句是我重写的 122 | r.meta.update(rule=n, link_text=link.text, cookiejar=response.meta['cookiejar']) 123 | yield rule.process_request(r) 124 | -------------------------------------------------------------------------------- /coolscrapy/spiders/login2_spider.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- encoding: utf-8 -*- 3 | """ 4 | Topic: 登录爬虫 5 | Desc : 模拟登录http://www.iteye.com后将自己的私信全部爬出来 6 | tips:使用chrome调试post表单的时候勾选Preserve log和Disable cache 7 | """ 8 | import logging 9 | import re 10 | import sys 11 | import scrapy 12 | from scrapy.spiders import CrawlSpider, Rule 13 | from scrapy.linkextractors import LinkExtractor 14 | from scrapy.http import Request, FormRequest, HtmlResponse 15 | 16 | logging.basicConfig(level=logging.INFO, 17 | format='%(asctime)s %(filename)s[line:%(lineno)d] %(levelname)s %(message)s', 18 | datefmt='%Y-%m-%d %H:%M:%S', 19 | handlers=[logging.StreamHandler(sys.stdout)]) 20 | 21 | 22 | class IteyeSpider(CrawlSpider): 23 | name = "iteye" 24 | allowed_domains = ["iteye.com"] 25 | start_urls = [ 26 | 'http://my.iteye.com/messages', 27 | 'http://my.iteye.com/messages/store', 28 | ] 29 | rules = ( 30 | # 消息列表 31 | Rule(LinkExtractor(allow=('/messages/\d+',), 32 | restrict_xpaths='//table[@class="admin"]/tbody/tr/td[2]'), 33 | callback='parse_page'), 34 | # 下一页, If callback is None follow defaults to True, otherwise it defaults to False 35 | Rule(LinkExtractor(restrict_xpaths='//a[@class="next_page"]')), 36 | ) 37 | request_headers = { 38 | "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko", 39 | "Referer": "http://www.iteye.com/login", 40 | } 41 | 42 | post_headers = { 43 | "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8", 44 | "Accept-Encoding": "gzip, 
deflate", 45 | "Accept-Language": "zh-CN,zh;q=0.8,en;q=0.6", 46 | "Cache-Control": "no-cache", 47 | "Connection": "keep-alive", 48 | "Content-Type": "application/x-www-form-urlencoded", 49 | "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.75 Safari/537.36", 50 | "Referer": "http://www.iteye.com/", 51 | } 52 | 53 | # 重写了爬虫类的方法, 实现了自定义请求, 运行成功后会调用callback回调函数 54 | def start_requests(self): 55 | return [Request("http://www.iteye.com/login", 56 | meta={'cookiejar': 1}, callback=self.post_login)] 57 | 58 | # FormRequeset 59 | def post_login(self, response): 60 | # 先去拿隐藏的表单参数authenticity_token 61 | authenticity_token = response.xpath( 62 | '//input[@name="authenticity_token"]/@value').extract_first() 63 | logging.info('authenticity_token=' + authenticity_token) 64 | # FormRequeset.from_response是Scrapy提供的一个函数, 用于post表单 65 | # 登陆成功后, 会调用after_login回调函数,如果url跟Request页面的一样就省略掉 66 | return [FormRequest.from_response(response, 67 | url='http://www.iteye.com/login', 68 | meta={'cookiejar': response.meta['cookiejar']}, 69 | headers=self.post_headers, # 注意此处的headers 70 | formdata={ 71 | 'name': 'yidao620c', 72 | 'password': '******', 73 | 'authenticity_token': authenticity_token 74 | }, 75 | callback=self.after_login, 76 | dont_filter=True 77 | )] 78 | 79 | def after_login(self, response): 80 | # logging.info(response.body.encode('utf-8')) 81 | # 登录之后,开始进入我要爬取的私信页面 82 | # 对于登录成功后的页面我不感兴趣,所以这里response没啥用 83 | for url in self.start_urls: 84 | logging.info('letter url=' + url) 85 | # 因为我们上面定义了Rule,所以只需要简单的生成初始爬取Request即可 86 | yield Request(url, meta={'cookiejar': response.meta['cookiejar']}) 87 | # 如果是普通的Spider,而不是CrawlerSpider,没有定义Rule规则, 88 | # 那么就需要像下面这样定义每个Request的callback 89 | # yield Request(url, dont_filter=True, 90 | # callback=self.parse_page, ) 91 | 92 | def parse_page(self, response): 93 | """这个是使用LinkExtractor自动处理链接以及`下一页`""" 94 | logging.info(u'--------------消息分割线-----------------') 95 | logging.info(response.url) 96 | logging.info(response.xpath('//a[@href="/messages/new"]/text()').extract_first()) 97 | # msg_time = response.xpath( 98 | # '//div[@id="main"]/table[1]/tbody/tr[1]/td[1]/text()').extract_first() 99 | # logging.info(msg_time) 100 | # msg_title = response.xpath( 101 | # '//div[@id="main"]/table[1]/tbody/tr[2]/th[2]/span/text()').extract_first() 102 | # logging.info(msg_title) 103 | 104 | # def parse_page(self, response): 105 | # """这个是不使用LinkExtractor我自己手动处理链接以及下一页""" 106 | # logging.info(response.url) 107 | # for each_msg in response.xpath('//ul[@class="Msgs"]/li'): 108 | # logging.info('--------------消息分割线-----------------') 109 | # logging.info(''.join(each_msg.xpath('.//div[@class="msg"]//*/text()').extract())) 110 | # next_page = response.xpath('//li[@class="page next"]/a') 111 | # if next_page: 112 | # logging.info(u'继续处理下一页') 113 | # yield Request(response.url + next_page.xpath('@href').extract()) 114 | # else: 115 | # logging.info(u"已经处理完成,没有下一页了") 116 | 117 | def _requests_to_follow(self, response): 118 | """重写加入cookiejar的更新""" 119 | if not isinstance(response, HtmlResponse): 120 | return 121 | seen = set() 122 | for n, rule in enumerate(self._rules): 123 | links = [l for l in rule.link_extractor.extract_links(response) if l not in seen] 124 | if links and rule.process_links: 125 | links = rule.process_links(links) 126 | for link in links: 127 | seen.add(link) 128 | r = Request(url=link.url, callback=self._response_downloaded) 129 | # 下面这句是我重写的 130 | r.meta.update(rule=n, link_text=link.text, 
cookiejar=response.meta['cookiejar']) 131 | yield rule.process_request(r) 132 | -------------------------------------------------------------------------------- /coolscrapy/spiders/test_spider.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- encoding: utf-8 -*- 3 | """ 4 | Topic: 爬虫小测试类 5 | Desc : 6 | """ 7 | import logging 8 | import scrapy 9 | import re 10 | 11 | 12 | class TestSpider(scrapy.Spider): 13 | name = "test" 14 | allowed_domains = ["jd.com"] 15 | start_urls = [ 16 | "http://www.jd.com/" 17 | ] 18 | 19 | def parse(self, response): 20 | logging.info(u'---------我这个是简单的直接获取京东网首页测试---------') 21 | guessyou = response.xpath('//div[@id="guessyou"]/div[1]/h2/text()').extract_first() 22 | logging.info(u"find:%s" % guessyou) 23 | logging.info(u'---------------success----------------') 24 | 25 | if __name__ == '__main__': 26 | body = u'发布于: 2016年04月08日' 27 | pat4 = re.compile(r'\d{4}年\d{2}月\d{2}日') 28 | if (re.search(pat4, body)): 29 | print(re.search(pat4, body).group()) 30 | -------------------------------------------------------------------------------- /coolscrapy/spiders/tobacco_spider.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- encoding: utf-8 -*- 3 | """ 4 | Topic: 烟草条形码爬虫 5 | -- 烟草表 6 | DROP TABLE IF EXISTS `t_tobacco`; 7 | CREATE TABLE `t_tobacco` ( 8 | `id` BIGINT PRIMARY KEY AUTO_INCREMENT COMMENT '主键ID', 9 | `product_name` VARCHAR(32) COMMENT '产品名称', 10 | `brand` VARCHAR(32) COMMENT '品牌', 11 | `product_type` VARCHAR(32) COMMENT '产品类型', 12 | `package_spec` VARCHAR(64) COMMENT '包装规格', 13 | `reference_price` VARCHAR(32) COMMENT '参考价格', 14 | `manufacturer` VARCHAR(32) COMMENT '生产厂家', 15 | `pics` VARCHAR(255) COMMENT '图片URL', 16 | `created_time` DATETIME DEFAULT CURRENT_TIMESTAMP COMMENT '创建时间', 17 | `updated_time` DATETIME DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP COMMENT '更新时间' 18 | ) ENGINE=InnoDB AUTO_INCREMENT=1 DEFAULT CHARSET=utf8 COMMENT='烟草表'; 19 | 20 | -- 烟草条形码表 21 | DROP TABLE IF EXISTS `t_barcode`; 22 | CREATE TABLE `t_barcode` ( 23 | `id` BIGINT PRIMARY KEY AUTO_INCREMENT COMMENT '主键ID', 24 | `tobacco_id` BIGINT COMMENT '香烟产品ID', 25 | `barcode` VARCHAR(32) COMMENT '条形码', 26 | `btype` VARCHAR(32) COMMENT '类型 小盒条形码/条包条形码', 27 | `created_time` DATETIME DEFAULT CURRENT_TIMESTAMP COMMENT '创建时间', 28 | `updated_time` DATETIME DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP COMMENT '更新时间' 29 | ) ENGINE=InnoDB AUTO_INCREMENT=1 DEFAULT CHARSET=utf8 COMMENT='烟草条形码表'; 30 | """ 31 | from scrapy import Request 32 | 33 | from coolscrapy.utils import parse_text, tx 34 | from scrapy.spiders import CrawlSpider, Rule, Spider 35 | from scrapy.linkextractors import LinkExtractor 36 | from coolscrapy.items import Article, TobaccoItem 37 | 38 | 39 | class TobaccoSpider(Spider): 40 | """ 41 | 爬取本页面内容,然后再提取下一页链接生成新的Request标准做法 42 | 另外还使用了图片下载管道 43 | """ 44 | name = "tobacco" 45 | allowed_domains = ["etmoc.com"] 46 | base_url = "http://www.etmoc.com/market/Brandlist.asp" 47 | start_urls = [ 48 | "http://www.etmoc.com/market/Brandlist.asp?page=86&worded=&temp=" 49 | ] 50 | pics_pre = 'http://www.etmoc.com/' 51 | 52 | def parse(self, response): 53 | # 处理本页内容 54 | for item in self.parse_page(response): 55 | yield item 56 | # 找下一页链接递归爬 57 | next_url = tx(response.xpath('//a[text()="【下一页】"]/@href')) 58 | if next_url: 59 | self.logger.info('+++++++++++next_url++++++++++=' + self.base_url + next_url) 60 | yield Request(url=self.base_url 
+ next_url, meta={'ds': "ds"}, callback=self.parse) 61 | 62 | def parse_page(self, response): 63 | self.logger.info('Hi, this is a page = %s', response.url) 64 | items = [] 65 | for ind, each_row in enumerate(response.xpath('//div[@id="mainlist"]/table/tbody/tr')): 66 | if ind == 0: 67 | continue 68 | item = TobaccoItem() 69 | item['pics'] = self.pics_pre + tx(each_row.xpath('td[1]/a/img/@src'))[3:] 70 | product_name = tx(each_row.xpath('td[2]/p[1]/text()')) 71 | brand = tx(each_row.xpath('td[2]/p[2]/a/text()')) 72 | barcode1 = tx(each_row.xpath('td[2]/p[3]/text()')) 73 | barcode2 = tx(each_row.xpath('td[2]/p[4]/text()')) 74 | item['product'] = "{}/{}/{}/{}".format(product_name, brand, barcode1, barcode2) 75 | item['product_type'] = tx(each_row.xpath('td[3]/text()')) 76 | item['package_spec'] = tx(each_row.xpath('td[4]/text()')) 77 | item['reference_price'] = tx(each_row.xpath('td[5]/span/text()')) 78 | # 生产厂家有可能包含链接,我取里面的文本,使用//text() 79 | item['manufacturer'] = tx(each_row.xpath('td[6]//text()')) 80 | self.logger.info("pics={},product={},product_type={},package_spec={}," 81 | "reference_price={},manufacturer={}".format( 82 | item['pics'], item['product'], item['product_type'], item['package_spec'] 83 | , item['reference_price'], item['manufacturer'])) 84 | items.append(item) 85 | return items 86 | -------------------------------------------------------------------------------- /coolscrapy/spiders/xml_spider.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- encoding: utf-8 -*- 3 | """ 4 | Topic: 爬取XML订阅的蜘蛛 5 | Desc : 6 | """ 7 | from coolscrapy.items import BlogItem 8 | import scrapy 9 | from scrapy.spiders import XMLFeedSpider 10 | 11 | 12 | class XMLSpider(XMLFeedSpider): 13 | name = "xml" 14 | namespaces = [('atom', 'http://www.w3.org/2005/Atom')] 15 | allowed_domains = ["github.io"] 16 | start_urls = [ 17 | "http://yidao620c.github.io/atom.xml" 18 | ] 19 | iterator = 'xml' # 缺省的iternodes,貌似对于有namespace的xml不行 20 | itertag = 'atom:entry' 21 | 22 | def parse_node(self, response, node): 23 | # self.logger.info('Hi, this is a <%s> node!', self.itertag) 24 | item = BlogItem() 25 | item['title'] = node.xpath('atom:title/text()')[0].extract() 26 | item['link'] = node.xpath('atom:link/@href')[0].extract() 27 | item['id'] = node.xpath('atom:id/text()')[0].extract() 28 | item['published'] = node.xpath('atom:published/text()')[0].extract() 29 | item['updated'] = node.xpath('atom:updated/text()')[0].extract() 30 | self.logger.info('|'.join([item['title'],item['link'],item['id'],item['published']])) 31 | return item 32 | 33 | -------------------------------------------------------------------------------- /coolscrapy/utils.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- encoding: utf-8 -*- 3 | """ 4 | Topic: 一些工具类 5 | Desc : 6 | """ 7 | import re 8 | import sys 9 | import smtplib 10 | from contextlib import contextmanager 11 | from email.mime.multipart import MIMEMultipart 12 | from email.mime.text import MIMEText 13 | from email.mime.image import MIMEImage 14 | import os.path 15 | from coolscrapy.settings import IMAGES_STORE 16 | from coolscrapy.models import ArticleRule 17 | from coolscrapy.models import db_connect, create_news_table 18 | from sqlalchemy.orm import sessionmaker 19 | 20 | 21 | def filter_tags(htmlstr): 22 | """更深层次的过滤,类似instapaper或者readitlater这种服务,很有意思的研究课题 23 | 过滤HTML中的标签 24 | 将HTML中标签等信息去掉 25 | @param htmlstr HTML字符串. 
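A rough illustration (the exact output depends on the regexes below):
filter_tags('<p>Hello<br/>world</p><script>x()</script>') gives roughly
'Hello\nworld\n': scripts, styles, comments and tags are dropped, paragraph
and <br> boundaries become newlines, and blank lines are collapsed.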
26 | """ 27 | # 先过滤CDATA 28 | re_pp = re.compile('

', re.I) # 段落结尾 29 | re_cdata = re.compile('//]*//\]\]>', re.I) # 匹配CDATA 30 | re_script = re.compile('<\s*script[^>]*>[^<]*<\s*/\s*script\s*>', re.I) # Script 31 | re_style = re.compile('<\s*style[^>]*>[^<]*<\s*/\s*style\s*>', re.I) # style 32 | re_br = re.compile('') # 处理换行 33 | re_h = re.compile(']*>') # HTML标签 34 | re_comment = re.compile('') # HTML注释 35 | 36 | s = re_pp.sub('\n', htmlstr) # 段落结尾变换行符 37 | s = re_cdata.sub('', s) # 去掉CDATA 38 | s = re_script.sub('', s) # 去掉SCRIPT 39 | s = re_style.sub('', s) # 去掉style 40 | s = re_br.sub('\n', s) # 将br转换为换行 41 | s = re_h.sub('', s) # 去掉HTML 标签 42 | s = re_comment.sub('', s) # 去掉HTML注释 43 | # 去掉多余的空行 44 | blank_line = re.compile('\n+') 45 | s = blank_line.sub('\n', s) 46 | s = replace_charentity(s) # 替换实体 47 | return "".join([t.strip() + '\n' for t in s.split('\n') if t.strip() != '']) 48 | 49 | 50 | def replace_charentity(htmlstr): 51 | """ 52 | ##替换常用HTML字符实体. 53 | #使用正常的字符替换HTML中特殊的字符实体. 54 | #你可以添加新的实体字符到CHAR_ENTITIES中,处理更多HTML字符实体. 55 | #@param htmlstr HTML字符串. 56 | """ 57 | char_entities = {'nbsp': ' ', '160': ' ', 58 | 'lt': '<', '60': '<', 59 | 'gt': '>', '62': '>', 60 | 'amp': '&', '38': '&', 61 | 'quot': '"', '34': '"',} 62 | 63 | re_charentity = re.compile(r'&#?(?P\w+);') 64 | sz = re_charentity.search(htmlstr) 65 | while sz: 66 | entity = sz.group() # entity全称,如> 67 | key = sz.group('name') # 去除&;后entity,如>为gt 68 | try: 69 | htmlstr = re_charentity.sub(char_entities[key], htmlstr, 1) 70 | sz = re_charentity.search(htmlstr) 71 | except KeyError: 72 | # 以空串代替 73 | htmlstr = re_charentity.sub('', htmlstr, 1) 74 | sz = re_charentity.search(htmlstr) 75 | return htmlstr 76 | 77 | 78 | pat1 = re.compile(r'
(?:.|\n)*?
') 79 | pat2 = re.compile(r'