├── .gitignore
├── README.md
├── __init__.py
├── db.sql
├── model
│   ├── __init__.py
│   ├── article.py
│   ├── config.py
│   └── rule.py
├── pipelines.py
├── run.py
└── spiders
    ├── __init__.py
    ├── deep_spider.py
    └── dmoz_spider.py

/.gitignore:
--------------------------------------------------------------------------------
1 | *.pyc
2 | .idea/
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Scrapy Dynamic Configurable
2 | 
3 | A dynamically configurable news crawler based on Scrapy. See the [blog post](http://wuchong.me/blog/2015/05/22/running-scrapy-programmatically) for more details.
4 | 
5 | ## Requirements
6 | 
7 | - Scrapy
8 | - MySQL
9 | - Redis
10 | - SQLAlchemy
11 | 
12 | ## Install in development
13 | 
14 | **Mac OS X, use Homebrew**
15 | 
16 | ```bash
17 | $ ruby -e "$(curl -fsSL https://raw.githubusercontent.com/Homebrew/install/master/install)"
18 | $ brew install mysql
19 | $ brew install redis
20 | $ sudo pip install scrapy
21 | $ sudo pip install SQLAlchemy
22 | $ sudo pip install redis
23 | ```
24 | **Ubuntu**
25 | 
26 | ```bash
27 | $ sudo apt-get install redis-server mysql-server mysql-client
28 | $ pip install scrapy
29 | $ pip install SQLAlchemy
30 | $ pip install redis
31 | ```
32 | **Then restore the database tables from `db.sql`.**
33 | 
34 | This project targets Scrapy 1.0.
35 | 
--------------------------------------------------------------------------------
/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wuchong/scrapy-dynamic-configurable/801a275d774e863360c28790c4522342aa26123b/__init__.py
--------------------------------------------------------------------------------
/db.sql:
--------------------------------------------------------------------------------
1 | # ************************************************************
2 | # Sequel Pro SQL dump
3 | # Version 4096
4 | #
5 | # http://www.sequelpro.com/
6 | # http://code.google.com/p/sequel-pro/
7 | #
8 | # Host: 127.0.0.1 (MySQL 5.6.22)
9 | # Database: spider
10 | # Generation Time: 2015-05-22 14:07:57 +0000
11 | # ************************************************************
12 | 
13 | 
14 | /*!40101 SET @OLD_CHARACTER_SET_CLIENT=@@CHARACTER_SET_CLIENT */;
15 | /*!40101 SET @OLD_CHARACTER_SET_RESULTS=@@CHARACTER_SET_RESULTS */;
16 | /*!40101 SET @OLD_COLLATION_CONNECTION=@@COLLATION_CONNECTION */;
17 | /*!40101 SET NAMES utf8 */;
18 | /*!40014 SET @OLD_FOREIGN_KEY_CHECKS=@@FOREIGN_KEY_CHECKS, FOREIGN_KEY_CHECKS=0 */;
19 | /*!40101 SET @OLD_SQL_MODE=@@SQL_MODE, SQL_MODE='NO_AUTO_VALUE_ON_ZERO' */;
20 | /*!40111 SET @OLD_SQL_NOTES=@@SQL_NOTES, SQL_NOTES=0 */;
21 | 
22 | 
23 | # Dump of table articles
24 | # ------------------------------------------------------------
25 | 
26 | DROP TABLE IF EXISTS `articles`;
27 | 
28 | CREATE TABLE `articles` (
29 |   `id` int(11) NOT NULL AUTO_INCREMENT,
30 |   `title` varchar(255) COLLATE utf8_unicode_ci DEFAULT NULL,
31 |   `url` varchar(255) COLLATE utf8_unicode_ci DEFAULT NULL,
32 |   `body` text COLLATE utf8_unicode_ci,
33 |   `publish_time` varchar(255) COLLATE utf8_unicode_ci DEFAULT NULL,
34 |   `source_site` varchar(255) COLLATE utf8_unicode_ci DEFAULT NULL,
35 |   PRIMARY KEY (`id`)
36 | ) ENGINE=InnoDB DEFAULT CHARSET=utf8 COLLATE=utf8_unicode_ci;
37 | 
38 | 
39 | 
40 | # Dump of table rules
41 | # ------------------------------------------------------------
42 | 
43 | DROP TABLE IF EXISTS `rules`;
44 | 
45 | CREATE TABLE `rules` (
46 |   `id` int(11) NOT NULL AUTO_INCREMENT,
47 |   `name` varchar(255) COLLATE utf8_unicode_ci DEFAULT NULL,
48 |   `allow_domains` varchar(255) COLLATE utf8_unicode_ci DEFAULT NULL,
49 |   `start_urls` text COLLATE utf8_unicode_ci,
50 |   `next_page` varchar(255) COLLATE utf8_unicode_ci DEFAULT NULL,
51 |   `allow_url` varchar(255) COLLATE utf8_unicode_ci DEFAULT NULL,
52 |   `extract_from` varchar(255) COLLATE utf8_unicode_ci DEFAULT NULL,
53 |   `title_xpath` varchar(255) COLLATE utf8_unicode_ci DEFAULT NULL,
54 |   `body_xpath` varchar(255) COLLATE utf8_unicode_ci DEFAULT NULL,
55 |   `publish_time_xpath` varchar(255) COLLATE utf8_unicode_ci DEFAULT NULL,
56 |   `source_site_xpath` varchar(255) COLLATE utf8_unicode_ci DEFAULT NULL,
57 |   `enable` tinyint(1) DEFAULT NULL,
58 |   PRIMARY KEY (`id`)
59 | ) ENGINE=InnoDB DEFAULT CHARSET=utf8 COLLATE=utf8_unicode_ci;
60 | 
61 | LOCK TABLES `rules` WRITE;
62 | /*!40000 ALTER TABLE `rules` DISABLE KEYS */;
63 | 
64 | INSERT INTO `rules` (`id`, `name`, `allow_domains`, `start_urls`, `next_page`, `allow_url`, `extract_from`, `title_xpath`, `body_xpath`, `publish_time_xpath`, `source_site_xpath`, `enable`)
65 | VALUES
66 |   (1,'中国会展门户','cnena.com','http://www.cnena.com/news/list-htm-fid-55.html','//a[@title=\'下一页\']','bencandy-htm-fid-.*\\.html','//*[@id=\"sort_list\"]','//div[@class=\"content\"]//h1//text()','//*[@id=\"divcontent\"]//div//text()','//div[@class=\"content\"]//div[@class=\"content_info\"]/text()','//div[@class=\"content\"]//div[@class=\"content_info\"]/a/text()',0),
67 |   (2,'中国会展网','expo-china.com','http://www.expo-china.com/web/news/news_list.aspx','','\\/pages\\/news\\/\\d{6}\\/\\d*\\/index.shtml','//*[@id=\"NewsView_GNZX_bianju\"]/div','//*[@id=\"NewsView_HDTK_bianju\"]/div[1]/h2/text()','//*[@id=\"NewsView_HDTK_bianju\"]/div[3]/p//text()','//*[@id=\"NewsView_HDTK_bianju\"]/div[1]/span[1]/text()','//*[@id=\"NewsView_HDTK_bianju\"]/div[1]/span[2]/text()',0),
68 |   (3,'成都会展','cdexpo.com.cn','http://www.cdexpo.com.cn/article-1-35-1.html','//a[@class=\"next\"]','article-detail-.*\\.html','//*[@id=\"news-list\"]/div','//*[@id=\"content\"]//div[@class=\"detail-tit\"]/h3/text()','//*[@id=\"content\"]//div[@class=\"detail-content\"]/p/text()','//*[@id=\"content\"]/div/div[1]/div/div/div[2]/p/span[1]/text()','//*[@id=\"content\"]/div/div[1]/div/div/div[2]/p/span[2]/text()',1);
69 | 
70 | /*!40000 ALTER TABLE `rules` ENABLE KEYS */;
71 | UNLOCK TABLES;
72 | 
73 | 
74 | 
75 | /*!40111 SET SQL_NOTES=@OLD_SQL_NOTES */;
76 | /*!40101 SET SQL_MODE=@OLD_SQL_MODE */;
77 | /*!40014 SET FOREIGN_KEY_CHECKS=@OLD_FOREIGN_KEY_CHECKS */;
78 | /*!40101 SET CHARACTER_SET_CLIENT=@OLD_CHARACTER_SET_CLIENT */;
79 | /*!40101 SET CHARACTER_SET_RESULTS=@OLD_CHARACTER_SET_RESULTS */;
80 | /*!40101 SET COLLATION_CONNECTION=@OLD_COLLATION_CONNECTION */;
81 | 
--------------------------------------------------------------------------------
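The `rules` table above is the heart of the project: each row fully describes one target site (start URLs, the "next page" XPath, the article-URL regex, and the field XPaths), and db.sql ships three sample rows of which only id 3 is enabled. As a minimal sketch of how a new site could be registered through the SQLAlchemy model defined in model/rule.py below (not part of the repo; all names, URLs, XPaths and regexes here are placeholders for a hypothetical example.com):

```python
# add_rule.py -- hypothetical helper, not shipped with the repo.
from model.config import DBSession
from model.rule import Rule

session = DBSession()
session.add(Rule(
    name='example-site',                        # placeholder values throughout
    allow_domains='example.com',
    start_urls='http://www.example.com/news/list-1.html',
    next_page='//a[@class="next"]',             # XPath of the "next page" link
    allow_url='article-\\d+\\.html',            # regex that article URLs must match
    extract_from='//*[@id="news-list"]',        # region to extract article links from
    title_xpath='//h1/text()',
    body_xpath='//div[@class="content"]//text()',
    publish_time_xpath='//span[@class="time"]/text()',
    source_site_xpath='//span[@class="source"]/text()',
    enable=1,                                   # only enabled rules are crawled by run.py
))
session.commit()
session.close()
```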
/model/__init__.py:
--------------------------------------------------------------------------------
1 | __author__ = 'wuchong'
2 | 
--------------------------------------------------------------------------------
/model/article.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | from sqlalchemy import Column, String, Integer
3 | from sqlalchemy.ext.declarative import declarative_base
4 | 
5 | Base = declarative_base()
6 | 
7 | class Article(Base):
8 |     __tablename__ = 'articles'
9 | 
10 |     id = Column(Integer, primary_key=True)
11 |     title = Column(String)
12 |     url = Column(String)
13 |     body = Column(String)
14 |     publish_time = Column(String)
15 |     source_site = Column(String)
16 | 
17 | 
--------------------------------------------------------------------------------
/model/config.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | 
3 | from sqlalchemy import create_engine
4 | from sqlalchemy.orm import sessionmaker
5 | import redis
6 | 
7 | # Initialize the MySQL database connection
8 | engine = create_engine('mysql+mysqldb://root:root@localhost:3306/spider?charset=utf8')
9 | # Create the DBSession class (session factory)
10 | DBSession = sessionmaker(bind=engine)
11 | # Initialize the Redis connection
12 | Redis = redis.StrictRedis(host='localhost', port=6379, db=0)
--------------------------------------------------------------------------------
/model/rule.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | from sqlalchemy import Column, String, DateTime, Integer
3 | from sqlalchemy.ext.declarative import declarative_base
4 | 
5 | # Declarative base class for the ORM models
6 | Base = declarative_base()
7 | 
8 | class Rule(Base):
9 |     __tablename__ = 'rules'
10 | 
11 |     # Table structure
12 |     id = Column(Integer, primary_key=True)
13 |     name = Column(String)
14 |     allow_domains = Column(String)
15 |     start_urls = Column(String)
16 |     next_page = Column(String)
17 |     allow_url = Column(String)
18 |     extract_from = Column(String)
19 |     title_xpath = Column(String)
20 |     body_xpath = Column(String)
21 |     publish_time_xpath = Column(String)
22 |     source_site_xpath = Column(String)
23 |     enable = Column(Integer)
24 | 
25 | 
--------------------------------------------------------------------------------
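The model files above hard-code the connection details: MySQL at root:root@localhost:3306/spider and Redis on localhost:6379, db 0. A quick way to confirm that both services are reachable and that db.sql was imported is a throwaway script like the following (my own sketch, not part of the repo):

```python
# check_setup.py -- hypothetical sanity check; assumes MySQL and Redis are
# running with the credentials hard-coded in model/config.py.
from model.config import DBSession, Redis
from model.rule import Rule

session = DBSession()
print("rules in database:", session.query(Rule).count())  # 3 after importing db.sql
print("redis ping:", Redis.ping())                        # True if Redis is up
session.close()
```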
/pipelines.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | 
3 | import json
4 | import codecs
5 | from scrapy.exceptions import DropItem
6 | from model.config import DBSession
7 | from model.config import Redis
8 | from model.article import Article
9 | 
10 | # Drop items whose URL has already been crawled (deduplication via Redis)
11 | class DuplicatesPipeline(object):
12 |     def process_item(self, item, spider):
13 |         if Redis.exists('url:%s' % item['url']):
14 |             raise DropItem("Duplicate item found: %s" % item)
15 |         else:
16 |             Redis.set('url:%s' % item['url'], 1)
17 |             return item
18 | 
19 | # Store items in the MySQL database
20 | class DataBasePipeline(object):
21 |     def open_spider(self, spider):
22 |         self.session = DBSession()
23 | 
24 |     def process_item(self, item, spider):
25 |         a = Article(title=item["title"].encode("utf-8"),
26 |                     url=item["url"],
27 |                     body=item["body"].encode("utf-8"),
28 |                     publish_time=item["publish_time"].encode("utf-8"),
29 |                     source_site=item["source_site"].encode("utf-8"))
30 |         self.session.add(a)
31 |         self.session.commit()
32 |         return item
33 | 
34 |     def close_spider(self, spider):
35 |         self.session.close()
36 | 
37 | # Write items to a local JSON file
38 | class JsonWriterPipeline(object):
39 | 
40 |     def __init__(self):
41 |         self.file = codecs.open('items.json', 'w', encoding='utf-8')
42 | 
43 |     def process_item(self, item, spider):
44 |         line = json.dumps(dict(item)) + "\n"
45 |         self.file.write(line.decode('unicode_escape'))
46 |         return item
47 | 
48 | # Crawl a fixed number of items (100), then drop the rest
49 | class CountDropPipline(object):
50 |     def __init__(self):
51 |         self.count = 100
52 | 
53 |     def process_item(self, item, spider):
54 |         if self.count == 0:
55 |             raise DropItem("Item limit reached, dropping: %s" % item)
56 |         else:
57 |             self.count -= 1
58 |             return item
--------------------------------------------------------------------------------
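DuplicatesPipeline keys Redis on `url:<item url>`: the first item seen for a URL is recorded and passed through, and any later item with the same URL raises DropItem. A small illustration of that behaviour (my own sketch, not part of the repo; it assumes a local Redis is running and uses a plain dict in place of a Scrapy item):

```python
# Hypothetical demo of DuplicatesPipeline semantics.
from scrapy.exceptions import DropItem
from pipelines import DuplicatesPipeline

pipeline = DuplicatesPipeline()
item = {'url': 'http://www.example.com/a.html'}  # placeholder URL

pipeline.process_item(item, spider=None)         # first time: stored in Redis, returned
try:
    pipeline.process_item(item, spider=None)     # same URL again
except DropItem as e:
    print("dropped:", e)
```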
/run.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | from spiders.deep_spider import DeepSpider
3 | from model.config import DBSession
4 | from model.rule import Rule
5 | from scrapy.crawler import CrawlerProcess
6 | from scrapy.settings import Settings
7 | 
8 | settings = Settings()
9 | 
10 | # crawl settings
11 | settings.set("USER_AGENT", "Mozilla/5.0 (Windows NT 6.2; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1667.0 Safari/537.36")
12 | settings.set("ITEM_PIPELINES", {
13 |     'pipelines.DuplicatesPipeline': 200,
14 |     # 'pipelines.CountDropPipline': 100,
15 |     'pipelines.DataBasePipeline': 300,
16 | })
17 | 
18 | process = CrawlerProcess(settings)
19 | 
20 | db = DBSession()
21 | rules = db.query(Rule).filter(Rule.enable == 1)
22 | for rule in rules:
23 |     process.crawl(DeepSpider, rule)
24 | process.start()
25 | 
--------------------------------------------------------------------------------
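run.py reads every enabled rule and starts one DeepSpider per rule inside a single CrawlerProcess. When tuning the XPaths of a single site, it can be easier to crawl just one rule without the database pipelines; a possible debugging sketch (hypothetical, not included in the repo):

```python
# debug_one_rule.py -- hypothetical helper for crawling a single rule.
from scrapy.crawler import CrawlerProcess
from model.config import DBSession
from model.rule import Rule
from spiders.deep_spider import DeepSpider

db = DBSession()
rule = db.query(Rule).filter_by(id=3).first()            # id 3 is the enabled sample rule in db.sql
process = CrawlerProcess({'USER_AGENT': 'Mozilla/5.0'})   # no ITEM_PIPELINES: items are only logged
process.crawl(DeepSpider, rule)
process.start()
```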
/spiders/__init__.py:
--------------------------------------------------------------------------------
1 | # This package will contain the spiders of your Scrapy project
2 | #
3 | # Please refer to the documentation for information on how to create and manage
4 | # your spiders.
5 | 
--------------------------------------------------------------------------------
/spiders/deep_spider.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | 
3 | import scrapy
4 | from scrapy.spiders import CrawlSpider, Rule
5 | from scrapy.linkextractors import LinkExtractor
6 | 
7 | class Article(scrapy.Item):
8 |     title = scrapy.Field()
9 |     url = scrapy.Field()
10 |     body = scrapy.Field()
11 |     publish_time = scrapy.Field()
12 |     source_site = scrapy.Field()
13 | 
14 | class DeepSpider(CrawlSpider):
15 |     name = "Deep"
16 | 
17 |     def __init__(self, rule):
18 |         self.rule = rule
19 |         self.name = rule.name
20 |         self.allowed_domains = rule.allow_domains.split(",")
21 |         self.start_urls = rule.start_urls.split(",")
22 |         rule_list = []
23 |         # Rule that follows the "next page" link
24 |         if rule.next_page:
25 |             rule_list.append(Rule(LinkExtractor(restrict_xpaths=rule.next_page)))
26 |         # Rule that extracts article links and hands them to parse_item
27 |         rule_list.append(Rule(LinkExtractor(
28 |             allow=[rule.allow_url],
29 |             restrict_xpaths=[rule.extract_from]),
30 |             callback='parse_item'))
31 |         self.rules = tuple(rule_list)
32 |         super(DeepSpider, self).__init__()
33 | 
34 | 
35 |     def parse_item(self, response):
36 |         self.log('Hi, this is an article page! %s' % response.url)
37 | 
38 |         article = Article()
39 | 
40 |         article["url"] = response.url
41 | 
42 |         title = response.xpath(self.rule.title_xpath).extract()
43 |         article["title"] = title[0] if title else ""
44 | 
45 |         body = response.xpath(self.rule.body_xpath).extract()
46 |         article["body"] = '\n'.join(body) if body else ""
47 | 
48 |         publish_time = response.xpath(self.rule.publish_time_xpath).extract()
49 |         article["publish_time"] = publish_time[0] if publish_time else ""
50 | 
51 |         source_site = response.xpath(self.rule.source_site_xpath).extract()
52 |         article["source_site"] = source_site[0] if source_site else ""
53 | 
54 |         return article
--------------------------------------------------------------------------------
/spiders/dmoz_spider.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | 
3 | import scrapy
4 | 
5 | class DmozItem(scrapy.Item):
6 |     """Item object to store title, link, description"""
7 | 
8 |     title = scrapy.Field()
9 |     link = scrapy.Field()
10 |     desc = scrapy.Field()
11 | 
12 | class DmozSpider(scrapy.Spider):
13 |     """
14 |     Spider to crawl Python books and resources on dmoz.org
15 |     """
16 |     name = "dmoz"
17 |     allowed_domains = ["dmoz.org"]
18 |     start_urls = [
19 |         "http://www.dmoz.org/Computers/Programming/Languages/Python/Books/",
20 |         "http://www.dmoz.org/Computers/Programming/Languages/Python/Resources/"
21 |     ]
22 | 
23 |     def parse(self, response):
24 |         for sel in response.xpath('//ul/li'):
25 |             item = DmozItem()
26 |             item['title'] = sel.xpath('a/text()').extract()
27 |             item['link'] = sel.xpath('a/@href').extract()
28 |             item['desc'] = sel.xpath('text()').extract()
29 |             yield item
--------------------------------------------------------------------------------
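dmoz_spider.py is the standalone spider from the Scrapy tutorial and is not wired into run.py. Since the repo ships no scrapy.cfg, the usual `scrapy crawl dmoz` workflow is not set up; if you want to try it, it can be launched programmatically in the same way run.py launches DeepSpider (a sketch of mine, not part of the repo):

```python
# run_dmoz.py -- hypothetical runner for the bundled tutorial spider.
from scrapy.crawler import CrawlerProcess
from spiders.dmoz_spider import DmozSpider

process = CrawlerProcess({'USER_AGENT': 'Mozilla/5.0'})
process.crawl(DmozSpider)
process.start()
```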