├── .gitignore
├── README.md
├── __init__.py
├── db.sql
├── model
│   ├── __init__.py
│   ├── article.py
│   ├── config.py
│   └── rule.py
├── pipelines.py
├── run.py
└── spiders
    ├── __init__.py
    ├── deep_spider.py
    └── dmoz_spider.py

/.gitignore:
--------------------------------------------------------------------------------
1 | *.pyc
2 | .idea/
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Scrapy Dynamic Configurable
2 | 
3 | A dynamically configurable news crawler based on Scrapy. See the [blog post](http://wuchong.me/blog/2015/05/22/running-scrapy-programmatically) for more details.
4 | 
5 | ## Requirements
6 | 
7 | - Scrapy
8 | - MySQL
9 | - Redis
10 | - SQLAlchemy
11 | 
12 | ## Install in development
13 | 
14 | **Mac OS X, use Homebrew**
15 | 
16 | ```bash
17 | $ ruby -e "$(curl -fsSL https://raw.githubusercontent.com/Homebrew/install/master/install)"
18 | $ brew install mysql
19 | $ brew install redis
20 | $ sudo pip install scrapy
21 | $ sudo pip install SQLAlchemy
22 | $ sudo pip install redis
23 | ```
24 | **Ubuntu**
25 | 
26 | ```bash
27 | $ sudo apt-get install redis-server mysql-server mysql-client
28 | $ pip install scrapy
29 | $ pip install SQLAlchemy
30 | $ pip install redis
31 | ```
32 | **Then restore the database tables from `db.sql`.**
33 | 
34 | This project targets Scrapy 1.0.
35 | 
--------------------------------------------------------------------------------
/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wuchong/scrapy-dynamic-configurable/801a275d774e863360c28790c4522342aa26123b/__init__.py
--------------------------------------------------------------------------------
/db.sql:
--------------------------------------------------------------------------------
1 | # ************************************************************
2 | # Sequel Pro SQL dump
3 | # Version 4096
4 | #
5 | # http://www.sequelpro.com/
6 | # http://code.google.com/p/sequel-pro/
7 | #
8 | # Host: 127.0.0.1 (MySQL 5.6.22)
9 | # Database: spider
10 | # Generation Time: 2015-05-22 14:07:57 +0000
11 | # ************************************************************
12 | 
13 | 
14 | /*!40101 SET @OLD_CHARACTER_SET_CLIENT=@@CHARACTER_SET_CLIENT */;
15 | /*!40101 SET @OLD_CHARACTER_SET_RESULTS=@@CHARACTER_SET_RESULTS */;
16 | /*!40101 SET @OLD_COLLATION_CONNECTION=@@COLLATION_CONNECTION */;
17 | /*!40101 SET NAMES utf8 */;
18 | /*!40014 SET @OLD_FOREIGN_KEY_CHECKS=@@FOREIGN_KEY_CHECKS, FOREIGN_KEY_CHECKS=0 */;
19 | /*!40101 SET @OLD_SQL_MODE=@@SQL_MODE, SQL_MODE='NO_AUTO_VALUE_ON_ZERO' */;
20 | /*!40111 SET @OLD_SQL_NOTES=@@SQL_NOTES, SQL_NOTES=0 */;
21 | 
22 | 
23 | # Dump of table articles
24 | # ------------------------------------------------------------
25 | 
26 | DROP TABLE IF EXISTS `articles`;
27 | 
28 | CREATE TABLE `articles` (
29 |   `id` int(11) NOT NULL AUTO_INCREMENT,
30 |   `title` varchar(255) COLLATE utf8_unicode_ci DEFAULT NULL,
31 |   `url` varchar(255) COLLATE utf8_unicode_ci DEFAULT NULL,
32 |   `body` text COLLATE utf8_unicode_ci,
33 |   `publish_time` varchar(255) COLLATE utf8_unicode_ci DEFAULT NULL,
34 |   `source_site` varchar(255) COLLATE utf8_unicode_ci DEFAULT NULL,
35 |   PRIMARY KEY (`id`)
36 | ) ENGINE=InnoDB DEFAULT CHARSET=utf8 COLLATE=utf8_unicode_ci;
37 | 
38 | 
39 | 
40 | # Dump of table rules
41 | # ------------------------------------------------------------
42 | 
43 | DROP TABLE IF EXISTS `rules`;
44 | 
45 | CREATE TABLE `rules` (
46 |   `id` int(11) NOT NULL AUTO_INCREMENT,
47 |   `name` varchar(255) COLLATE utf8_unicode_ci DEFAULT NULL,
48 |   `allow_domains` varchar(255) COLLATE utf8_unicode_ci DEFAULT NULL,
49 |   `start_urls` text COLLATE utf8_unicode_ci,
50 |   `next_page` varchar(255) COLLATE utf8_unicode_ci DEFAULT NULL,
51 |   `allow_url` varchar(255) COLLATE utf8_unicode_ci DEFAULT NULL,
52 |   `extract_from` varchar(255) COLLATE utf8_unicode_ci DEFAULT NULL,
53 |   `title_xpath` varchar(255) COLLATE utf8_unicode_ci DEFAULT NULL,
54 |   `body_xpath` varchar(255) COLLATE utf8_unicode_ci DEFAULT NULL,
55 |   `publish_time_xpath` varchar(255) COLLATE utf8_unicode_ci DEFAULT NULL,
56 |   `source_site_xpath` varchar(255) COLLATE utf8_unicode_ci DEFAULT NULL,
57 |   `enable` tinyint(1) DEFAULT NULL,
58 |   PRIMARY KEY (`id`)
59 | ) ENGINE=InnoDB DEFAULT CHARSET=utf8 COLLATE=utf8_unicode_ci;
60 | 
61 | LOCK TABLES `rules` WRITE;
62 | /*!40000 ALTER TABLE `rules` DISABLE KEYS */;
63 | 
64 | INSERT INTO `rules` (`id`, `name`, `allow_domains`, `start_urls`, `next_page`, `allow_url`, `extract_from`, `title_xpath`, `body_xpath`, `publish_time_xpath`, `source_site_xpath`, `enable`)
65 | VALUES
66 |   (1,'中国会展门户','cnena.com','http://www.cnena.com/news/list-htm-fid-55.html','//a[@title=\'下一页\']','bencandy-htm-fid-.*\\.html','//*[@id=\"sort_list\"]','//div[@class=\"content\"]//h1//text()','//*[@id=\"divcontent\"]//div//text()','//div[@class=\"content\"]//div[@class=\"content_info\"]/text()','//div[@class=\"content\"]//div[@class=\"content_info\"]/a/text()',0),
67 |   (2,'中国会展网','expo-china.com','http://www.expo-china.com/web/news/news_list.aspx','','\\/pages\\/news\\/\\d{6}\\/\\d*\\/index.shtml','//*[@id=\"NewsView_GNZX_bianju\"]/div','//*[@id=\"NewsView_HDTK_bianju\"]/div[1]/h2/text()','//*[@id=\"NewsView_HDTK_bianju\"]/div[3]/p//text()','//*[@id=\"NewsView_HDTK_bianju\"]/div[1]/span[1]/text()','//*[@id=\"NewsView_HDTK_bianju\"]/div[1]/span[2]/text()',0),
68 |   (3,'成都会展','cdexpo.com.cn','http://www.cdexpo.com.cn/article-1-35-1.html','//a[@class=\"next\"]','article-detail-.*\\.html','//*[@id=\"news-list\"]/div','//*[@id=\"content\"]//div[@class=\"detail-tit\"]/h3/text()','//*[@id=\"content\"]//div[@class=\"detail-content\"]/p/text()','//*[@id=\"content\"]/div/div[1]/div/div/div[2]/p/span[1]/text()','//*[@id=\"content\"]/div/div[1]/div/div/div[2]/p/span[2]/text()',1);
69 | 
70 | /*!40000 ALTER TABLE `rules` ENABLE KEYS */;
71 | UNLOCK TABLES;
72 | 
73 | 
74 | 
75 | /*!40111 SET SQL_NOTES=@OLD_SQL_NOTES */;
76 | /*!40101 SET SQL_MODE=@OLD_SQL_MODE */;
77 | /*!40014 SET FOREIGN_KEY_CHECKS=@OLD_FOREIGN_KEY_CHECKS */;
78 | /*!40101 SET CHARACTER_SET_CLIENT=@OLD_CHARACTER_SET_CLIENT */;
79 | /*!40101 SET CHARACTER_SET_RESULTS=@OLD_CHARACTER_SET_RESULTS */;
80 | /*!40101 SET COLLATION_CONNECTION=@OLD_COLLATION_CONNECTION */;
81 | 
--------------------------------------------------------------------------------
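The `rules` table above is the heart of the project: each row fully describes one target site (start URLs, the "next page" XPath, the article-URL regex, and the field XPaths), and db.sql ships three sample rows of which only id 3 is enabled. As a minimal sketch of how a new site could be registered through the SQLAlchemy model defined in model/rule.py below (not part of the repo; all names, URLs, XPaths and regexes here are placeholders for a hypothetical example.com):

```python
# add_rule.py -- hypothetical helper, not shipped with the repo.
from model.config import DBSession
from model.rule import Rule

session = DBSession()
session.add(Rule(
    name='example-site',                        # placeholder values throughout
    allow_domains='example.com',
    start_urls='http://www.example.com/news/list-1.html',
    next_page='//a[@class="next"]',             # XPath of the "next page" link
    allow_url='article-\\d+\\.html',            # regex that article URLs must match
    extract_from='//*[@id="news-list"]',        # region to extract article links from
    title_xpath='//h1/text()',
    body_xpath='//div[@class="content"]//text()',
    publish_time_xpath='//span[@class="time"]/text()',
    source_site_xpath='//span[@class="source"]/text()',
    enable=1,                                   # only enabled rules are crawled by run.py
))
session.commit()
session.close()
```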
/model/__init__.py:
--------------------------------------------------------------------------------
1 | __author__ = 'wuchong'
2 | 
--------------------------------------------------------------------------------
/model/article.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | from sqlalchemy import Column, String, Integer
3 | from sqlalchemy.ext.declarative import declarative_base
4 | 
5 | Base = declarative_base()
6 | 
7 | class Article(Base):
8 |     __tablename__ = 'articles'
9 | 
10 |     id = Column(Integer, primary_key=True)
11 |     title = Column(String)
12 |     url = Column(String)
13 |     body = Column(String)
14 |     publish_time = Column(String)
15 |     source_site = Column(String)
16 | 
17 | 
--------------------------------------------------------------------------------
/model/config.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | 
3 | from sqlalchemy import create_engine
4 | from sqlalchemy.orm import sessionmaker
5 | import redis
6 | 
7 | # Initialize the MySQL database connection
8 | engine = create_engine('mysql+mysqldb://root:root@localhost:3306/spider?charset=utf8')
9 | # Create the DBSession class (session factory)
10 | DBSession = sessionmaker(bind=engine)
11 | # Initialize the Redis connection
12 | Redis = redis.StrictRedis(host='localhost', port=6379, db=0)
--------------------------------------------------------------------------------
/model/rule.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | from sqlalchemy import Column, String, DateTime, Integer
3 | from sqlalchemy.ext.declarative import declarative_base
4 | 
5 | # Declarative base class for the ORM models
6 | Base = declarative_base()
7 | 
8 | class Rule(Base):
9 |     __tablename__ = 'rules'
10 | 
11 |     # Table structure
12 |     id = Column(Integer, primary_key=True)
13 |     name = Column(String)
14 |     allow_domains = Column(String)
15 |     start_urls = Column(String)
16 |     next_page = Column(String)
17 |     allow_url = Column(String)
18 |     extract_from = Column(String)
19 |     title_xpath = Column(String)
20 |     body_xpath = Column(String)
21 |     publish_time_xpath = Column(String)
22 |     source_site_xpath = Column(String)
23 |     enable = Column(Integer)
24 | 
25 | 
--------------------------------------------------------------------------------
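The model files above hard-code the connection details: MySQL at root:root@localhost:3306/spider and Redis on localhost:6379, db 0. A quick way to confirm that both services are reachable and that db.sql was imported is a throwaway script like the following (my own sketch, not part of the repo):

```python
# check_setup.py -- hypothetical sanity check; assumes MySQL and Redis are
# running with the credentials hard-coded in model/config.py.
from model.config import DBSession, Redis
from model.rule import Rule

session = DBSession()
print("rules in database:", session.query(Rule).count())  # 3 after importing db.sql
print("redis ping:", Redis.ping())                        # True if Redis is up
session.close()
```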
/pipelines.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | 
3 | import json
4 | import codecs
5 | from scrapy.exceptions import DropItem
6 | from model.config import DBSession
7 | from model.config import Redis
8 | from model.article import Article
9 | 
10 | # Drop items whose URL has already been crawled (deduplication via Redis)
11 | class DuplicatesPipeline(object):
12 |     def process_item(self, item, spider):
13 |         if Redis.exists('url:%s' % item['url']):
14 |             raise DropItem("Duplicate item found: %s" % item)
15 |         else:
16 |             Redis.set('url:%s' % item['url'], 1)
17 |             return item
18 | 
19 | # Store items in the MySQL database
20 | class DataBasePipeline(object):
21 |     def open_spider(self, spider):
22 |         self.session = DBSession()
23 | 
24 |     def process_item(self, item, spider):
25 |         a = Article(title=item["title"].encode("utf-8"),
26 |                     url=item["url"],
27 |                     body=item["body"].encode("utf-8"),
28 |                     publish_time=item["publish_time"].encode("utf-8"),
29 |                     source_site=item["source_site"].encode("utf-8"))
30 |         self.session.add(a)
31 |         self.session.commit()
32 |         return item
33 | 
34 |     def close_spider(self, spider):
35 |         self.session.close()
36 | 
37 | # Write items to a local JSON file
38 | class JsonWriterPipeline(object):
39 | 
40 |     def __init__(self):
41 |         self.file = codecs.open('items.json', 'w', encoding='utf-8')
42 | 
43 |     def process_item(self, item, spider):
44 |         line = json.dumps(dict(item)) + "\n"
45 |         self.file.write(line.decode('unicode_escape'))
46 |         return item
47 | 
48 | # Crawl a fixed number of items (100), then drop the rest
49 | class CountDropPipline(object):
50 |     def __init__(self):
51 |         self.count = 100
52 | 
53 |     def process_item(self, item, spider):
54 |         if self.count == 0:
55 |             raise DropItem("Item limit reached, dropping: %s" % item)
56 |         else:
57 |             self.count -= 1
58 |             return item
--------------------------------------------------------------------------------
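DuplicatesPipeline keys Redis on `url:<item url>`: the first item seen for a URL is recorded and passed through, and any later item with the same URL raises DropItem. A small illustration of that behaviour (my own sketch, not part of the repo; it assumes a local Redis is running and uses a plain dict in place of a Scrapy item):

```python
# Hypothetical demo of DuplicatesPipeline semantics.
from scrapy.exceptions import DropItem
from pipelines import DuplicatesPipeline

pipeline = DuplicatesPipeline()
item = {'url': 'http://www.example.com/a.html'}  # placeholder URL

pipeline.process_item(item, spider=None)         # first time: stored in Redis, returned
try:
    pipeline.process_item(item, spider=None)     # same URL again
except DropItem as e:
    print("dropped:", e)
```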
/run.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | from spiders.deep_spider import DeepSpider
3 | from model.config import DBSession
4 | from model.rule import Rule
5 | from scrapy.crawler import CrawlerProcess
6 | from scrapy.settings import Settings
7 | 
8 | settings = Settings()
9 | 
10 | # crawl settings
11 | settings.set("USER_AGENT", "Mozilla/5.0 (Windows NT 6.2; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1667.0 Safari/537.36")
12 | settings.set("ITEM_PIPELINES", {
13 |     'pipelines.DuplicatesPipeline': 200,
14 |     # 'pipelines.CountDropPipline': 100,
15 |     'pipelines.DataBasePipeline': 300,
16 | })
17 | 
18 | process = CrawlerProcess(settings)
19 | 
20 | db = DBSession()
21 | rules = db.query(Rule).filter(Rule.enable == 1)
22 | for rule in rules:
23 |     process.crawl(DeepSpider, rule)
24 | process.start()
25 | 
--------------------------------------------------------------------------------
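run.py reads every enabled rule and starts one DeepSpider per rule inside a single CrawlerProcess. When tuning the XPaths of a single site, it can be easier to crawl just one rule without the database pipelines; a possible debugging sketch (hypothetical, not included in the repo):

```python
# debug_one_rule.py -- hypothetical helper for crawling a single rule.
from scrapy.crawler import CrawlerProcess
from model.config import DBSession
from model.rule import Rule
from spiders.deep_spider import DeepSpider

db = DBSession()
rule = db.query(Rule).filter_by(id=3).first()            # id 3 is the enabled sample rule in db.sql
process = CrawlerProcess({'USER_AGENT': 'Mozilla/5.0'})   # no ITEM_PIPELINES: items are only logged
process.crawl(DeepSpider, rule)
process.start()
```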
/spiders/__init__.py:
--------------------------------------------------------------------------------
1 | # This package will contain the spiders of your Scrapy project
2 | #
3 | # Please refer to the documentation for information on how to create and manage
4 | # your spiders.
5 | 
--------------------------------------------------------------------------------
/spiders/deep_spider.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | 
3 | import scrapy
4 | from scrapy.spiders import CrawlSpider, Rule
5 | from scrapy.linkextractors import LinkExtractor
6 | 
7 | class Article(scrapy.Item):
8 |     title = scrapy.Field()
9 |     url = scrapy.Field()
10 |     body = scrapy.Field()
11 |     publish_time = scrapy.Field()
12 |     source_site = scrapy.Field()
13 | 
14 | class DeepSpider(CrawlSpider):
15 |     name = "Deep"
16 | 
17 |     def __init__(self, rule):
18 |         self.rule = rule
19 |         self.name = rule.name
20 |         self.allowed_domains = rule.allow_domains.split(",")
21 |         self.start_urls = rule.start_urls.split(",")
22 |         rule_list = []
23 |         # Rule that follows the "next page" link
24 |         if rule.next_page:
25 |             rule_list.append(Rule(LinkExtractor(restrict_xpaths=rule.next_page)))
26 |         # Rule that extracts article links and hands them to parse_item
27 |         rule_list.append(Rule(LinkExtractor(
28 |             allow=[rule.allow_url],
29 |             restrict_xpaths=[rule.extract_from]),
30 |             callback='parse_item'))
31 |         self.rules = tuple(rule_list)
32 |         super(DeepSpider, self).__init__()
33 | 
34 | 
35 |     def parse_item(self, response):
36 |         self.log('Hi, this is an article page! %s' % response.url)
37 | 
38 |         article = Article()
39 | 
40 |         article["url"] = response.url
41 | 
42 |         title = response.xpath(self.rule.title_xpath).extract()
43 |         article["title"] = title[0] if title else ""
44 | 
45 |         body = response.xpath(self.rule.body_xpath).extract()
46 |         article["body"] = '\n'.join(body) if body else ""
47 | 
48 |         publish_time = response.xpath(self.rule.publish_time_xpath).extract()
49 |         article["publish_time"] = publish_time[0] if publish_time else ""
50 | 
51 |         source_site = response.xpath(self.rule.source_site_xpath).extract()
52 |         article["source_site"] = source_site[0] if source_site else ""
53 | 
54 |         return article
--------------------------------------------------------------------------------
/spiders/dmoz_spider.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | 
3 | import scrapy
4 | 
5 | class DmozItem(scrapy.Item):
6 |     """Item object to store title, link, description"""
7 | 
8 |     title = scrapy.Field()
9 |     link = scrapy.Field()
10 |     desc = scrapy.Field()
11 | 
12 | class DmozSpider(scrapy.Spider):
13 |     """
14 |     Spider to crawl Python books and resources on dmoz.org
15 |     """
16 |     name = "dmoz"
17 |     allowed_domains = ["dmoz.org"]
18 |     start_urls = [
19 |         "http://www.dmoz.org/Computers/Programming/Languages/Python/Books/",
20 |         "http://www.dmoz.org/Computers/Programming/Languages/Python/Resources/"
21 |     ]
22 | 
23 |     def parse(self, response):
24 |         for sel in response.xpath('//ul/li'):
25 |             item = DmozItem()
26 |             item['title'] = sel.xpath('a/text()').extract()
27 |             item['link'] = sel.xpath('a/@href').extract()
28 |             item['desc'] = sel.xpath('text()').extract()
29 |             yield item
--------------------------------------------------------------------------------
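dmoz_spider.py is the standalone spider from the Scrapy tutorial and is not wired into run.py. Since the repo ships no scrapy.cfg, the usual `scrapy crawl dmoz` workflow is not set up; if you want to try it, it can be launched programmatically in the same way run.py launches DeepSpider (a sketch of mine, not part of the repo):

```python
# run_dmoz.py -- hypothetical runner for the bundled tutorial spider.
from scrapy.crawler import CrawlerProcess
from spiders.dmoz_spider import DmozSpider

process = CrawlerProcess({'USER_AGENT': 'Mozilla/5.0'})
process.crawl(DmozSpider)
process.start()
```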