├── .gitignore ├── README.md ├── alexa ├── alexa │ ├── __init__.py │ ├── cn.json │ ├── items.py │ ├── pipelines.py │ ├── settings.py │ ├── spiders │ │ ├── __init__.py │ │ └── alexa_spider.py │ └── universal.json ├── read_from_json.ipynb └── scrapy.cfg ├── alexa_topsites ├── alexa_topsites │ ├── __init__.py │ ├── items.py │ ├── pipelines.py │ ├── settings.py │ └── spiders │ │ ├── __init__.py │ │ └── spider.py └── scrapy.cfg ├── amazonbook ├── amazonbook │ ├── __init__.py │ ├── items.py │ ├── pipelines.py │ ├── settings.py │ └── spiders │ │ ├── __init__.py │ │ └── spider.py └── scrapy.cfg ├── clean.sh ├── delay.sh ├── dianping ├── dianping │ ├── __init__.py │ ├── items.py │ ├── pipelines.py │ ├── settings.py │ └── spiders │ │ ├── __init__.py │ │ └── spider.py └── scrapy.cfg ├── dmoz ├── dmoz │ ├── __init__.py │ ├── items.py │ ├── pipelines.py │ ├── settings.py │ └── spiders │ │ ├── __init__.py │ │ └── spider.py └── scrapy.cfg ├── doubanbook ├── doubanbook │ ├── __init__.py │ ├── items.py │ ├── pipelines.py │ ├── settings.py │ └── spiders │ │ ├── __init__.py │ │ └── douban_spider.py ├── sample.jpg └── scrapy.cfg ├── doubanmovie ├── doubanmovie │ ├── __init__.py │ ├── items.py │ ├── pipelines.py │ ├── settings.py │ └── spiders │ │ ├── __init__.py │ │ └── spider.py └── scrapy.cfg ├── douyu ├── douyu │ ├── __init__.py │ ├── items.py │ ├── pipelines.py │ ├── settings.py │ └── spiders │ │ ├── __init__.py │ │ └── spider.py └── scrapy.cfg ├── general_spider ├── general_spider │ ├── __init__.py │ ├── items.py │ ├── pipelines.py │ ├── settings.py │ └── spiders │ │ ├── BasicSpiderConfig.py │ │ ├── __init__.py │ │ ├── run.sh │ │ ├── scrapy_examples.py │ │ ├── spider.py │ │ └── v2ex.py └── scrapy.cfg ├── github_trending ├── github_trending │ ├── __init__.py │ ├── items.py │ ├── pipelines.py │ ├── settings.py │ └── spiders │ │ ├── __init__.py │ │ └── spider.py └── scrapy.cfg ├── googlescholar ├── README.md ├── googlescholar │ ├── __init__.py │ ├── items.py │ ├── pipelines.py │ ├── settings.py │ └── spiders │ │ ├── __init__.py │ │ └── spider.py └── scrapy.cfg ├── hacker_news ├── hacker_news │ ├── __init__.py │ ├── items.py │ ├── pipelines.py │ ├── settings.py │ └── spiders │ │ ├── __init__.py │ │ └── spider.py └── scrapy.cfg ├── hrtencent ├── hrtencent │ ├── __init__.py │ ├── data_utf8.json │ ├── items.py │ ├── pipelines.py │ ├── settings.py │ └── spiders │ │ ├── __init__.py │ │ └── hrtencent_spider.py └── scrapy.cfg ├── linkedin ├── README.md ├── doc │ └── db-scheme.md └── linkedin │ ├── linkedin │ ├── Rakefile │ ├── __init__.py │ ├── agents.py │ ├── db.py │ ├── items.py │ ├── middleware.py │ ├── parser │ │ ├── HtmlParser.py │ │ ├── LinkedinParser.py │ │ └── __init__.py │ ├── pipelines.py │ ├── proxy.py │ ├── reload_proxy.py │ ├── settings.py │ └── spiders │ │ ├── LinkedinSpider.py │ │ └── __init__.py │ └── scrapy.cfg ├── misc ├── __init__.py ├── agents.py ├── log.py ├── middleware.py ├── proxy.py └── spider.py ├── pandatv ├── pandatv │ ├── __init__.py │ ├── items.py │ ├── pipelines.py │ ├── settings.py │ └── spiders │ │ ├── __init__.py │ │ └── spider.py └── scrapy.cfg ├── proxylist ├── proxylist │ ├── __init__.py │ ├── items.py │ ├── pipelines.py │ ├── settings.py │ └── spiders │ │ ├── __init__.py │ │ ├── log │ │ ├── free-proxy-list.net │ │ └── proxy-list.org │ │ └── spider.py └── scrapy.cfg ├── qqnews ├── qqnews │ ├── __init__.py │ ├── items.py │ ├── pipelines.py │ ├── settings.py │ └── spiders │ │ ├── __init__.py │ │ └── spider.py └── scrapy.cfg ├── reddit ├── reddit │ ├── __init__.py │ ├── items.py │ 
├── pipelines.py │ ├── settings.py │ └── spiders │ │ ├── __init__.py │ │ └── spider.py └── scrapy.cfg ├── sinanews ├── scrapy.cfg └── sinanews │ ├── __init__.py │ ├── items.py │ ├── pipelines.py │ ├── settings.py │ └── spiders │ ├── __init__.py │ └── spider.py ├── sis ├── README.md ├── forum-230.json ├── forum-58.json ├── index.html ├── scrapy.cfg └── sis │ ├── __init__.py │ ├── items.py │ ├── pipelines.py │ ├── settings.py │ └── spiders │ ├── __init__.py │ └── sis_spider.py ├── startproject.sh ├── template ├── scrapy.cfg └── template │ ├── __init__.py │ ├── items.py │ ├── pipelines.py │ ├── settings.py │ └── spiders │ ├── __init__.py │ └── spider.py ├── tutorial ├── Books ├── Resources ├── data_utf8.json ├── scrapy.cfg └── tutorial │ ├── __init__.py │ ├── data_utf8.json │ ├── items.py │ ├── misc │ ├── __init__.py │ ├── agents.py │ ├── log.py │ ├── middleware.py │ └── proxy.py │ ├── pipelines.py │ ├── settings.py │ └── spiders │ ├── __init__.py │ └── naive_spider.py ├── underdev ├── README ├── meijutt │ ├── meijutt │ │ ├── __init__.py │ │ ├── items.py │ │ ├── pipelines.py │ │ ├── settings.py │ │ └── spiders │ │ │ ├── __init__.py │ │ │ └── spider.py │ └── scrapy.cfg └── twitch │ ├── README │ ├── scrapy.cfg │ └── twitch │ ├── __init__.py │ ├── items.py │ ├── pipelines.py │ ├── settings.py │ └── spiders │ ├── __init__.py │ └── spider.py ├── v2ex ├── scrapy.cfg └── v2ex │ ├── __init__.py │ ├── items.py │ ├── pipelines.py │ ├── settings.py │ └── spiders │ ├── __init__.py │ └── spider.py ├── youtube_trending ├── scrapy.cfg └── youtube_trending │ ├── __init__.py │ ├── items.py │ ├── pipelines.py │ ├── settings.py │ └── spiders │ ├── __init__.py │ └── spider.py ├── zhibo8 ├── run.sh ├── scrapy.cfg └── zhibo8 │ ├── README.md │ ├── __init__.py │ ├── items.py │ ├── pipelines.py │ ├── settings.py │ ├── spiders │ ├── __init__.py │ ├── example.py │ ├── hupu_news_spider.py │ ├── zhibo8_decrypt.py │ └── zhibo8_schedule_spider.py │ ├── utils │ ├── __init__.py │ └── mysqldriver.py │ └── zhibo8 ├── zhihu ├── scrapy.cfg └── zhihu │ ├── __init__.py │ ├── items.py │ ├── pipelines.py │ ├── redis-test.py │ ├── settings.py │ └── spiders │ ├── __init__.py │ └── zhihu_spider.py └── ziroom ├── scrapy.cfg └── ziroom ├── __init__.py ├── items.py ├── pipelines.py ├── settings.py └── spiders ├── __init__.py └── spider.py /.gitignore: -------------------------------------------------------------------------------- 1 | .idea/ 2 | *.swp 3 | .ipynb_checkpoints/ 4 | 5 | cscope.out 6 | download_file/ 7 | data_utf8.json 8 | # *.json 9 | .DS_Store 10 | dump.rdb 11 | 12 | 13 | # Byte-compiled / optimized / DLL files 14 | __pycache__/ 15 | *.py[cod] 16 | 17 | # C extensions 18 | *.so 19 | 20 | # Distribution / packaging 21 | bin/ 22 | build/ 23 | develop-eggs/ 24 | dist/ 25 | eggs/ 26 | lib/ 27 | lib64/ 28 | parts/ 29 | sdist/ 30 | var/ 31 | *.egg-info/ 32 | .installed.cfg 33 | *.egg 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | .tox/ 41 | .coverage 42 | .cache 43 | nosetests.xml 44 | coverage.xml 45 | 46 | # Translations 47 | *.mo 48 | 49 | # Mr Developer 50 | .mr.developer.cfg 51 | .project 52 | .pydevproject 53 | 54 | # Rope 55 | .ropeproject 56 | 57 | # Django stuff: 58 | *.log 59 | *.pot 60 | 61 | # Sphinx documentation 62 | docs/_build/# Byte-compiled / optimized / DLL files 63 | __pycache__/ 64 | *.py[cod] 65 | 66 | # C extensions 67 | *.so 68 | 69 | # Distribution / packaging 70 | bin/ 71 | build/ 72 | develop-eggs/ 73 | dist/ 74 | eggs/ 75 | 
lib/ 76 | lib64/ 77 | parts/ 78 | sdist/ 79 | var/ 80 | *.egg-info/ 81 | .installed.cfg 82 | *.egg 83 | 84 | # Installer logs 85 | pip-log.txt 86 | pip-delete-this-directory.txt 87 | 88 | # Unit test / coverage reports 89 | .tox/ 90 | .coverage 91 | .cache 92 | nosetests.xml 93 | coverage.xml 94 | 95 | # Translations 96 | *.mo 97 | 98 | # Mr Developer 99 | .mr.developer.cfg 100 | .project 101 | .pydevproject 102 | 103 | # Rope 104 | .ropeproject 105 | 106 | # Django stuff: 107 | *.log 108 | *.pot 109 | 110 | # Sphinx documentation 111 | docs/_build/ 112 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | scrapy-examples 2 | ============== 3 | 4 | Multifarious scrapy examples with integrated proxies and agents, which make you comfy to write a spider. 5 | 6 | Don't use it to do anything illegal! 7 | 8 | *** 9 | 10 | ## Real spider example: doubanbook 11 | 12 | #### Tutorial 13 | 14 | git clone https://github.com/geekan/scrapy-examples 15 | cd scrapy-examples/doubanbook 16 | scrapy crawl doubanbook 17 | 18 | #### Depth 19 | 20 | There are several depths in the spider, and the spider gets 21 | real data from depth2. 22 | 23 | - Depth0: The entrance is `http://book.douban.com/tag/` 24 | - Depth1: Urls like `http://book.douban.com/tag/外国文学` from depth0 25 | - Depth2: Urls like `http://book.douban.com/subject/1770782/` from depth1 26 | 27 | #### Example image 28 | ![douban book](https://raw.githubusercontent.com/geekan/scrapy-examples/master/doubanbook/sample.jpg) 29 | 30 | *** 31 | 32 | ## Avaiable Spiders 33 | 34 | * tutorial 35 | * dmoz_item 36 | * douban_book 37 | * page_recorder 38 | * douban_tag_book 39 | * doubanbook 40 | * linkedin 41 | * hrtencent 42 | * sis 43 | * zhihu 44 | * alexa 45 | * alexa 46 | * alexa.cn 47 | 48 | ## Advanced 49 | 50 | * Use `parse_with_rules` to write a spider quickly. 51 | See dmoz spider for more details. 52 | 53 | * Proxies 54 | * If you don't want to use proxy, just comment the proxy middleware in settings. 55 | * If you want to custom it, hack `misc/proxy.py` by yourself. 56 | 57 | * Notice 58 | * Don't use `parse` as your method name, it's an inner method of CrawlSpider. 59 | 60 | ### Advanced Usage 61 | 62 | * Run `./startproject.sh ` to start a new project. 63 | It will automatically generate most things, the only left things are: 64 | * `PROJECT/PROJECT/items.py` 65 | * `PROJECT/PROJECT/spider/spider.py` 66 | 67 | #### Example to hack `items.py` and `spider.py` 68 | 69 | Hacked `items.py` with additional fields `url` and `description`: 70 | ``` 71 | from scrapy.item import Item, Field 72 | 73 | class exampleItem(Item): 74 | url = Field() 75 | name = Field() 76 | description = Field() 77 | ``` 78 | 79 | Hacked `spider.py` with start rules and css rules (here only display the class exampleSpider): 80 | ``` 81 | class exampleSpider(CommonSpider): 82 | name = "dmoz" 83 | allowed_domains = ["dmoz.org"] 84 | start_urls = [ 85 | "http://www.dmoz.com/", 86 | ] 87 | # Crawler would start on start_urls, and follow the valid urls allowed by below rules. 
88 | rules = [ 89 | Rule(sle(allow=["/Arts/", "/Games/"]), callback='parse', follow=True), 90 | ] 91 | 92 | css_rules = { 93 | '.directory-url li': { 94 | '__use': 'dump', # dump data directly 95 | '__list': True, # it's a list 96 | 'url': 'li > a::attr(href)', 97 | 'name': 'a::text', 98 | 'description': 'li::text', 99 | } 100 | } 101 | 102 | def parse(self, response): 103 | info('Parse '+response.url) 104 | # parse_with_rules is implemented here: 105 | # https://github.com/geekan/scrapy-examples/blob/master/misc/spider.py 106 | self.parse_with_rules(response, self.css_rules, exampleItem) 107 | ``` 108 | 109 | -------------------------------------------------------------------------------- /alexa/alexa/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/geekan/scrapy-examples/edb1cb116bd6def65a6ef01f953b58eb43e54305/alexa/alexa/__init__.py -------------------------------------------------------------------------------- /alexa/alexa/items.py: -------------------------------------------------------------------------------- 1 | # Define here the models for your scraped items 2 | # 3 | # See documentation in: 4 | # http://doc.scrapy.org/en/latest/topics/items.html 5 | 6 | from scrapy.item import Item, Field 7 | 8 | class alexaSiteInfoItem(Item): 9 | # define the fields for your item here like: 10 | name = Field() 11 | url = Field() 12 | description = Field() 13 | category = Field() 14 | 15 | class alexaCategoryItem(Item): 16 | name = Field() 17 | url = Field() 18 | -------------------------------------------------------------------------------- /alexa/alexa/pipelines.py: -------------------------------------------------------------------------------- 1 | # Define your item pipelines here 2 | # 3 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 4 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html 5 | 6 | import redis 7 | 8 | 9 | from scrapy import signals 10 | 11 | 12 | import json 13 | import codecs 14 | from collections import OrderedDict 15 | 16 | 17 | class JsonWithEncodingPipeline(object): 18 | 19 | def __init__(self): 20 | self.file = codecs.open('data_utf8.json', 'w', encoding='utf-8') 21 | 22 | def process_item(self, item, spider): 23 | line = json.dumps(OrderedDict(item), ensure_ascii=False, sort_keys=False) + "\n" 24 | self.file.write(line) 25 | return item 26 | 27 | def close_spider(self, spider): 28 | self.file.close() 29 | 30 | 31 | class RedisPipeline(object): 32 | 33 | def __init__(self): 34 | self.r = redis.StrictRedis(host='localhost', port=6379) 35 | 36 | def process_item(self, item, spider): 37 | if not item['id']: 38 | print 'no id item!!' 39 | 40 | str_recorded_item = self.r.get(item['id']) 41 | final_item = None 42 | if str_recorded_item is None: 43 | final_item = item 44 | else: 45 | ritem = eval(self.r.get(item['id'])) 46 | final_item = dict(item.items() + ritem.items()) 47 | self.r.set(item['id'], final_item) 48 | 49 | def close_spider(self, spider): 50 | return 51 | -------------------------------------------------------------------------------- /alexa/alexa/settings.py: -------------------------------------------------------------------------------- 1 | # Scrapy settings for alexa project 2 | # 3 | # For simplicity, this file contains only the most important settings by 4 | # default. 
All the other settings are documented here: 5 | # 6 | # http://doc.scrapy.org/en/latest/topics/settings.html 7 | # 8 | 9 | import sys 10 | import os 11 | from os.path import dirname 12 | path = dirname(dirname(os.path.abspath(os.path.dirname(__file__)))) 13 | sys.path.append(path) 14 | from misc.log import * 15 | 16 | BOT_NAME = 'alexa' 17 | 18 | SPIDER_MODULES = ['alexa.spiders'] 19 | NEWSPIDER_MODULE = 'alexa.spiders' 20 | 21 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 22 | #USER_AGENT = 'alexa (+http://www.yourdomain.com)' 23 | 24 | DOWNLOADER_MIDDLEWARES = { 25 | # 'misc.middleware.CustomHttpProxyMiddleware': 400, 26 | 'misc.middleware.CustomUserAgentMiddleware': 401, 27 | } 28 | 29 | ITEM_PIPELINES = { 30 | 'alexa.pipelines.JsonWithEncodingPipeline': 300, 31 | #'alexa.pipelines.RedisPipeline': 301, 32 | } 33 | 34 | LOG_LEVEL = 'INFO' 35 | 36 | DOWNLOAD_DELAY = 1 37 | -------------------------------------------------------------------------------- /alexa/alexa/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /alexa/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # http://doc.scrapy.org/en/latest/topics/scrapyd.html 5 | 6 | [settings] 7 | default = alexa.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = alexa 12 | -------------------------------------------------------------------------------- /alexa_topsites/alexa_topsites/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/geekan/scrapy-examples/edb1cb116bd6def65a6ef01f953b58eb43e54305/alexa_topsites/alexa_topsites/__init__.py -------------------------------------------------------------------------------- /alexa_topsites/alexa_topsites/items.py: -------------------------------------------------------------------------------- 1 | # Define here the models for your scraped items 2 | # 3 | # See documentation in: 4 | # http://doc.scrapy.org/en/latest/topics/items.html 5 | 6 | from scrapy.item import Item, Field 7 | 8 | class alexa_topsitesItem(Item): 9 | # define the fields for your item here like: 10 | name = Field() 11 | 12 | -------------------------------------------------------------------------------- /alexa_topsites/alexa_topsites/pipelines.py: -------------------------------------------------------------------------------- 1 | # Define your item pipelines here 2 | # 3 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 4 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html 5 | 6 | import redis 7 | 8 | 9 | from scrapy import signals 10 | 11 | 12 | import json 13 | import codecs 14 | from collections import OrderedDict 15 | 16 | 17 | class JsonWithEncodingPipeline(object): 18 | 19 | def __init__(self): 20 | self.file = codecs.open('data_utf8.json', 'w', encoding='utf-8') 21 | 22 | def process_item(self, item, spider): 23 | line = json.dumps(OrderedDict(item), ensure_ascii=False, sort_keys=False) + "\n" 24 | self.file.write(line) 25 | return item 26 | 27 | def close_spider(self, spider): 28 | 
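# Called once when the spider finishes; flush and close the JSON output file.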
self.file.close() 29 | 30 | 31 | class RedisPipeline(object): 32 | 33 | def __init__(self): 34 | self.r = redis.StrictRedis(host='localhost', port=6379) 35 | 36 | def process_item(self, item, spider): 37 | if not item['id']: 38 | print 'no id item!!' 39 | 40 | str_recorded_item = self.r.get(item['id']) 41 | final_item = None 42 | if str_recorded_item is None: 43 | final_item = item 44 | else: 45 | ritem = eval(self.r.get(item['id'])) 46 | final_item = dict(item.items() + ritem.items()) 47 | self.r.set(item['id'], final_item) 48 | 49 | def close_spider(self, spider): 50 | return 51 | -------------------------------------------------------------------------------- /alexa_topsites/alexa_topsites/settings.py: -------------------------------------------------------------------------------- 1 | # Scrapy settings for alexa_topsites project 2 | # 3 | # For simplicity, this file contains only the most important settings by 4 | # default. All the other settings are documented here: 5 | # 6 | # http://doc.scrapy.org/en/latest/topics/settings.html 7 | # 8 | 9 | import sys 10 | import os 11 | from os.path import dirname 12 | path = dirname(dirname(os.path.abspath(os.path.dirname(__file__)))) 13 | sys.path.append(path) 14 | from misc.log import * 15 | 16 | BOT_NAME = 'alexa_topsites' 17 | 18 | SPIDER_MODULES = ['alexa_topsites.spiders'] 19 | NEWSPIDER_MODULE = 'alexa_topsites.spiders' 20 | 21 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 22 | #USER_AGENT = 'alexa_topsites (+http://www.yourdomain.com)' 23 | 24 | DOWNLOADER_MIDDLEWARES = { 25 | # 'misc.middleware.CustomHttpProxyMiddleware': 400, 26 | 'misc.middleware.CustomUserAgentMiddleware': 401, 27 | } 28 | 29 | ITEM_PIPELINES = { 30 | 'alexa_topsites.pipelines.JsonWithEncodingPipeline': 300, 31 | #'alexa_topsites.pipelines.RedisPipeline': 301, 32 | } 33 | 34 | LOG_LEVEL = 'INFO' 35 | 36 | DOWNLOAD_DELAY = 1 37 | -------------------------------------------------------------------------------- /alexa_topsites/alexa_topsites/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 
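Every project's `settings.py` pushes the repository root onto `sys.path` so the shared `misc` package (agents, proxies, logging, `CommonSpider`) can be imported, and then enables `misc.middleware.CustomUserAgentMiddleware` in `DOWNLOADER_MIDDLEWARES` (the proxy middleware is left commented out). The middleware source itself is not reproduced in this section, so the following is only a minimal sketch of what a random user-agent downloader middleware of that shape typically looks like; the `AGENTS` list and class name are illustrative, not the repository's exact code.

```
import random

# Assumed list of user-agent strings; the real project keeps its list in
# misc/agents.py, which is not shown in this section.
AGENTS = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12) AppleWebKit/603.1.30 (KHTML, like Gecko) Safari/603.1.30',
]

class RandomUserAgentMiddleware(object):
    """Sketch of the idea behind misc.middleware.CustomUserAgentMiddleware:
    pick a random user agent for every outgoing request."""

    def process_request(self, request, spider):
        request.headers['User-Agent'] = random.choice(AGENTS)
```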
5 | -------------------------------------------------------------------------------- /alexa_topsites/alexa_topsites/spiders/spider.py: -------------------------------------------------------------------------------- 1 | import re 2 | import json 3 | from urlparse import urlparse 4 | import urllib 5 | import pdb 6 | 7 | 8 | from scrapy.selector import Selector 9 | try: 10 | from scrapy.spiders import Spider 11 | except: 12 | from scrapy.spiders import BaseSpider as Spider 13 | from scrapy.utils.response import get_base_url 14 | from scrapy.spiders import CrawlSpider, Rule 15 | from scrapy.linkextractors import LinkExtractor as sle 16 | 17 | 18 | from alexa_topsites.items import * 19 | from misc.log import * 20 | from misc.spider import CommonSpider 21 | 22 | 23 | class alexa_topsitesSpider(CommonSpider): 24 | name = "alexa_topsites" 25 | allowed_domains = ["alexa.com"] 26 | start_urls = [ 27 | "http://www.alexa.com/topsites", 28 | ] 29 | rules = [ 30 | Rule(sle(allow=("http://www.alexa.com/topsites")), callback='parse_1', follow=True), 31 | ] 32 | 33 | list_css_rules = { 34 | '.site-listing': { 35 | 'rank': '.count::text', 36 | 'name': '.desc-paragraph a::text', 37 | 'desc': '.description::text' 38 | } 39 | } 40 | 41 | content_css_rules = { 42 | 'text': '#Cnt-Main-Article-QQ p *::text', 43 | 'images': '#Cnt-Main-Article-QQ img::attr(src)', 44 | 'images-desc': '#Cnt-Main-Article-QQ div p+ p::text', 45 | } 46 | 47 | def parse_1(self, response): 48 | info('Parse '+response.url) 49 | x = self.parse_with_rules(response, self.list_css_rules, dict) 50 | # x = self.parse_with_rules(response, self.content_css_rules, dict) 51 | print(json.dumps(x, ensure_ascii=False, indent=2)) 52 | # pp.pprint(x) 53 | # return self.parse_with_rules(response, self.css_rules, alexa_topsitesItem) 54 | -------------------------------------------------------------------------------- /alexa_topsites/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # http://doc.scrapy.org/en/latest/topics/scrapyd.html 5 | 6 | [settings] 7 | default = alexa_topsites.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = alexa_topsites 12 | -------------------------------------------------------------------------------- /amazonbook/amazonbook/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/geekan/scrapy-examples/edb1cb116bd6def65a6ef01f953b58eb43e54305/amazonbook/amazonbook/__init__.py -------------------------------------------------------------------------------- /amazonbook/amazonbook/items.py: -------------------------------------------------------------------------------- 1 | # Define here the models for your scraped items 2 | # 3 | # See documentation in: 4 | # http://doc.scrapy.org/en/latest/topics/items.html 5 | 6 | from scrapy.item import Item, Field 7 | 8 | class amazonbookItem(Item): 9 | # define the fields for your item here like: 10 | name = Field() 11 | 12 | -------------------------------------------------------------------------------- /amazonbook/amazonbook/pipelines.py: -------------------------------------------------------------------------------- 1 | # Define your item pipelines here 2 | # 3 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 4 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html 5 | 6 | import redis 7 | 8 | 9 | 
from scrapy import signals 10 | 11 | 12 | import json 13 | import codecs 14 | from collections import OrderedDict 15 | 16 | 17 | class JsonWithEncodingPipeline(object): 18 | 19 | def __init__(self): 20 | self.file = codecs.open('data_utf8.json', 'w', encoding='utf-8') 21 | 22 | def process_item(self, item, spider): 23 | line = json.dumps(OrderedDict(item), ensure_ascii=False, sort_keys=False) + "\n" 24 | self.file.write(line) 25 | return item 26 | 27 | def close_spider(self, spider): 28 | self.file.close() 29 | 30 | 31 | class RedisPipeline(object): 32 | 33 | def __init__(self): 34 | self.r = redis.StrictRedis(host='localhost', port=6379) 35 | 36 | def process_item(self, item, spider): 37 | if not item['id']: 38 | print 'no id item!!' 39 | 40 | str_recorded_item = self.r.get(item['id']) 41 | final_item = None 42 | if str_recorded_item is None: 43 | final_item = item 44 | else: 45 | ritem = eval(self.r.get(item['id'])) 46 | final_item = dict(item.items() + ritem.items()) 47 | self.r.set(item['id'], final_item) 48 | 49 | def close_spider(self, spider): 50 | return 51 | -------------------------------------------------------------------------------- /amazonbook/amazonbook/settings.py: -------------------------------------------------------------------------------- 1 | # Scrapy settings for amazonbook project 2 | # 3 | # For simplicity, this file contains only the most important settings by 4 | # default. All the other settings are documented here: 5 | # 6 | # http://doc.scrapy.org/en/latest/topics/settings.html 7 | # 8 | 9 | import sys 10 | import os 11 | from os.path import dirname 12 | path = dirname(dirname(os.path.abspath(os.path.dirname(__file__)))) 13 | sys.path.append(path) 14 | from misc.log import * 15 | 16 | BOT_NAME = 'amazonbook' 17 | 18 | SPIDER_MODULES = ['amazonbook.spiders'] 19 | NEWSPIDER_MODULE = 'amazonbook.spiders' 20 | 21 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 22 | #USER_AGENT = 'amazonbook (+http://www.yourdomain.com)' 23 | 24 | DOWNLOADER_MIDDLEWARES = { 25 | # 'misc.middleware.CustomHttpProxyMiddleware': 400, 26 | 'misc.middleware.CustomUserAgentMiddleware': 401, 27 | } 28 | 29 | ITEM_PIPELINES = { 30 | 'amazonbook.pipelines.JsonWithEncodingPipeline': 300, 31 | #'amazonbook.pipelines.RedisPipeline': 301, 32 | } 33 | 34 | LOG_LEVEL = 'INFO' 35 | 36 | DOWNLOAD_DELAY = 1 37 | -------------------------------------------------------------------------------- /amazonbook/amazonbook/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 
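The `JsonWithEncodingPipeline` / `RedisPipeline` pair above is repeated almost verbatim in most projects of this repository and is written for Python 2: it uses `print` statements, merges dicts with `item.items() + ritem.items()`, and `eval()`s the string stored in Redis. The following is a Python 3 sketch of the same idea, keeping the "merge with whatever was previously recorded under the same id" behaviour but storing JSON instead of `eval`-able reprs; it is an illustration, not a drop-in replacement shipped with the repository.

```
import json
import redis

class RedisMergePipeline(object):
    """Python 3 sketch of the recurring Redis pipeline."""

    def __init__(self):
        self.r = redis.StrictRedis(host='localhost', port=6379)

    def process_item(self, item, spider):
        if not item.get('id'):
            spider.logger.warning('item has no id: %r', item)
            return item
        merged = dict(item)
        recorded = self.r.get(item['id'])
        if recorded is not None:
            # dict(item.items() + ritem.items()) in the original lets the
            # previously stored values win, so merge in the same direction.
            merged.update(json.loads(recorded))
        self.r.set(item['id'], json.dumps(merged, ensure_ascii=False))
        return item
```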
5 | -------------------------------------------------------------------------------- /amazonbook/amazonbook/spiders/spider.py: -------------------------------------------------------------------------------- 1 | import re 2 | import json 3 | from urlparse import urlparse 4 | import urllib 5 | 6 | 7 | from scrapy.selector import Selector 8 | try: 9 | from scrapy.spiders import Spider 10 | except: 11 | from scrapy.spiders import BaseSpider as Spider 12 | from scrapy.utils.response import get_base_url 13 | from scrapy.spiders import CrawlSpider, Rule 14 | from scrapy.linkextractors import LinkExtractor as sle 15 | 16 | 17 | from amazonbook.items import * 18 | from misc.log import * 19 | from misc.spider import CommonSpider 20 | 21 | 22 | import pprint 23 | class MyPrettyPrinter(pprint.PrettyPrinter): 24 | def format(self, object, context, maxlevels, level): 25 | if isinstance(object, unicode): 26 | return (object.encode('utf8'), True, False) 27 | return pprint.PrettyPrinter.format(self, object, context, maxlevels, level) 28 | pp = MyPrettyPrinter() 29 | 30 | 31 | class amazonbookSpider(CommonSpider): 32 | name = "amazonbook" 33 | allowed_domains = ["amazon.com", "www.amazon.com"] 34 | start_urls = [ 35 | #"http://www.amazon.com/b/ref=s9_acss_bw_en_BGG15eve_d_1_6?_encoding=UTF8&node=17&pf_rd_m=ATVPDKIKX0DER&pf_rd_s=merchandised-search-top-3&pf_rd_r=0XCRZV6SDKBTKDPH8SFR&pf_rd_t=101&pf_rd_p=2293718502&pf_rd_i=283155", 36 | "http://www.amazon.com/books-used-books-textbooks/b?node=283155", 37 | ] 38 | rules = [ 39 | #Rule(sle(allow=("/gp/product/.*")), callback='parse_1', follow=True), 40 | Rule(sle(allow=("/books-used-books-textbooks/.*")), callback='parse_0', follow=True), 41 | ] 42 | 43 | css_rules = { 44 | ".inner .a-row": { 45 | "url": ".title::attr(href)", 46 | #"desc": "span::text" 47 | "title": ".s9TitleText::text", 48 | "comments": ".a-icon-row .a-size-small::text", 49 | } 50 | } 51 | 52 | def parse_0(self, response): 53 | info('Parse 0 '+response.url) 54 | pp.pprint(self.parse_with_rules(response, self.css_rules, dict)) 55 | 56 | #.inner .a-row 57 | def parse_1(self, response): 58 | info('Parse 1 '+response.url) 59 | #pp.pprint(self.parse_with_rules(response, self.css_rules, dict)) 60 | # return self.parse_with_rules(response, self.css_rules, amazonbookItem) 61 | -------------------------------------------------------------------------------- /amazonbook/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # http://doc.scrapy.org/en/latest/topics/scrapyd.html 5 | 6 | [settings] 7 | default = amazonbook.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = amazonbook 12 | -------------------------------------------------------------------------------- /clean.sh: -------------------------------------------------------------------------------- 1 | find . -name '*.pyc' | xargs rm 2 | -------------------------------------------------------------------------------- /delay.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | delay=${1:-1} 4 | 5 | #find . | grep -E "settings.py$" | xargs sed -i '' "s/DOWNLOAD_DEALY = [0-9]+/DOWNLOAD_DELAY=$1/g" 6 | if [[ "$OSTYPE" == "darwin"* ]]; then 7 | find . | grep -E "settings.py$" | xargs sed -E -i '' "s/DOWNLOAD_DELAY = [0-9]+/DOWNLOAD_DELAY = $delay/g" 8 | else 9 | find . 
| grep -E "settings.py$" | xargs sed -E -i "s/DOWNLOAD_DELAY = [0-9]+/DOWNLOAD_DELAY = $delay/g" 10 | fi 11 | -------------------------------------------------------------------------------- /dianping/dianping/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/geekan/scrapy-examples/edb1cb116bd6def65a6ef01f953b58eb43e54305/dianping/dianping/__init__.py -------------------------------------------------------------------------------- /dianping/dianping/items.py: -------------------------------------------------------------------------------- 1 | # Define here the models for your scraped items 2 | # 3 | # See documentation in: 4 | # http://doc.scrapy.org/en/latest/topics/items.html 5 | 6 | from scrapy.item import Item, Field 7 | 8 | class dianpingItem(Item): 9 | # define the fields for your item here like: 10 | name = Field() 11 | 12 | -------------------------------------------------------------------------------- /dianping/dianping/pipelines.py: -------------------------------------------------------------------------------- 1 | # Define your item pipelines here 2 | # 3 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 4 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html 5 | 6 | import redis 7 | 8 | 9 | from scrapy import signals 10 | 11 | 12 | import json 13 | import codecs 14 | from collections import OrderedDict 15 | 16 | 17 | class JsonWithEncodingPipeline(object): 18 | 19 | def __init__(self): 20 | self.file = codecs.open('data_utf8.json', 'w', encoding='utf-8') 21 | 22 | def process_item(self, item, spider): 23 | line = json.dumps(OrderedDict(item), ensure_ascii=False, sort_keys=False) + "\n" 24 | self.file.write(line) 25 | return item 26 | 27 | def close_spider(self, spider): 28 | self.file.close() 29 | 30 | 31 | class RedisPipeline(object): 32 | 33 | def __init__(self): 34 | self.r = redis.StrictRedis(host='localhost', port=6379) 35 | 36 | def process_item(self, item, spider): 37 | if not item['id']: 38 | print 'no id item!!' 39 | 40 | str_recorded_item = self.r.get(item['id']) 41 | final_item = None 42 | if str_recorded_item is None: 43 | final_item = item 44 | else: 45 | ritem = eval(self.r.get(item['id'])) 46 | final_item = dict(item.items() + ritem.items()) 47 | self.r.set(item['id'], final_item) 48 | 49 | def close_spider(self, spider): 50 | return 51 | -------------------------------------------------------------------------------- /dianping/dianping/settings.py: -------------------------------------------------------------------------------- 1 | # Scrapy settings for dianping project 2 | # 3 | # For simplicity, this file contains only the most important settings by 4 | # default. 
All the other settings are documented here: 5 | # 6 | # http://doc.scrapy.org/en/latest/topics/settings.html 7 | # 8 | 9 | import sys 10 | import os 11 | from os.path import dirname 12 | path = dirname(dirname(os.path.abspath(os.path.dirname(__file__)))) 13 | sys.path.append(path) 14 | from misc.log import * 15 | 16 | BOT_NAME = 'dianping' 17 | 18 | SPIDER_MODULES = ['dianping.spiders'] 19 | NEWSPIDER_MODULE = 'dianping.spiders' 20 | 21 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 22 | #USER_AGENT = 'dianping (+http://www.yourdomain.com)' 23 | 24 | DOWNLOADER_MIDDLEWARES = { 25 | # 'misc.middleware.CustomHttpProxyMiddleware': 400, 26 | 'misc.middleware.CustomUserAgentMiddleware': 401, 27 | } 28 | 29 | ITEM_PIPELINES = { 30 | 'dianping.pipelines.JsonWithEncodingPipeline': 300, 31 | #'dianping.pipelines.RedisPipeline': 301, 32 | } 33 | 34 | LOG_LEVEL = 'INFO' 35 | 36 | DOWNLOAD_DELAY = 1 37 | -------------------------------------------------------------------------------- /dianping/dianping/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /dianping/dianping/spiders/spider.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import requests 3 | from json import loads 4 | from scrapy.http import Request 5 | from scrapy.selector import Selector 6 | 7 | try: 8 | from scrapy.spiders import Spider 9 | except: 10 | from scrapy.spiders import BaseSpider as Spider 11 | 12 | from misc.spider import CommonSpider 13 | 14 | BAIDU_GEO = u'http://api.map.baidu.com/geocoder/v2/?address={}&output=json&ak=gQsCAgCrWsuN99ggSIjGn5nO' 15 | 16 | base_category_url = "http://www.dianping.com/search/category" 17 | 18 | start_url_dict = { 19 | u"足疗按摩": "/2/30/g141r1471", 20 | u"中医养生": "/2/30/g2827r1471", 21 | u"健康体检": "/2/80/g612", 22 | u"妇幼保健": "/2/70/g258", 23 | u"美容Spa": "/2/50/g158", 24 | u"整形塑体": "/2/85/g183", 25 | u"运动健身": "/2/45/g147", 26 | u"口腔健康": "/2/85/g182", 27 | u"药店": "/2/85/g235" 28 | } 29 | 30 | 31 | def clean_string(string): 32 | return string.replace(' ', '').replace('\n', '') if string else '' 33 | 34 | 35 | def address_to_geo(address): 36 | data = requests.get(BAIDU_GEO.format(address)).json() 37 | longitude = data['result']['location']['lng'] if 'result' in data else 120.260569 38 | latitude = data['result']['location']['lat'] if 'result' in data else 30.242865 39 | return {'longitude': longitude, 'latitude': latitude} 40 | 41 | 42 | class dianpingSpider(CommonSpider): 43 | name = "dianping" 44 | allowed_domains = ["dianping.com"] 45 | 46 | def start_requests(self): 47 | for k, v in start_url_dict.items(): 48 | for i in range(1, 3): 49 | url = base_category_url + v + 'p{}'.format(i) 50 | yield Request(url, callback=self.parse, meta={'category': k}) 51 | 52 | def parse(self, response): 53 | hxs = Selector(response) 54 | shops = hxs.xpath('//div[@class="tit"]/a/@href').extract() 55 | for shop in shops: 56 | if shop.startswith('/shop/'): 57 | yield Request("http://www.dianping.com{}".format(shop), callback=self.parse_shop, 58 | meta=response.request.meta) 59 | 60 | def parse_shop(self, response): 61 | shop = {} 62 | hxs = Selector(response) 63 | shop_name = 
hxs.css('.shop-name::text').extract_first() 64 | shop['name'] = clean_string(shop_name) 65 | address = hxs.css('.address span.item::text').extract_first() 66 | shop['address'] = clean_string(address) 67 | phone_number = hxs.css('.tel span.item::text').extract_first() 68 | shop['phone_number'] = clean_string(phone_number) 69 | path = u'//span[contains(text(), "营业时间:")]/following-sibling::span/text()' 70 | opening_hours = hxs.xpath(path).extract_first() 71 | shop['opening_hours'] = clean_string(opening_hours) 72 | geo = address_to_geo(address) 73 | shop.update(geo) 74 | store_images = hxs.xpath("//div[@class='photos-container']//img/@src").extract() 75 | shop['store_images'] = ','.join(store_images[:2]) 76 | deals = hxs.xpath("//div[@id='sales']//a/@href").extract() 77 | shop['deals'] = deals 78 | shop['category'] = response.request.meta['category'] 79 | return shop 80 | 81 | 82 | class dianpingDealSpider(CommonSpider): 83 | name = "dianping-deal" 84 | allowed_domains = ["dianping.com"] 85 | 86 | def start_requests(self): 87 | with open('partner.json', 'rb') as f: 88 | for line in f: 89 | data = loads(line) 90 | for url in data['deals']: 91 | yield Request(url, callback=self.parse, meta={'category': data['category'], 92 | 'partner': data['name']}) 93 | break 94 | 95 | def parse(self, response): 96 | deal = {} 97 | hxs = Selector(response) 98 | bd = hxs.css('.bd') 99 | name = bd.css('.title::text').extract_first() 100 | deal['name'] = clean_string(name) 101 | description = bd.css('.sub-title span::text').extract_first() 102 | deal['description'] = clean_string(description) 103 | price = bd.css('.price-display::text').extract_first() 104 | deal['price'] = clean_string(price) 105 | # it's dynamic 106 | # images = hxs.xpath('//div[@class="img-area"]//img/@src').extract() 107 | # deal['images'] = ','.join(images[:2]) 108 | deal['category'] = response.request.meta['category'] 109 | deal['partner'] = response.request.meta['partner'] 110 | return deal 111 | -------------------------------------------------------------------------------- /dianping/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # http://doc.scrapy.org/en/latest/topics/scrapyd.html 5 | 6 | [settings] 7 | default = dianping.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = dianping 12 | -------------------------------------------------------------------------------- /dmoz/dmoz/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/geekan/scrapy-examples/edb1cb116bd6def65a6ef01f953b58eb43e54305/dmoz/dmoz/__init__.py -------------------------------------------------------------------------------- /dmoz/dmoz/items.py: -------------------------------------------------------------------------------- 1 | # Define here the models for your scraped items 2 | # 3 | # See documentation in: 4 | # http://doc.scrapy.org/en/latest/topics/items.html 5 | 6 | from scrapy.item import Item, Field 7 | 8 | class dmozItem(Item): 9 | # define the fields for your item here like: 10 | url = Field() 11 | name = Field() 12 | description = Field() 13 | 14 | -------------------------------------------------------------------------------- /dmoz/dmoz/pipelines.py: -------------------------------------------------------------------------------- 1 | # Define your item pipelines here 2 | # 3 | # Don't forget 
to add your pipeline to the ITEM_PIPELINES setting 4 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html 5 | 6 | import redis 7 | 8 | 9 | from scrapy import signals 10 | 11 | 12 | import json 13 | import codecs 14 | from collections import OrderedDict 15 | 16 | 17 | class JsonWithEncodingPipeline(object): 18 | 19 | def __init__(self): 20 | self.file = codecs.open('data_utf8.json', 'w', encoding='utf-8') 21 | 22 | def process_item(self, item, spider): 23 | line = json.dumps(OrderedDict(item), ensure_ascii=False, sort_keys=False) + "\n" 24 | self.file.write(line) 25 | return item 26 | 27 | def close_spider(self, spider): 28 | self.file.close() 29 | 30 | 31 | class RedisPipeline(object): 32 | 33 | def __init__(self): 34 | self.r = redis.StrictRedis(host='localhost', port=6379) 35 | 36 | def process_item(self, item, spider): 37 | if not item['id']: 38 | print 'no id item!!' 39 | 40 | str_recorded_item = self.r.get(item['id']) 41 | final_item = None 42 | if str_recorded_item is None: 43 | final_item = item 44 | else: 45 | ritem = eval(self.r.get(item['id'])) 46 | final_item = dict(item.items() + ritem.items()) 47 | self.r.set(item['id'], final_item) 48 | 49 | def close_spider(self, spider): 50 | return 51 | -------------------------------------------------------------------------------- /dmoz/dmoz/settings.py: -------------------------------------------------------------------------------- 1 | # Scrapy settings for dmoz project 2 | # 3 | # For simplicity, this file contains only the most important settings by 4 | # default. All the other settings are documented here: 5 | # 6 | # http://doc.scrapy.org/en/latest/topics/settings.html 7 | # 8 | 9 | import sys 10 | import os 11 | from os.path import dirname 12 | path = dirname(dirname(os.path.abspath(os.path.dirname(__file__)))) 13 | sys.path.append(path) 14 | from misc.log import * 15 | 16 | BOT_NAME = 'dmoz' 17 | 18 | SPIDER_MODULES = ['dmoz.spiders'] 19 | NEWSPIDER_MODULE = 'dmoz.spiders' 20 | 21 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 22 | #USER_AGENT = 'dmoz (+http://www.yourdomain.com)' 23 | 24 | DOWNLOADER_MIDDLEWARES = { 25 | # 'misc.middleware.CustomHttpProxyMiddleware': 400, 26 | 'misc.middleware.CustomUserAgentMiddleware': 401, 27 | } 28 | 29 | ITEM_PIPELINES = { 30 | 'dmoz.pipelines.JsonWithEncodingPipeline': 300, 31 | #'dmoz.pipelines.RedisPipeline': 301, 32 | } 33 | 34 | LOG_LEVEL = 'INFO' 35 | 36 | DOWNLOAD_DELAY = 1 37 | -------------------------------------------------------------------------------- /dmoz/dmoz/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 
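Most spiders in this repository (dianping above, dmoz and the others below) subclass `CommonSpider` and hand a rule dictionary to `parse_with_rules`. `misc/spider.py`, where that helper lives, appears in the directory tree but is not reproduced in this section, so the snippet below is only a rough approximation of what such a rule-driven extractor amounts to in plain Scrapy selector calls: the top-level key selects the repeating node, the inner keys map item fields to CSS queries.

```
def extract_with_rules(selector, rules):
    """Apply {css_of_repeating_node: {field: css_query, ...}} rules.
    Approximation of CommonSpider.parse_with_rules, for illustration only."""
    results = []
    for node_query, field_rules in rules.items():
        for node in selector.css(node_query):
            row = {}
            for field, query in field_rules.items():
                if field.startswith('__'):
                    # control keys such as '__use' and '__list' are handled
                    # by the real helper; skipped in this sketch
                    continue
                row[field] = node.css(query).extract()
            results.append(row)
    return results
```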
5 | -------------------------------------------------------------------------------- /dmoz/dmoz/spiders/spider.py: -------------------------------------------------------------------------------- 1 | import re 2 | import json 3 | from urlparse import urlparse 4 | import urllib 5 | 6 | 7 | from scrapy.selector import Selector 8 | try: 9 | from scrapy.spiders import Spider 10 | except: 11 | from scrapy.spiders import BaseSpider as Spider 12 | from scrapy.utils.response import get_base_url 13 | from scrapy.spiders import CrawlSpider, Rule 14 | from scrapy.linkextractors import LinkExtractor as sle 15 | 16 | 17 | from dmoz.items import * 18 | from misc.log import * 19 | from misc.spider import CommonSpider 20 | 21 | 22 | class dmozSpider(CommonSpider): 23 | name = "dmoz" 24 | allowed_domains = ["dmoz.org"] 25 | start_urls = [ 26 | "http://www.dmoz.org/", 27 | ] 28 | valid_categories = [ 29 | 'Arts', 'Business', 'Computers', 'Games', 'Health', 'Home', 30 | 'Kids_and_Teens', 'News', 'Recreation', 'Reference', 'Regional', 'Science', 31 | 'Shopping', 'Society', 'Sports', 32 | ] 33 | allow_rules = ['/'+i+'/' for i in valid_categories] 34 | rules = [ 35 | Rule(sle(allow=allow_rules), callback='parse_1', follow=True), 36 | ] 37 | 38 | item_rules = { 39 | '.directory-url li': { 40 | '__use': 'dump', 41 | '__list': True, 42 | 'url': 'li > a::attr(href)', 43 | 'name': 'a::text', 44 | 'description': 'li::text', 45 | } 46 | } 47 | 48 | def parse_1(self, response): 49 | info('Parse depth 1 '+response.url) 50 | items = self.parse_with_rules(response, self.item_rules, dmozItem) 51 | return items 52 | -------------------------------------------------------------------------------- /dmoz/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # http://doc.scrapy.org/en/latest/topics/scrapyd.html 5 | 6 | [settings] 7 | default = dmoz.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = dmoz 12 | -------------------------------------------------------------------------------- /doubanbook/doubanbook/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/geekan/scrapy-examples/edb1cb116bd6def65a6ef01f953b58eb43e54305/doubanbook/doubanbook/__init__.py -------------------------------------------------------------------------------- /doubanbook/doubanbook/items.py: -------------------------------------------------------------------------------- 1 | # Define here the models for your scraped items 2 | # 3 | # See documentation in: 4 | # http://doc.scrapy.org/en/latest/topics/items.html 5 | 6 | from scrapy.item import Item, Field 7 | 8 | class DoubanbookItem(Item): 9 | # define the fields for your item here like: 10 | # name = Field() 11 | title = Field() 12 | link = Field() 13 | desc = Field() 14 | num = Field() 15 | 16 | 17 | class DoubanSubjectItem(Item): 18 | title = Field() 19 | link = Field() 20 | info = Field() 21 | rate = Field() 22 | votes = Field() 23 | content_intro = Field() 24 | author_intro = Field() 25 | tags = Field() 26 | -------------------------------------------------------------------------------- /doubanbook/doubanbook/pipelines.py: -------------------------------------------------------------------------------- 1 | # Define your item pipelines here 2 | # 3 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 4 | # See: 
http://doc.scrapy.org/en/latest/topics/item-pipeline.html 5 | 6 | import redis 7 | 8 | 9 | from scrapy import signals 10 | 11 | 12 | import json 13 | import codecs 14 | from collections import OrderedDict 15 | 16 | 17 | class JsonWithEncodingPipeline(object): 18 | 19 | def __init__(self): 20 | self.file = codecs.open('data_utf8.json', 'w', encoding='utf-8') 21 | 22 | def process_item(self, item, spider): 23 | line = json.dumps(OrderedDict(item), ensure_ascii=False, sort_keys=False) + "\n" 24 | self.file.write(line) 25 | return item 26 | 27 | def close_spider(self, spider): 28 | self.file.close() 29 | 30 | 31 | class RedisPipeline(object): 32 | 33 | def __init__(self): 34 | self.r = redis.StrictRedis(host='localhost', port=6379) 35 | 36 | def process_item(self, item, spider): 37 | if not item['id']: 38 | print 'no id item!!' 39 | 40 | str_recorded_item = self.r.get(item['id']) 41 | final_item = None 42 | if str_recorded_item is None: 43 | final_item = item 44 | else: 45 | ritem = eval(self.r.get(item['id'])) 46 | final_item = dict(item.items() + ritem.items()) 47 | self.r.set(item['id'], final_item) 48 | 49 | def spider_closed(self, spider): 50 | return 51 | -------------------------------------------------------------------------------- /doubanbook/doubanbook/settings.py: -------------------------------------------------------------------------------- 1 | # Scrapy settings for doubanbook project 2 | # 3 | # For simplicity, this file contains only the most important settings by 4 | # default. All the other settings are documented here: 5 | # 6 | # http://doc.scrapy.org/en/latest/topics/settings.html 7 | # 8 | 9 | import sys 10 | import os 11 | from os.path import dirname 12 | path = dirname(dirname(os.path.abspath(os.path.dirname(__file__)))) 13 | sys.path.append(path) 14 | 15 | BOT_NAME = 'doubanbook' 16 | 17 | SPIDER_MODULES = ['doubanbook.spiders'] 18 | NEWSPIDER_MODULE = 'doubanbook.spiders' 19 | 20 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 21 | #USER_AGENT = 'doubanbook (+http://www.yourdomain.com)' 22 | 23 | DOWNLOADER_MIDDLEWARES = { 24 | #'misc.middleware.CustomHttpProxyMiddleware': 400, 25 | 'misc.middleware.CustomUserAgentMiddleware': 401, 26 | } 27 | 28 | ITEM_PIPELINES = { 29 | 'doubanbook.pipelines.JsonWithEncodingPipeline': 300, 30 | #'template.pipelines.RedisPipeline': 301, 31 | } 32 | 33 | LOG_LEVEL = 'INFO' 34 | 35 | -------------------------------------------------------------------------------- /doubanbook/doubanbook/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 
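The `JsonWithEncodingPipeline` enabled above writes one JSON object per line into `data_utf8.json` in the directory the crawl is started from (for example after `scrapy crawl doubanbook` in the doubanbook project). Reading the output back is therefore a line-by-line `json.loads`; the snippet below is a usage sketch, not code taken from the repository.

```
import json

# Read the newline-delimited JSON written by JsonWithEncodingPipeline.
items = []
with open('data_utf8.json', encoding='utf-8') as f:
    for line in f:
        line = line.strip()
        if line:
            items.append(json.loads(line))
print(len(items), 'items loaded')
```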
5 | -------------------------------------------------------------------------------- /doubanbook/doubanbook/spiders/douban_spider.py: -------------------------------------------------------------------------------- 1 | import re 2 | import json 3 | 4 | 5 | from scrapy.selector import Selector 6 | try: 7 | from scrapy.spiders import Spider 8 | except: 9 | from scrapy.spiders import BaseSpider as Spider 10 | from scrapy.utils.response import get_base_url 11 | from scrapy.spiders import CrawlSpider, Rule 12 | from scrapy.linkextractors import LinkExtractor as sle 13 | 14 | 15 | from doubanbook.items import * 16 | from misc.log import * 17 | 18 | 19 | class DoubanBookSpider(CrawlSpider): 20 | name = "doubanbook" 21 | allowed_domains = ["douban.com"] 22 | start_urls = [ 23 | "https://book.douban.com/tag/" 24 | ] 25 | rules = [ 26 | Rule(sle(allow=("/subject/\d+$")), callback='parse_2'), 27 | Rule(sle(allow=("/tag/[^/]+$", )), follow=True), 28 | #Rule(sle(allow=("/tag/$", )), follow=True), 29 | ] 30 | 31 | def parse_2(self, response): 32 | items = [] 33 | sel = Selector(response) 34 | sites = sel.css('#wrapper') 35 | for site in sites: 36 | item = DoubanSubjectItem() 37 | item['title'] = site.css('h1 span::text').extract() 38 | item['link'] = response.url 39 | item['content_intro'] = site.css('#link-report .intro p::text').extract() 40 | items.append(item) 41 | # print repr(item).decode("unicode-escape") + '\n' 42 | print item 43 | # info('parsed ' + str(response)) 44 | return items 45 | 46 | def parse_1(self, response): 47 | # url cannot encode to Chinese easily.. XXX 48 | info('parsed ' + str(response)) 49 | 50 | def process_request(self, request): 51 | info('process ' + str(request)) 52 | return request 53 | 54 | def closed(self, reason): 55 | info("DoubanBookSpider Closed:" + reason) 56 | -------------------------------------------------------------------------------- /doubanbook/sample.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/geekan/scrapy-examples/edb1cb116bd6def65a6ef01f953b58eb43e54305/doubanbook/sample.jpg -------------------------------------------------------------------------------- /doubanbook/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # http://doc.scrapy.org/en/latest/topics/scrapyd.html 5 | 6 | [settings] 7 | default = doubanbook.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = doubanbook 12 | -------------------------------------------------------------------------------- /doubanmovie/doubanmovie/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/geekan/scrapy-examples/edb1cb116bd6def65a6ef01f953b58eb43e54305/doubanmovie/doubanmovie/__init__.py -------------------------------------------------------------------------------- /doubanmovie/doubanmovie/items.py: -------------------------------------------------------------------------------- 1 | # Define here the models for your scraped items 2 | # 3 | # See documentation in: 4 | # http://doc.scrapy.org/en/latest/topics/items.html 5 | 6 | from scrapy.item import Item, Field 7 | 8 | class doubanmovieItem(Item): 9 | # define the fields for your item here like: 10 | name = Field() 11 | 12 | -------------------------------------------------------------------------------- 
/doubanmovie/doubanmovie/pipelines.py: -------------------------------------------------------------------------------- 1 | # Define your item pipelines here 2 | # 3 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 4 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html 5 | 6 | import redis 7 | 8 | 9 | from scrapy import signals 10 | 11 | 12 | import json 13 | import codecs 14 | from collections import OrderedDict 15 | 16 | 17 | class JsonWithEncodingPipeline(object): 18 | 19 | def __init__(self): 20 | self.file = codecs.open('data_utf8.json', 'w', encoding='utf-8') 21 | 22 | def process_item(self, item, spider): 23 | line = json.dumps(OrderedDict(item), ensure_ascii=False, sort_keys=False) + "\n" 24 | self.file.write(line) 25 | return item 26 | 27 | def close_spider(self, spider): 28 | print("JsonWithEncodingPipeline closed") 29 | self.file.close() 30 | 31 | def open_spider(self, spider): 32 | print("JsonWithEncodingPipeline opend") 33 | 34 | 35 | class RedisPipeline(object): 36 | 37 | def __init__(self): 38 | self.r = redis.StrictRedis(host='localhost', port=6379) 39 | 40 | def process_item(self, item, spider): 41 | if not item['id']: 42 | print 'no id item!!' 43 | 44 | str_recorded_item = self.r.get(item['id']) 45 | final_item = None 46 | if str_recorded_item is None: 47 | final_item = item 48 | else: 49 | ritem = eval(self.r.get(item['id'])) 50 | final_item = dict(item.items() + ritem.items()) 51 | self.r.set(item['id'], final_item) 52 | 53 | def close_spider(self, spider): 54 | return 55 | -------------------------------------------------------------------------------- /doubanmovie/doubanmovie/settings.py: -------------------------------------------------------------------------------- 1 | # Scrapy settings for doubanmovie project 2 | # 3 | # For simplicity, this file contains only the most important settings by 4 | # default. All the other settings are documented here: 5 | # 6 | # http://doc.scrapy.org/en/latest/topics/settings.html 7 | # 8 | 9 | import sys 10 | import os 11 | from os.path import dirname 12 | path = dirname(dirname(os.path.abspath(os.path.dirname(__file__)))) 13 | sys.path.append(path) 14 | from misc.log import * 15 | 16 | BOT_NAME = 'doubanmovie' 17 | 18 | SPIDER_MODULES = ['doubanmovie.spiders'] 19 | NEWSPIDER_MODULE = 'doubanmovie.spiders' 20 | 21 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 22 | #USER_AGENT = 'doubanmovie (+http://www.yourdomain.com)' 23 | 24 | DOWNLOADER_MIDDLEWARES = { 25 | #'misc.middleware.CustomHttpProxyMiddleware': 400, 26 | 'misc.middleware.CustomUserAgentMiddleware': 401, 27 | } 28 | 29 | ITEM_PIPELINES = { 30 | 'doubanmovie.pipelines.JsonWithEncodingPipeline': 300, 31 | #'doubanmovie.pipelines.RedisPipeline': 301, 32 | } 33 | 34 | LOG_LEVEL = 'INFO' 35 | 36 | DOWNLOAD_DELAY = 1 37 | -------------------------------------------------------------------------------- /doubanmovie/doubanmovie/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 
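Like the doubanbook spider above, the doubanmovie spider that follows is a `CrawlSpider`: each entry in `rules` pairs a `LinkExtractor` (imported as `sle`) with a callback, and Scrapy follows every extracted link automatically. The `allow` patterns are plain regular expressions searched against candidate URLs, so they can be checked in isolation; the snippet below is illustrative and the sample URLs are made up to match the pattern, not taken from a crawl.

```
import re

# Same kind of allow-pattern as in the doubanmovie rules.
allow = r"/subject/[0-9]+/$"
urls = [
    "https://movie.douban.com/subject/1234567/",   # matches -> followed
    "https://movie.douban.com/chart",              # no match -> ignored
]
for url in urls:
    print(url, bool(re.search(allow, url)))
```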
5 | -------------------------------------------------------------------------------- /doubanmovie/doubanmovie/spiders/spider.py: -------------------------------------------------------------------------------- 1 | import re 2 | import json 3 | from urlparse import urlparse 4 | import urllib 5 | import pdb 6 | 7 | 8 | from scrapy.selector import Selector 9 | try: 10 | from scrapy.spiders import Spider 11 | except: 12 | from scrapy.spiders import BaseSpider as Spider 13 | from scrapy.utils.response import get_base_url 14 | from scrapy.spiders import CrawlSpider, Rule 15 | from scrapy.linkextractors import LinkExtractor as sle 16 | 17 | 18 | from doubanmovie.items import * 19 | from misc.log import * 20 | from misc.spider import CommonSpider 21 | 22 | 23 | class doubanmovieSpider(CommonSpider): 24 | name = "doubanmovie" 25 | allowed_domains = ["douban.com"] 26 | start_urls = [ 27 | #"https://movie.douban.com/tag/", 28 | "https://movie.douban.com/chart" 29 | ] 30 | rules = [ 31 | #Rule(sle(allow=("/tag/[0-9]{4}$")), follow=True), 32 | #Rule(sle(allow=("/tag/[0-9]{4}/?start=[0-9]{2,4}&type=T$")), follow=True), 33 | #Rule(sle(allow=("/subject/[0-9]+$")), callback='parse_1'), 34 | Rule(sle(allow=("/subject/[0-9]+/$")), callback='parse_1', follow=True), 35 | ] 36 | 37 | list_css_rules = { 38 | '.linkto': { 39 | 'url': 'a::attr(href)', 40 | 'name': 'a::text', 41 | } 42 | } 43 | 44 | list_css_rules_2 = { 45 | '#listZone .Q-tpWrap': { 46 | 'url': '.linkto::attr(href)', 47 | 'name': '.linkto::text' 48 | } 49 | } 50 | 51 | content_css_rules = { 52 | 'rating_per': '.rating_per::text', 53 | 'rating_num': '.rating_num::text', 54 | 'title': 'h1 span:nth-child(1)::text', 55 | 'rating_people': '.rating_people span::text', 56 | } 57 | 58 | def parse_1(self, response): 59 | info('Parse '+response.url) 60 | x = self.parse_with_rules(response, self.content_css_rules, dict) 61 | return x 62 | #print(repr(x).decode('raw_unicode_escape')) 63 | # return self.parse_with_rules(response, self.css_rules, doubanmovieItem) 64 | -------------------------------------------------------------------------------- /doubanmovie/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # http://doc.scrapy.org/en/latest/topics/scrapyd.html 5 | 6 | [settings] 7 | default = doubanmovie.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = doubanmovie 12 | -------------------------------------------------------------------------------- /douyu/douyu/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/geekan/scrapy-examples/edb1cb116bd6def65a6ef01f953b58eb43e54305/douyu/douyu/__init__.py -------------------------------------------------------------------------------- /douyu/douyu/items.py: -------------------------------------------------------------------------------- 1 | # Define here the models for your scraped items 2 | # 3 | # See documentation in: 4 | # http://doc.scrapy.org/en/latest/topics/items.html 5 | 6 | from scrapy.item import Item, Field 7 | 8 | class douyuItem(Item): 9 | # define the fields for your item here like: 10 | url = Field() 11 | room_name = Field() 12 | people_count = Field() 13 | tag = Field() 14 | -------------------------------------------------------------------------------- /douyu/douyu/pipelines.py: 
-------------------------------------------------------------------------------- 1 | # Define your item pipelines here 2 | # 3 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 4 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html 5 | 6 | import redis 7 | 8 | 9 | from scrapy import signals 10 | 11 | 12 | import json 13 | import codecs 14 | from collections import OrderedDict 15 | 16 | 17 | class JsonWithEncodingPipeline(object): 18 | 19 | def __init__(self): 20 | self.file = codecs.open('data_utf8.json', 'w', encoding='utf-8') 21 | 22 | def process_item(self, item, spider): 23 | line = json.dumps(OrderedDict(item), ensure_ascii=False, sort_keys=False) + "\n" 24 | self.file.write(line) 25 | return item 26 | 27 | def close_spider(self, spider): 28 | self.file.close() 29 | 30 | 31 | class RedisPipeline(object): 32 | 33 | def __init__(self): 34 | self.r = redis.StrictRedis(host='localhost', port=6379) 35 | 36 | def process_item(self, item, spider): 37 | if not item['id']: 38 | print 'no id item!!' 39 | 40 | str_recorded_item = self.r.get(item['id']) 41 | final_item = None 42 | if str_recorded_item is None: 43 | final_item = item 44 | else: 45 | ritem = eval(self.r.get(item['id'])) 46 | final_item = dict(item.items() + ritem.items()) 47 | self.r.set(item['id'], final_item) 48 | 49 | def close_spider(self, spider): 50 | return 51 | -------------------------------------------------------------------------------- /douyu/douyu/settings.py: -------------------------------------------------------------------------------- 1 | # Scrapy settings for douyu project 2 | # 3 | # For simplicity, this file contains only the most important settings by 4 | # default. All the other settings are documented here: 5 | # 6 | # http://doc.scrapy.org/en/latest/topics/settings.html 7 | # 8 | 9 | import sys 10 | import os 11 | from os.path import dirname 12 | path = dirname(dirname(os.path.abspath(os.path.dirname(__file__)))) 13 | sys.path.append(path) 14 | from misc.log import * 15 | 16 | BOT_NAME = 'douyu' 17 | 18 | SPIDER_MODULES = ['douyu.spiders'] 19 | NEWSPIDER_MODULE = 'douyu.spiders' 20 | 21 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 22 | #USER_AGENT = 'douyu (+http://www.yourdomain.com)' 23 | 24 | DOWNLOADER_MIDDLEWARES = { 25 | # 'misc.middleware.CustomHttpProxyMiddleware': 400, 26 | 'misc.middleware.CustomUserAgentMiddleware': 401, 27 | } 28 | 29 | ITEM_PIPELINES = { 30 | 'douyu.pipelines.JsonWithEncodingPipeline': 300, 31 | #'douyu.pipelines.RedisPipeline': 301, 32 | } 33 | 34 | LOG_LEVEL = 'INFO' 35 | 36 | DOWNLOAD_DELAY = 1 37 | -------------------------------------------------------------------------------- /douyu/douyu/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 
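# The spiders in this section (doubanmovie above, douyu below) pass nested
# dictionaries of CSS selectors to CommonSpider.parse_with_rules, which is
# defined in misc/spider.py and not reproduced here. The helper below is only
# a rough sketch of that idea -- a container selector mapped to
# field-name/selector pairs, yielding one dict per matched container -- and is
# an assumption about the mechanism, not the project's actual implementation.
def parse_with_css_rules(response, list_css_rules):
    """Return a list of dicts, one per element matched by the outer selector."""
    results = []
    for container_css, field_rules in list_css_rules.items():
        for node in response.css(container_css):
            row = {}
            for field, css in field_rules.items():
                # Selector.css(...).extract() always returns a list of strings
                row[field] = node.css(css).extract()
            results.append(row)
    return results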
5 | -------------------------------------------------------------------------------- /douyu/douyu/spiders/spider.py: -------------------------------------------------------------------------------- 1 | import re 2 | import json 3 | from urlparse import urlparse 4 | import urllib 5 | import pdb 6 | 7 | 8 | from scrapy.selector import Selector 9 | try: 10 | from scrapy.spiders import Spider 11 | except: 12 | from scrapy.spiders import BaseSpider as Spider 13 | from scrapy.utils.response import get_base_url 14 | from scrapy.spiders import CrawlSpider, Rule 15 | from scrapy.linkextractors import LinkExtractor as sle 16 | 17 | 18 | from douyu.items import * 19 | from misc.log import * 20 | from misc.spider import CommonSpider 21 | 22 | 23 | class douyuSpider(CommonSpider): 24 | name = "douyu" 25 | allowed_domains = ["douyu.com"] 26 | start_urls = [ 27 | "http://www.douyu.com/directory/all" 28 | ] 29 | rules = [ 30 | Rule(sle(allow=("http://www.douyu.com/directory/all")), callback='parse_1', follow=True), 31 | ] 32 | 33 | list_css_rules = { 34 | '#live-list-contentbox li': { 35 | 'url': 'a::attr(href)', 36 | 'room_name': 'a::attr(title)', 37 | 'tag': 'span.tag.ellipsis::text', 38 | 'people_count': '.dy-num.fr::text' 39 | } 40 | } 41 | 42 | list_css_rules_for_item = { 43 | '#live-list-contentbox li': { 44 | '__use': '1', 45 | '__list': '1', 46 | 'url': 'a::attr(href)', 47 | 'room_name': 'a::attr(title)', 48 | 'tag': 'span.tag.ellipsis::text', 49 | 'people_count': '.dy-num.fr::text' 50 | } 51 | } 52 | 53 | 54 | def parse_1(self, response): 55 | info('Parse '+response.url) 56 | #x = self.parse_with_rules(response, self.list_css_rules, dict) 57 | x = self.parse_with_rules(response, self.list_css_rules_for_item, douyuItem) 58 | print(len(x)) 59 | # print(json.dumps(x, ensure_ascii=False, indent=2)) 60 | # pp.pprint(x) 61 | # return self.parse_with_rules(response, self.list_css_rules, douyuItem) 62 | return x 63 | -------------------------------------------------------------------------------- /douyu/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # http://doc.scrapy.org/en/latest/topics/scrapyd.html 5 | 6 | [settings] 7 | default = douyu.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = douyu 12 | -------------------------------------------------------------------------------- /general_spider/general_spider/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/geekan/scrapy-examples/edb1cb116bd6def65a6ef01f953b58eb43e54305/general_spider/general_spider/__init__.py -------------------------------------------------------------------------------- /general_spider/general_spider/items.py: -------------------------------------------------------------------------------- 1 | # Define here the models for your scraped items 2 | # 3 | # See documentation in: 4 | # http://doc.scrapy.org/en/latest/topics/items.html 5 | 6 | from scrapy.item import Item, Field 7 | 8 | class general_spiderItem(Item): 9 | # define the fields for your item here like: 10 | name = Field() 11 | 12 | -------------------------------------------------------------------------------- /general_spider/general_spider/pipelines.py: -------------------------------------------------------------------------------- 1 | # Define your item pipelines here 2 | # 3 | # Don't forget to add your 
pipeline to the ITEM_PIPELINES setting 4 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html 5 | 6 | import redis 7 | 8 | 9 | from scrapy import signals 10 | 11 | 12 | import json 13 | import codecs 14 | from collections import OrderedDict 15 | 16 | 17 | class JsonWithEncodingPipeline(object): 18 | 19 | def __init__(self): 20 | self.file = codecs.open('data_utf8.json', 'w', encoding='utf-8') 21 | 22 | def process_item(self, item, spider): 23 | line = json.dumps(OrderedDict(item), ensure_ascii=False, sort_keys=False) + "\n" 24 | self.file.write(line) 25 | return item 26 | 27 | def close_spider(self, spider): 28 | self.file.close() 29 | 30 | 31 | class RedisPipeline(object): 32 | 33 | def __init__(self): 34 | self.r = redis.StrictRedis(host='localhost', port=6379) 35 | 36 | def process_item(self, item, spider): 37 | if not item['id']: 38 | print 'no id item!!' 39 | 40 | str_recorded_item = self.r.get(item['id']) 41 | final_item = None 42 | if str_recorded_item is None: 43 | final_item = item 44 | else: 45 | ritem = eval(self.r.get(item['id'])) 46 | final_item = dict(item.items() + ritem.items()) 47 | self.r.set(item['id'], final_item) 48 | 49 | def close_spider(self, spider): 50 | return 51 | -------------------------------------------------------------------------------- /general_spider/general_spider/settings.py: -------------------------------------------------------------------------------- 1 | # Scrapy settings for general_spider project 2 | # 3 | # For simplicity, this file contains only the most important settings by 4 | # default. All the other settings are documented here: 5 | # 6 | # http://doc.scrapy.org/en/latest/topics/settings.html 7 | # 8 | 9 | import sys 10 | import os 11 | from os.path import dirname 12 | path = dirname(dirname(os.path.abspath(os.path.dirname(__file__)))) 13 | sys.path.append(path) 14 | from misc.log import * 15 | 16 | BOT_NAME = 'general_spider' 17 | 18 | SPIDER_MODULES = ['general_spider.spiders'] 19 | NEWSPIDER_MODULE = 'general_spider.spiders' 20 | 21 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 22 | #USER_AGENT = 'general_spider (+http://www.yourdomain.com)' 23 | 24 | DOWNLOADER_MIDDLEWARES = { 25 | # 'misc.middleware.CustomHttpProxyMiddleware': 400, 26 | 'misc.middleware.CustomUserAgentMiddleware': 401, 27 | } 28 | 29 | ITEM_PIPELINES = { 30 | 'general_spider.pipelines.JsonWithEncodingPipeline': 300, 31 | #'general_spider.pipelines.RedisPipeline': 301, 32 | } 33 | 34 | LOG_LEVEL = 'INFO' 35 | 36 | DOWNLOAD_DELAY = 1 37 | -------------------------------------------------------------------------------- /general_spider/general_spider/spiders/BasicSpiderConfig.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | class ExRule: 4 | allowed_rule_regex = '' 5 | # list_css_rules 6 | paras = {} 7 | 8 | def __init__(self, allowed_rule_regex, **kwargs): 9 | self.allowed_rule_regex = allowed_rule_regex 10 | self.paras = kwargs 11 | 12 | 13 | class BasicConfig: 14 | name='' 15 | allowed_domains=[] 16 | # allowed_url_regex=[] 17 | start_urls=[] 18 | ex_rules = [] 19 | 20 | 21 | -------------------------------------------------------------------------------- /general_spider/general_spider/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 
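# BasicSpiderConfig.py above is the plug-in point for the "general" spider: a
# config module exposes a Config class whose ex_rules pair a URL regex with
# the CSS rules to apply on matching pages, and spider.py below loads that
# module by name (run.sh below passes it via `-a conf_module=...`). The
# scrapy_examples.py and v2ex.py modules below are the bundled configs; a
# hypothetical extra one -- site, selectors and names here are invented purely
# for illustration -- would have the same shape:
from BasicSpiderConfig import ExRule

class Config:

    list_css_rules = {
        '.news-item': {                    # container selector (assumed)
            'title': 'a.title::text',      # field -> selector (assumed)
            'url': 'a.title::attr(href)',
        }
    }

    ex_rule = ExRule('https://example.com/$', list_css_rules=list_css_rules)

    name = 'example'
    allowed_domains = ['example.com']
    start_urls = ['https://example.com/']
    ex_rules = [ex_rule]

# saved as e.g. spiders/example.py, it would be run like the bundled configs:
#   scrapy crawl general_spider -a conf_module=example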
5 | -------------------------------------------------------------------------------- /general_spider/general_spider/spiders/run.sh: -------------------------------------------------------------------------------- 1 | conf=${1:-scrapy_examples} 2 | scrapy crawl general_spider -a conf_module=$conf 3 | -------------------------------------------------------------------------------- /general_spider/general_spider/spiders/scrapy_examples.py: -------------------------------------------------------------------------------- 1 | 2 | from BasicSpiderConfig import ExRule 3 | 4 | class Config: 5 | 6 | list_css_rules = { 7 | '.js-navigation-item': { 8 | 'content': '.content a::text', 9 | 'message': '.message a::text', 10 | 'age': '.age *::text', 11 | } 12 | } 13 | 14 | ex_rule = ExRule('https://github.com/geekan/scrapy-examples$', list_css_rules=list_css_rules) 15 | 16 | name='scrapy_examples' 17 | allowed_domains=['github.com'] 18 | start_urls=['https://github.com/geekan/scrapy-examples'] 19 | ex_rules = [ex_rule] 20 | 21 | -------------------------------------------------------------------------------- /general_spider/general_spider/spiders/spider.py: -------------------------------------------------------------------------------- 1 | import re 2 | import json 3 | from urlparse import urlparse 4 | import urllib 5 | import pdb 6 | 7 | 8 | from scrapy.selector import Selector 9 | try: 10 | from scrapy.spiders import Spider 11 | except: 12 | from scrapy.spiders import BaseSpider as Spider 13 | from scrapy.utils.response import get_base_url 14 | from scrapy.spiders import CrawlSpider, Rule 15 | from scrapy.linkextractors import LinkExtractor as sle 16 | 17 | 18 | from general_spider.items import * 19 | from misc.log import * 20 | from misc.spider import CommonSpider 21 | 22 | 23 | import BasicSpiderConfig 24 | 25 | 26 | class general_spiderSpider(CommonSpider): 27 | name = 'general_spider' 28 | 29 | def __init__(self, conf_module='TestSpiderConfig', *args, **kwargs): 30 | cm = __import__(conf_module, globals=globals()) 31 | conf = cm.Config() 32 | self.name = conf.name 33 | self.allowed_domains = conf.allowed_domains 34 | self.start_urls = conf.start_urls 35 | self.rules = [Rule(sle(allow=(c.allowed_rule_regex)), callback='parse_1', cb_kwargs=c.paras, follow=True) for c in conf.ex_rules] 36 | info(self.start_urls) 37 | info(self.rules) 38 | super(general_spiderSpider, self).__init__(*args, **kwargs) 39 | 40 | def parse_1(self, response, list_css_rules): 41 | info('---------------------') 42 | info('Parse '+response.url) 43 | info('list_css_rules:') 44 | info(list_css_rules) 45 | x = self.parse_with_rules(response, list_css_rules, dict) 46 | # x = self.parse_with_rules(response, self.content_css_rules, dict) 47 | print(json.dumps(x, ensure_ascii=False, indent=2)) 48 | # pp.pprint(x) 49 | # return self.parse_with_rules(response, self.css_rules, general_spiderItem) 50 | return x 51 | -------------------------------------------------------------------------------- /general_spider/general_spider/spiders/v2ex.py: -------------------------------------------------------------------------------- 1 | 2 | from BasicSpiderConfig import ExRule 3 | 4 | class Config: 5 | 6 | list_css_rules = { 7 | '.cell.item': { 8 | 'title': '.item_title a::text', 9 | 'node': '.node::text', 10 | 'author': '.node+ strong a::text', 11 | 'reply_count': '.count_livid::text' 12 | } 13 | } 14 | 15 | ex_rule = ExRule('http://www.v2ex.com/$', list_css_rules=list_css_rules) 16 | 17 | name='v2ex' 18 | allowed_domains=['www.v2ex.com'] 19 | 
start_urls=['http://www.v2ex.com/'] 20 | ex_rules = [ex_rule] 21 | 22 | -------------------------------------------------------------------------------- /general_spider/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # http://doc.scrapy.org/en/latest/topics/scrapyd.html 5 | 6 | [settings] 7 | default = general_spider.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = general_spider 12 | -------------------------------------------------------------------------------- /github_trending/github_trending/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/geekan/scrapy-examples/edb1cb116bd6def65a6ef01f953b58eb43e54305/github_trending/github_trending/__init__.py -------------------------------------------------------------------------------- /github_trending/github_trending/items.py: -------------------------------------------------------------------------------- 1 | # Define here the models for your scraped items 2 | # 3 | # See documentation in: 4 | # http://doc.scrapy.org/en/latest/topics/items.html 5 | 6 | from scrapy.item import Item, Field 7 | 8 | class github_trendingItem(Item): 9 | # define the fields for your item here like: 10 | name = Field() 11 | 12 | -------------------------------------------------------------------------------- /github_trending/github_trending/pipelines.py: -------------------------------------------------------------------------------- 1 | # Define your item pipelines here 2 | # 3 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 4 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html 5 | 6 | import redis 7 | 8 | 9 | from scrapy import signals 10 | 11 | 12 | import json 13 | import codecs 14 | from collections import OrderedDict 15 | 16 | 17 | class JsonWithEncodingPipeline(object): 18 | 19 | def __init__(self): 20 | self.file = codecs.open('data_utf8.json', 'w', encoding='utf-8') 21 | 22 | def process_item(self, item, spider): 23 | line = json.dumps(OrderedDict(item), ensure_ascii=False, sort_keys=False) + "\n" 24 | self.file.write(line) 25 | return item 26 | 27 | def close_spider(self, spider): 28 | self.file.close() 29 | 30 | 31 | class RedisPipeline(object): 32 | 33 | def __init__(self): 34 | self.r = redis.StrictRedis(host='localhost', port=6379) 35 | 36 | def process_item(self, item, spider): 37 | if not item['id']: 38 | print 'no id item!!' 39 | 40 | str_recorded_item = self.r.get(item['id']) 41 | final_item = None 42 | if str_recorded_item is None: 43 | final_item = item 44 | else: 45 | ritem = eval(self.r.get(item['id'])) 46 | final_item = dict(item.items() + ritem.items()) 47 | self.r.set(item['id'], final_item) 48 | 49 | def close_spider(self, spider): 50 | return 51 | -------------------------------------------------------------------------------- /github_trending/github_trending/settings.py: -------------------------------------------------------------------------------- 1 | # Scrapy settings for github_trending project 2 | # 3 | # For simplicity, this file contains only the most important settings by 4 | # default. 
All the other settings are documented here: 5 | # 6 | # http://doc.scrapy.org/en/latest/topics/settings.html 7 | # 8 | 9 | import sys 10 | import os 11 | from os.path import dirname 12 | path = dirname(dirname(os.path.abspath(os.path.dirname(__file__)))) 13 | sys.path.append(path) 14 | from misc.log import * 15 | 16 | BOT_NAME = 'github_trending' 17 | 18 | SPIDER_MODULES = ['github_trending.spiders'] 19 | NEWSPIDER_MODULE = 'github_trending.spiders' 20 | 21 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 22 | #USER_AGENT = 'github_trending (+http://www.yourdomain.com)' 23 | 24 | DOWNLOADER_MIDDLEWARES = { 25 | # 'misc.middleware.CustomHttpProxyMiddleware': 400, 26 | 'misc.middleware.CustomUserAgentMiddleware': 401, 27 | } 28 | 29 | ITEM_PIPELINES = { 30 | 'github_trending.pipelines.JsonWithEncodingPipeline': 300, 31 | #'github_trending.pipelines.RedisPipeline': 301, 32 | } 33 | 34 | LOG_LEVEL = 'INFO' 35 | 36 | DOWNLOAD_DELAY = 1 37 | -------------------------------------------------------------------------------- /github_trending/github_trending/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /github_trending/github_trending/spiders/spider.py: -------------------------------------------------------------------------------- 1 | import re 2 | import json 3 | from urlparse import urlparse 4 | import urllib 5 | import pdb 6 | 7 | 8 | from scrapy.selector import Selector 9 | try: 10 | from scrapy.spiders import Spider 11 | except: 12 | from scrapy.spiders import BaseSpider as Spider 13 | from scrapy.utils.response import get_base_url 14 | from scrapy.spiders import CrawlSpider, Rule 15 | from scrapy.linkextractors import LinkExtractor as sle 16 | 17 | 18 | from github_trending.items import * 19 | from misc.log import * 20 | from misc.spider import CommonSpider 21 | 22 | 23 | class github_trendingSpider(CommonSpider): 24 | name = "github_trending" 25 | allowed_domains = ["github.com"] 26 | start_urls = [ 27 | "http://www.github.com/trending", 28 | ] 29 | rules = [ 30 | Rule(sle(allow=("/trending$")), callback='parse_1', follow=True), 31 | ] 32 | 33 | list_css_rules = { 34 | '.repo-list-item': { 35 | 'repo_name': '.repo-list-name a::attr(href)', 36 | 'repo_meta': '.repo-list-meta::text', 37 | } 38 | } 39 | 40 | content_css_rules = { 41 | 'text': '#Cnt-Main-Article-QQ p *::text', 42 | 'images': '#Cnt-Main-Article-QQ img::attr(src)', 43 | 'images-desc': '#Cnt-Main-Article-QQ div p+ p::text', 44 | } 45 | 46 | def parse_1(self, response): 47 | info('Parse '+response.url) 48 | x = self.parse_with_rules(response, self.list_css_rules, dict) 49 | # x = self.parse_with_rules(response, self.content_css_rules, dict) 50 | print(json.dumps(x, ensure_ascii=False, indent=2)) 51 | # pp.pprint(x) 52 | # return self.parse_with_rules(response, self.css_rules, github_trendingItem) 53 | -------------------------------------------------------------------------------- /github_trending/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # http://doc.scrapy.org/en/latest/topics/scrapyd.html 5 | 6 | [settings] 7 | 
default = github_trending.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = github_trending 12 | -------------------------------------------------------------------------------- /googlescholar/googlescholar/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/geekan/scrapy-examples/edb1cb116bd6def65a6ef01f953b58eb43e54305/googlescholar/googlescholar/__init__.py -------------------------------------------------------------------------------- /googlescholar/googlescholar/items.py: -------------------------------------------------------------------------------- 1 | # Define here the models for your scraped items 2 | # 3 | # See documentation in: 4 | # http://doc.scrapy.org/en/latest/topics/items.html 5 | 6 | from scrapy.item import Item, Field 7 | 8 | class googlescholarItem(Item): 9 | # define the fields for your item here like: 10 | name = Field() 11 | 12 | -------------------------------------------------------------------------------- /googlescholar/googlescholar/pipelines.py: -------------------------------------------------------------------------------- 1 | # Define your item pipelines here 2 | # 3 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 4 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html 5 | 6 | import redis 7 | 8 | 9 | from scrapy import signals 10 | 11 | 12 | import json 13 | import codecs 14 | from collections import OrderedDict 15 | 16 | 17 | class JsonWithEncodingPipeline(object): 18 | 19 | def __init__(self): 20 | self.file = codecs.open('data_utf8.json', 'w', encoding='utf-8') 21 | 22 | def process_item(self, item, spider): 23 | line = json.dumps(OrderedDict(item), ensure_ascii=False, sort_keys=False) + "\n" 24 | self.file.write(line) 25 | return item 26 | 27 | def close_spider(self, spider): 28 | self.file.close() 29 | 30 | 31 | class RedisPipeline(object): 32 | 33 | def __init__(self): 34 | self.r = redis.StrictRedis(host='localhost', port=6379) 35 | 36 | def process_item(self, item, spider): 37 | if not item['id']: 38 | print 'no id item!!' 39 | 40 | str_recorded_item = self.r.get(item['id']) 41 | final_item = None 42 | if str_recorded_item is None: 43 | final_item = item 44 | else: 45 | ritem = eval(self.r.get(item['id'])) 46 | final_item = dict(item.items() + ritem.items()) 47 | self.r.set(item['id'], final_item) 48 | 49 | def close_spider(self, spider): 50 | return 51 | -------------------------------------------------------------------------------- /googlescholar/googlescholar/settings.py: -------------------------------------------------------------------------------- 1 | # Scrapy settings for googlescholar project 2 | # 3 | # For simplicity, this file contains only the most important settings by 4 | # default. 
All the other settings are documented here: 5 | # 6 | # http://doc.scrapy.org/en/latest/topics/settings.html 7 | # 8 | 9 | import sys 10 | import os 11 | from os.path import dirname 12 | path = dirname(dirname(os.path.abspath(os.path.dirname(__file__)))) 13 | sys.path.append(path) 14 | from misc.log import * 15 | 16 | BOT_NAME = 'googlescholar' 17 | 18 | SPIDER_MODULES = ['googlescholar.spiders'] 19 | NEWSPIDER_MODULE = 'googlescholar.spiders' 20 | 21 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 22 | #USER_AGENT = 'googlescholar (+http://www.yourdomain.com)' 23 | 24 | DOWNLOADER_MIDDLEWARES = { 25 | #'misc.middleware.CustomHttpProxyMiddleware': 400, 26 | 'misc.middleware.CustomUserAgentMiddleware': 401, 27 | } 28 | 29 | ITEM_PIPELINES = { 30 | 'googlescholar.pipelines.JsonWithEncodingPipeline': 300, 31 | #'googlescholar.pipelines.RedisPipeline': 301, 32 | } 33 | 34 | LOG_LEVEL = 'INFO' 35 | 36 | DOWNLOAD_DELAY = 1 37 | -------------------------------------------------------------------------------- /googlescholar/googlescholar/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /googlescholar/googlescholar/spiders/spider.py: -------------------------------------------------------------------------------- 1 | import re 2 | import json 3 | from urlparse import urlparse 4 | import urllib 5 | import pdb 6 | 7 | 8 | from scrapy.selector import Selector 9 | try: 10 | from scrapy.spiders import Spider 11 | except: 12 | from scrapy.spiders import BaseSpider as Spider 13 | from scrapy.utils.response import get_base_url 14 | from scrapy.spiders import CrawlSpider, Rule 15 | from scrapy.linkextractors import LinkExtractor as sle 16 | 17 | 18 | from googlescholar.items import * 19 | from misc.log import * 20 | from misc.spider import CommonSpider 21 | 22 | 23 | class googlescholarSpider(CommonSpider): 24 | name = "googlescholar" 25 | allowed_domains = ["google.com"] 26 | start_urls = [ 27 | "http://scholar.google.com/scholar?as_ylo=2011&q=machine+learning&hl=en&as_sdt=0,5", 28 | #"http://scholar.google.com/scholar?q=estimate+ctr&btnG=&hl=en&as_sdt=0%2C5&as_ylo=2011", 29 | #"http://scholar.google.com", 30 | ] 31 | rules = [ 32 | Rule(sle(allow=("scholar\?.*")), callback='parse_1', follow=False), 33 | ] 34 | 35 | def __init__(self, start_url='', *args, **kwargs): 36 | if start_url: 37 | self.start_urls = [start_url] 38 | super(googlescholarSpider, self).__init__(*args, **kwargs) 39 | 40 | #.gs_ri: content besides related html/pdf 41 | list_css_rules = { 42 | '.gs_r': { 43 | 'title': '.gs_rt a *::text', 44 | 'url': '.gs_rt a::attr(href)', 45 | 'related-text': '.gs_ggsS::text', 46 | 'related-type': '.gs_ggsS .gs_ctg2::text', 47 | 'related-url': '.gs_ggs a::attr(href)', 48 | 'citation-text': '.gs_fl > a:nth-child(1)::text', 49 | 'citation-url': '.gs_fl > a:nth-child(1)::attr(href)', 50 | 'authors': '.gs_a a::text', 51 | 'description': '.gs_rs *::text', 52 | 'journal-year-src': '.gs_a::text', 53 | } 54 | } 55 | 56 | def parse_1(self, response): 57 | info('Parse '+response.url) 58 | #sel = Selector(response) 59 | #v = sel.css('.gs_ggs a::attr(href)').extract() 60 | #import pdb; pdb.set_trace() 61 | x = self.parse_with_rules(response, self.list_css_rules, dict) 62 | 
pp.pprint(x[0]['.gs_r']) 63 | # return self.parse_with_rules(response, self.css_rules, googlescholarItem) 64 | -------------------------------------------------------------------------------- /googlescholar/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # http://doc.scrapy.org/en/latest/topics/scrapyd.html 5 | 6 | [settings] 7 | default = googlescholar.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = googlescholar 12 | -------------------------------------------------------------------------------- /hacker_news/hacker_news/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/geekan/scrapy-examples/edb1cb116bd6def65a6ef01f953b58eb43e54305/hacker_news/hacker_news/__init__.py -------------------------------------------------------------------------------- /hacker_news/hacker_news/items.py: -------------------------------------------------------------------------------- 1 | # Define here the models for your scraped items 2 | # 3 | # See documentation in: 4 | # http://doc.scrapy.org/en/latest/topics/items.html 5 | 6 | from scrapy.item import Item, Field 7 | 8 | class hacker_newsItem(Item): 9 | # define the fields for your item here like: 10 | name = Field() 11 | 12 | -------------------------------------------------------------------------------- /hacker_news/hacker_news/pipelines.py: -------------------------------------------------------------------------------- 1 | # Define your item pipelines here 2 | # 3 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 4 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html 5 | 6 | import redis 7 | 8 | 9 | from scrapy import signals 10 | 11 | 12 | import json 13 | import codecs 14 | from collections import OrderedDict 15 | 16 | 17 | class JsonWithEncodingPipeline(object): 18 | 19 | def __init__(self): 20 | self.file = codecs.open('data_utf8.json', 'w', encoding='utf-8') 21 | 22 | def process_item(self, item, spider): 23 | line = json.dumps(OrderedDict(item), ensure_ascii=False, sort_keys=False) + "\n" 24 | self.file.write(line) 25 | return item 26 | 27 | def close_spider(self, spider): 28 | self.file.close() 29 | 30 | 31 | class RedisPipeline(object): 32 | 33 | def __init__(self): 34 | self.r = redis.StrictRedis(host='localhost', port=6379) 35 | 36 | def process_item(self, item, spider): 37 | if not item['id']: 38 | print 'no id item!!' 39 | 40 | str_recorded_item = self.r.get(item['id']) 41 | final_item = None 42 | if str_recorded_item is None: 43 | final_item = item 44 | else: 45 | ritem = eval(self.r.get(item['id'])) 46 | final_item = dict(item.items() + ritem.items()) 47 | self.r.set(item['id'], final_item) 48 | 49 | def close_spider(self, spider): 50 | return 51 | -------------------------------------------------------------------------------- /hacker_news/hacker_news/settings.py: -------------------------------------------------------------------------------- 1 | # Scrapy settings for hacker_news project 2 | # 3 | # For simplicity, this file contains only the most important settings by 4 | # default. 
All the other settings are documented here: 5 | # 6 | # http://doc.scrapy.org/en/latest/topics/settings.html 7 | # 8 | 9 | import sys 10 | import os 11 | from os.path import dirname 12 | path = dirname(dirname(os.path.abspath(os.path.dirname(__file__)))) 13 | sys.path.append(path) 14 | from misc.log import * 15 | 16 | BOT_NAME = 'hacker_news' 17 | 18 | SPIDER_MODULES = ['hacker_news.spiders'] 19 | NEWSPIDER_MODULE = 'hacker_news.spiders' 20 | 21 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 22 | #USER_AGENT = 'hacker_news (+http://www.yourdomain.com)' 23 | 24 | DOWNLOADER_MIDDLEWARES = { 25 | # 'misc.middleware.CustomHttpProxyMiddleware': 400, 26 | 'misc.middleware.CustomUserAgentMiddleware': 401, 27 | } 28 | 29 | ITEM_PIPELINES = { 30 | 'hacker_news.pipelines.JsonWithEncodingPipeline': 300, 31 | #'hacker_news.pipelines.RedisPipeline': 301, 32 | } 33 | 34 | LOG_LEVEL = 'INFO' 35 | 36 | DOWNLOAD_DELAY = 1 37 | -------------------------------------------------------------------------------- /hacker_news/hacker_news/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /hacker_news/hacker_news/spiders/spider.py: -------------------------------------------------------------------------------- 1 | import re 2 | import json 3 | from urlparse import urlparse 4 | import urllib 5 | import pdb 6 | 7 | 8 | from scrapy.selector import Selector 9 | try: 10 | from scrapy.spiders import Spider 11 | except: 12 | from scrapy.spiders import BaseSpider as Spider 13 | from scrapy.utils.response import get_base_url 14 | from scrapy.spiders import CrawlSpider, Rule 15 | from scrapy.linkextractors import LinkExtractor as sle 16 | 17 | 18 | from hacker_news.items import * 19 | from misc.log import * 20 | from misc.spider import CommonSpider 21 | 22 | 23 | class hacker_newsSpider(CommonSpider): 24 | name = "hacker_news" 25 | allowed_domains = ["news.ycombinator.com"] 26 | start_urls = [ 27 | "https://news.ycombinator.com/", 28 | ] 29 | rules = [ 30 | Rule(sle(allow=("https://news.ycombinator.com/$")), callback='parse_1', follow=True), 31 | ] 32 | 33 | list_css_rules = { 34 | 'title': '.storylink::text', 35 | 'desc': '.subtext .score::text', 36 | } 37 | 38 | content_css_rules = { 39 | 'text': '#Cnt-Main-Article-QQ p *::text', 40 | 'images': '#Cnt-Main-Article-QQ img::attr(src)', 41 | 'images-desc': '#Cnt-Main-Article-QQ div p+ p::text', 42 | } 43 | 44 | def parse_1(self, response): 45 | info('Parse '+response.url) 46 | x = self.parse_with_rules(response, self.list_css_rules, dict) 47 | # x = self.parse_with_rules(response, self.content_css_rules, dict) 48 | print(json.dumps(x, ensure_ascii=False, indent=2)) 49 | # pp.pprint(x) 50 | # return self.parse_with_rules(response, self.css_rules, hacker_newsItem) 51 | -------------------------------------------------------------------------------- /hacker_news/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # http://doc.scrapy.org/en/latest/topics/scrapyd.html 5 | 6 | [settings] 7 | default = hacker_news.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | 
project = hacker_news 12 | -------------------------------------------------------------------------------- /hrtencent/hrtencent/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/geekan/scrapy-examples/edb1cb116bd6def65a6ef01f953b58eb43e54305/hrtencent/hrtencent/__init__.py -------------------------------------------------------------------------------- /hrtencent/hrtencent/data_utf8.json: -------------------------------------------------------------------------------- 1 | {"bottomline": ["深圳", "设计类", "1人"], "sharetitle": ["SD3-互娱2D原画设计师(深圳)"], "link": "http://hr.tencent.com/position_detail.php?id=16300&keywords=&lid=0&tid=0"} 2 | {"bottomline": ["深圳", "技术类", "2人"], "sharetitle": ["SD9-移动终端游戏开发工程师(深圳)"], "link": "http://hr.tencent.com/position_detail.php?id=16709&keywords=&lid=0&tid=0"} 3 | {"bottomline": ["深圳", "产品/项目类", "1人"], "sharetitle": ["MIG12-高级数据分析经理(深圳)"], "link": "http://hr.tencent.com/position_detail.php?id=12799&keywords=&lid=0&tid=0"} 4 | {"bottomline": ["深圳", "技术类", "5人"], "sharetitle": ["SNG03-QQ iOS开发工程师(深圳)"], "link": "http://hr.tencent.com/position_detail.php?id=16602&keywords=&lid=0&tid=0"} 5 | {"bottomline": ["深圳", "技术类", "4人"], "sharetitle": ["TEG07-PHP开发工程师(深圳)"], "link": "http://hr.tencent.com/position_detail.php?id=16714&keywords=&lid=0&tid=0"} 6 | {"bottomline": ["上海", "技术类", "2人"], "sharetitle": ["SNG15-广告业务Java开发工程师(上海)"], "link": "http://hr.tencent.com/position_detail.php?id=16639&keywords=&lid=0&tid=0"} 7 | {"bottomline": ["深圳", "技术类", "1人"], "sharetitle": ["MIG12-技术运营工程师(深圳)"], "link": "http://hr.tencent.com/position_detail.php?id=16458&keywords=&lid=0&tid=0"} 8 | {"bottomline": ["深圳", "技术类", "9人"], "sharetitle": ["14413-android终端开发工程师(深圳)"], "link": "http://hr.tencent.com/position_detail.php?id=16243&keywords=&lid=0&tid=0"} 9 | {"bottomline": ["深圳", "职能类", "4人"], "sharetitle": ["S2-财经系统需求分析师(深圳)"], "link": "http://hr.tencent.com/position_detail.php?id=16582&keywords=&lid=0&tid=0"} 10 | {"bottomline": ["深圳", "技术类", "4人"], "sharetitle": ["14413-Android终端开发工程师(深圳)"], "link": "http://hr.tencent.com/position_detail.php?id=13424&keywords=&lid=0&tid=0"} 11 | {"bottomline": ["北京", "内容编辑类", "2人"], "sharetitle": ["OMG10-腾讯综艺视频编辑(北京)"], "link": "http://hr.tencent.com/position_detail.php?id=15775&keywords=&lid=0&tid=0"} 12 | {"bottomline": ["北京", "内容编辑类", "1人"], "sharetitle": ["OMG10-腾讯视频综艺运营编辑(北京)"], "link": "http://hr.tencent.com/position_detail.php?id=14937&keywords=&lid=0&tid=0"} 13 | {"bottomline": ["深圳", "产品/项目类", "6人"], "sharetitle": ["13552-Global Operation Talents (shenzhen)"], "link": "http://hr.tencent.com/position_detail.php?id=15974&keywords=&lid=0&tid=0"} 14 | {"bottomline": ["上海", "设计类", "2人"], "sharetitle": ["SD8-3D美术设计师(上海)"], "link": "http://hr.tencent.com/position_detail.php?id=16730&keywords=&lid=0&tid=0"} 15 | {"bottomline": ["上海", "产品/项目类", "2人"], "sharetitle": ["SD8-移动终端手游策划(上海)"], "link": "http://hr.tencent.com/position_detail.php?id=13299&keywords=&lid=0&tid=0"} 16 | {"bottomline": ["北京", "设计类", "1人"], "sharetitle": ["SNG10-交互设计师(北京)"], "link": "http://hr.tencent.com/position_detail.php?id=16305&keywords=&lid=0&tid=0"} 17 | {"bottomline": ["成都", "技术类", "1人"], "sharetitle": ["SD2-后台开发工程师(成都)"], "link": "http://hr.tencent.com/position_detail.php?id=12133&keywords=&lid=0&tid=0"} 18 | {"bottomline": ["北京", "市场类", "1人"], "sharetitle": ["SNG15-市场活动策划与执行(北京)"], "link": "http://hr.tencent.com/position_detail.php?id=16266&keywords=&lid=0&tid=0"} 19 | {"bottomline": ["深圳", 
"技术类", "1人"], "sharetitle": ["SD9-互娱后台开发工程师(深圳)"], "link": "http://hr.tencent.com/position_detail.php?id=16708&keywords=&lid=0&tid=0"} 20 | {"bottomline": ["深圳", "技术类", "1人"], "sharetitle": ["SD5-后台开发工程师(深圳)"], "link": "http://hr.tencent.com/position_detail.php?id=14854&keywords=&lid=0&tid=0"} 21 | {"bottomline": ["深圳", "产品/项目类", "14人"], "sharetitle": ["13551-移动游戏数据分析与商业化运营策划(深圳)"], "link": "http://hr.tencent.com/position_detail.php?id=14982&keywords=&lid=0&tid=0"} 22 | -------------------------------------------------------------------------------- /hrtencent/hrtencent/items.py: -------------------------------------------------------------------------------- 1 | # Define here the models for your scraped items 2 | # 3 | # See documentation in: 4 | # http://doc.scrapy.org/en/latest/topics/items.html 5 | 6 | from scrapy.item import Item, Field 7 | 8 | class PositionDetailItem(Item): 9 | title = Field() 10 | link = Field() 11 | sharetitle = Field() 12 | bottomline = Field() 13 | duty = Field() 14 | xxx = Field() 15 | -------------------------------------------------------------------------------- /hrtencent/hrtencent/pipelines.py: -------------------------------------------------------------------------------- 1 | # Define your item pipelines here 2 | # 3 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 4 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html 5 | 6 | from scrapy import signals 7 | 8 | 9 | import json 10 | import codecs 11 | 12 | 13 | class JsonWithEncodingPipeline(object): 14 | 15 | def __init__(self): 16 | self.file = codecs.open('data_utf8.json', 'w', encoding='utf-8') 17 | 18 | def process_item(self, item, spider): 19 | line = json.dumps(dict(item), ensure_ascii=False) + "\n" 20 | self.file.write(line) 21 | return item 22 | 23 | def close_spider(self, spider): 24 | self.file.close() 25 | -------------------------------------------------------------------------------- /hrtencent/hrtencent/settings.py: -------------------------------------------------------------------------------- 1 | # Scrapy settings for hrtencent project 2 | # 3 | # For simplicity, this file contains only the most important settings by 4 | # default. All the other settings are documented here: 5 | # 6 | # http://doc.scrapy.org/en/latest/topics/settings.html 7 | # 8 | 9 | import sys 10 | import os 11 | from os.path import dirname 12 | path = dirname(dirname(os.path.abspath(os.path.dirname(__file__)))) 13 | sys.path.append(path) 14 | 15 | BOT_NAME = 'hrtencent' 16 | 17 | SPIDER_MODULES = ['hrtencent.spiders'] 18 | NEWSPIDER_MODULE = 'hrtencent.spiders' 19 | 20 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 21 | #USER_AGENT = 'hrtencent (+http://www.yourdomain.com)' 22 | DOWNLOADER_MIDDLEWARES = { 23 | #'misc.middleware.CustomHttpProxyMiddleware': 400, 24 | 'misc.middleware.CustomUserAgentMiddleware': 401, 25 | } 26 | 27 | ITEM_PIPELINES = { 28 | 'hrtencent.pipelines.JsonWithEncodingPipeline': 300, 29 | } 30 | 31 | LOG_LEVEL = 'INFO' 32 | 33 | -------------------------------------------------------------------------------- /hrtencent/hrtencent/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 
5 | -------------------------------------------------------------------------------- /hrtencent/hrtencent/spiders/hrtencent_spider.py: -------------------------------------------------------------------------------- 1 | import re 2 | import json 3 | 4 | 5 | from scrapy.selector import Selector 6 | try: 7 | from scrapy.spiders import Spider 8 | except: 9 | from scrapy.spiders import BaseSpider as Spider 10 | from scrapy.utils.response import get_base_url 11 | from scrapy.spiders import CrawlSpider, Rule 12 | from scrapy.linkextractors import LinkExtractor as sle 13 | 14 | 15 | from hrtencent.items import * 16 | from misc.log import * 17 | 18 | 19 | class HrtencentSpider(CrawlSpider): 20 | name = "hrtencent" 21 | allowed_domains = ["tencent.com"] 22 | start_urls = [ 23 | "http://hr.tencent.com/position.php?start=%d" % d for d in range(0, 20, 10) 24 | ] 25 | rules = [ 26 | Rule(sle(allow=("/position_detail.php\?id=\d*.*", )), callback='parse_2'), 27 | Rule(sle(allow=("/position.php\?&start=\d{,2}#a")), follow=True, callback='parse_1') 28 | ] 29 | 30 | def parse_2(self, response): 31 | items = [] 32 | sel = Selector(response) 33 | sites = sel.css('.tablelist') 34 | for site in sites: 35 | item = PositionDetailItem() 36 | item['sharetitle'] = site.css('.h #sharetitle::text').extract() 37 | item['bottomline'] = site.css('.bottomline td::text').extract() 38 | # item['duty'] = site.css('.c .l2::text').extract() 39 | item['link'] = response.url 40 | items.append(item) 41 | print repr(item).decode("unicode-escape") + '\n' 42 | # info('parsed ' + str(response)) 43 | self.parse_1(response) 44 | return items 45 | 46 | def parse_1(self, response): 47 | # url cannot encode to Chinese easily.. XXX 48 | info('parsed ' + str(response)) 49 | 50 | def _process_request(self, request): 51 | info('process ' + str(request)) 52 | return request 53 | -------------------------------------------------------------------------------- /hrtencent/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # http://doc.scrapy.org/en/latest/topics/scrapyd.html 5 | 6 | [settings] 7 | default = hrtencent.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = hrtencent 12 | -------------------------------------------------------------------------------- /linkedin/README.md: -------------------------------------------------------------------------------- 1 | scrapy-linkedin 2 | =============== 3 | 4 | Using Scrapy to get Linkedin's person public profile. 5 | 6 | ### feature 7 | * Get all **public** profile 8 | * Using Scrapy 9 | * Enable auto throttle 10 | * Enable naive proxy providing 11 | * Agent rotating 12 | * Support Unicode 13 | * Using MongoDB as Backend 14 | * ... 15 | 16 | 17 | ### Dependency 18 | * Scrapy == 0.20 19 | * pymongo 20 | * BeautifulSoup4, UnicodeDammit 21 | 22 | 23 | ### usage 24 | 1. start a MongoDB instance, `mongod` 25 | 2. run the crawler, `scrapy crawl LinkedinSpider` 26 | 27 | you may found `Rakefile` helpful. 28 | 29 | 30 | ### configuration 31 | you can change MongoDB setting ang other things in `settings.py`. 32 | 33 | ### note 34 | if you just need whatever public profiles, there are better ways to do it. 35 | check out these urls: http://www.linkedin.com/directory/people/[a-z].html 36 | 37 | Our strategy is following `also-view` links in public profile. 
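For illustration only (this snippet is not part of the original project), those directory seed pages can be generated in a couple of lines:

    import string
    seed_urls = ['http://www.linkedin.com/directory/people/%s.html' % c
                 for c in string.ascii_lowercase]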
38 | 39 | ### One more thing 40 | This is a toy project a few years ago. Now I won't maintain it anymore, questions about this project will be ignored. You can read the code, there isn't much. 41 | I hope this project can help you get a basic understanding of Scrapy, then you can make your own Spider. 42 | -------------------------------------------------------------------------------- /linkedin/doc/db-scheme.md: -------------------------------------------------------------------------------- 1 | Mongodb Scheme(**draft**) 2 | 3 | --- 4 | 5 | **PersonProfile** 6 | 7 | PersonProfile 8 | { 9 | linkedin_id:'id', 10 | locality:'beijing', 11 | industry:'Research', 12 | summary:'I am a professor…', 13 | 14 | skills: 15 | [ 16 | 'data mining', 17 | 'machine learning' 18 | ], 19 | 20 | specilities: 21 | [ 22 | 'data mining', 23 | ], 24 | 25 | interests: 26 | [ 27 | 'data mining', 28 | 'machine learning' 29 | ], 30 | 31 | groups: 32 | { 33 | 'member', 34 | 'affiliation': 35 | [ 36 | 'kdd 2012' 37 | ] 38 | } 39 | 40 | honors: 41 | [ 42 | 'first prize', 43 | ], 44 | 45 | education: 46 | [ 47 | { 48 | school_name: 'a', 49 | period: '1991-2012', 50 | desc:'topic model' 51 | }, 52 | ], 53 | 54 | experience: 55 | [ 56 | { 57 | title:'associate professor', 58 | organization:'tsinghua', 59 | period:'1999-2000', 60 | description:'research about data mining', 61 | }, 62 | ], 63 | 64 | also_view: 65 | [ 66 | { 67 | 'linkedin_id':'asd', 68 | 'url':'http', 69 | } 70 | ], 71 | 72 | } -------------------------------------------------------------------------------- /linkedin/linkedin/linkedin/Rakefile: -------------------------------------------------------------------------------- 1 | task :start do 2 | system 'scrapy crawl LinkedinSpider' 3 | end 4 | 5 | 6 | task :start_with_resume, :job_name do |t, args| 7 | if !File.exists?("crawl_jobs") 8 | FileUtil.mkdir("crawl_jobs") 9 | end 10 | system "scrapy crawl LinkedinSpider -s JOBDIR=crawl_jobs/#{args.job_name}" 11 | end 12 | 13 | task :reload_proxy do 14 | system 'python reload_proxy.py' 15 | end 16 | -------------------------------------------------------------------------------- /linkedin/linkedin/linkedin/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/geekan/scrapy-examples/edb1cb116bd6def65a6ef01f953b58eb43e54305/linkedin/linkedin/linkedin/__init__.py -------------------------------------------------------------------------------- /linkedin/linkedin/linkedin/db.py: -------------------------------------------------------------------------------- 1 | from linkedin import settings 2 | import pymongo 3 | 4 | class MongoDBClient(object): 5 | def __init__(self, col, index=None): 6 | connection = pymongo.Connection(settings.MONGODB_SERVER, settings.MONGODB_PORT) 7 | self.db = connection[settings.MONGODB_DB] 8 | self.collection = self.db[col] 9 | if index: 10 | self.collection.create_index(index, unique=True) 11 | 12 | def get_collection(self): 13 | return self.collection 14 | 15 | def _walk(self): 16 | """ 17 | generator of all the documents in this collection 18 | """ 19 | skip = 0 20 | limit = 1000 21 | hasMore = True 22 | while hasMore: 23 | res = self.collection.find(skip=skip, limit=limit) 24 | hasMore = (res.count(with_limit_and_skip=True) == limit) 25 | for x in res: 26 | yield x 27 | skip += limit 28 | 29 | def walk(self): 30 | """ 31 | return all the documents in this collection 32 | """ 33 | docs = [] 34 | for doc in self._walk(): 35 | docs.append(doc) 36 | return docs 37 | 38 | 
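# A short usage sketch for the MongoDBClient helper above, purely
# illustrative. It assumes a MongoDB instance is running with the host, port,
# database and collection configured in linkedin/settings.py, and an old
# pymongo release -- pymongo.Connection was removed in pymongo 3.0, so recent
# versions would need MongoClient instead.
from linkedin.db import MongoDBClient

client = MongoDBClient('person_profiles')   # collection name from settings.py
collection = client.get_collection()
print(collection.count())                   # number of stored profiles
for doc in client.walk():                   # walk() pages through everything
    print(doc.get('name'))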
-------------------------------------------------------------------------------- /linkedin/linkedin/linkedin/items.py: -------------------------------------------------------------------------------- 1 | # Define here the models for your scraped items 2 | # 3 | # See documentation in: 4 | # http://doc.scrapy.org/topics/items.html 5 | 6 | from scrapy.item import Item, Field 7 | 8 | class LinkedinItem(Item): 9 | # define the fields for your item here like: 10 | # name = Field() 11 | pass 12 | 13 | 14 | class PersonProfileItem(Item): 15 | _id = Field() 16 | url = Field() 17 | name = Field() 18 | also_view = Field() 19 | education = Field() 20 | locality = Field() 21 | industry = Field() 22 | summary = Field() 23 | specilities = Field() 24 | skills = Field() 25 | interests = Field() 26 | group = Field() 27 | honors = Field() 28 | education = Field() 29 | experience = Field() 30 | overview_html = Field() 31 | homepage = Field() 32 | 33 | -------------------------------------------------------------------------------- /linkedin/linkedin/linkedin/middleware.py: -------------------------------------------------------------------------------- 1 | from scrapy import log 2 | from proxy import PROXIES 3 | from agents import AGENTS 4 | 5 | import random 6 | 7 | """ 8 | Custom proxy provider. 9 | """ 10 | class CustomHttpProxyMiddleware(object): 11 | 12 | def process_request(self, request, spider): 13 | # TODO implement complex proxy providing algorithm 14 | if self.use_proxy(request): 15 | p = random.choice(PROXIES) 16 | try: 17 | request.meta['proxy'] = "http://%s" % p['ip_port'] 18 | except Exception, e: 19 | log.msg("Exception %s" % e, _level=log.CRITICAL) 20 | 21 | 22 | def use_proxy(self, request): 23 | """ 24 | using direct download for depth <= 2 25 | using proxy with probability 0.3 26 | """ 27 | if "depth" in request.meta and int(request.meta['depth']) <= 2: 28 | return False 29 | i = random.randint(1, 10) 30 | return i <= 2 31 | 32 | 33 | """ 34 | change request header nealy every time 35 | """ 36 | class CustomUserAgentMiddleware(object): 37 | def process_request(self, request, spider): 38 | agent = random.choice(AGENTS) 39 | request.headers['User-Agent'] = agent 40 | -------------------------------------------------------------------------------- /linkedin/linkedin/linkedin/parser/LinkedinParser.py: -------------------------------------------------------------------------------- 1 | from bs4 import BeautifulSoup 2 | from urllib2 import urlparse 3 | 4 | def parse_homepage(html): 5 | soup = BeautifulSoup(html) 6 | websites = soup.find_all('dd', 'websites') 7 | if websites and len(websites) > 0: 8 | websites = websites[0] 9 | sites = websites.find_all('li') 10 | if sites and len(sites) > 0: 11 | result = {} 12 | for site in sites: 13 | site_name = site.text.strip() 14 | original = site.a.get('href') 15 | url_parse = urlparse.urlparse(original).query 16 | query_parse = urlparse.parse_qs(url_parse) 17 | if 'url' in query_parse: 18 | result[site_name] = query_parse['url'] 19 | return result 20 | return None 21 | 22 | -------------------------------------------------------------------------------- /linkedin/linkedin/linkedin/parser/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/geekan/scrapy-examples/edb1cb116bd6def65a6ef01f953b58eb43e54305/linkedin/linkedin/linkedin/parser/__init__.py -------------------------------------------------------------------------------- /linkedin/linkedin/linkedin/pipelines.py: 
-------------------------------------------------------------------------------- 1 | # Define your item pipelines here 2 | # 3 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 4 | # See: http://doc.scrapy.org/topics/item-pipeline.html 5 | from scrapy.conf import settings 6 | from scrapy import log 7 | 8 | class LinkedinPipeline(object): 9 | def process_item(self, item, spider): 10 | return item 11 | 12 | 13 | # Copyright 2011 Julien Duponchelle . 14 | # 15 | # Licensed under the Apache License, Version 2.0 (the "License"); 16 | # you may not use this file except in compliance with the License. 17 | # You may obtain a copy of the License at 18 | # 19 | # http://www.apache.org/licenses/LICENSE-2.0 20 | # 21 | # Unless required by applicable law or agreed to in writing, software 22 | # distributed under the License is distributed on an "AS IS" BASIS, 23 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 24 | # See the License for the specific language governing permissions and 25 | # limitations under the License. 26 | class MongoDBPipeline(object): 27 | def __init__(self): 28 | import pymongo 29 | connection = pymongo.Connection(settings['MONGODB_SERVER'], settings['MONGODB_PORT']) 30 | self.db = connection[settings['MONGODB_DB']] 31 | self.collection = self.db[settings['MONGODB_COLLECTION']] 32 | if self.__get_uniq_key() is not None: 33 | self.collection.create_index(self.__get_uniq_key(), unique=True) 34 | 35 | def process_item(self, item, spider): 36 | if self.__get_uniq_key() is None: 37 | self.collection.insert(dict(item)) 38 | else: 39 | self.collection.update( 40 | {self.__get_uniq_key(): item[self.__get_uniq_key()]}, 41 | dict(item), 42 | upsert=True) 43 | log.msg("Item wrote to MongoDB database %s/%s" % 44 | (settings['MONGODB_DB'], settings['MONGODB_COLLECTION']), 45 | level=log.DEBUG, spider=spider) 46 | return item 47 | 48 | def __get_uniq_key(self): 49 | if not settings['MONGODB_UNIQ_KEY'] or settings['MONGODB_UNIQ_KEY'] == "": 50 | return None 51 | return settings['MONGODB_UNIQ_KEY'] -------------------------------------------------------------------------------- /linkedin/linkedin/linkedin/settings.py: -------------------------------------------------------------------------------- 1 | # Scrapy settings for linkedin project 2 | # 3 | # For simplicity, this file contains only the most important settings by 4 | # default. 
All the other settings are documented here: 5 | # 6 | # http://doc.scrapy.org/topics/settings.html 7 | # 8 | import os 9 | 10 | BOT_NAME = 'linkedin' 11 | 12 | SPIDER_MODULES = ['linkedin.spiders'] 13 | NEWSPIDER_MODULE = 'linkedin.spiders' 14 | 15 | DOWNLOADER_MIDDLEWARES = { 16 | 'linkedin.middleware.CustomHttpProxyMiddleware': 543, 17 | 'linkedin.middleware.CustomUserAgentMiddleware': 545, 18 | } 19 | 20 | ########### Item pipeline 21 | ITEM_PIPELINES = [ 22 | "linkedin.pipelines.MongoDBPipeline", 23 | ] 24 | 25 | MONGODB_SERVER = 'localhost' 26 | MONGODB_PORT = 27017 27 | MONGODB_DB = 'scrapy' 28 | MONGODB_COLLECTION = 'person_profiles' 29 | MONGODB_UNIQ_KEY = '_id' 30 | ########### 31 | 32 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 33 | #USER_AGENT = 'linkedin (+http://www.yourdomain.com)' 34 | 35 | # Enable auto throttle 36 | AUTOTHROTTLE_ENABLED = True 37 | 38 | COOKIES_ENABLED = False 39 | 40 | # Set your own download folder 41 | DOWNLOAD_FILE_FOLDER = os.path.join(os.path.dirname(os.path.realpath(__file__)), "download_file") 42 | 43 | 44 | -------------------------------------------------------------------------------- /linkedin/linkedin/linkedin/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /linkedin/linkedin/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # http://doc.scrapy.org/topics/scrapyd.html 5 | 6 | [settings] 7 | default = linkedin.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = linkedin 12 | -------------------------------------------------------------------------------- /misc/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/geekan/scrapy-examples/edb1cb116bd6def65a6ef01f953b58eb43e54305/misc/__init__.py -------------------------------------------------------------------------------- /misc/log.py: -------------------------------------------------------------------------------- 1 | 2 | #from scrapy import log 3 | import logging as log 4 | 5 | def warn(msg): 6 | #log.msg(str(msg), level=log.WARNING) 7 | log.warn(str(msg)) 8 | 9 | 10 | def info(msg): 11 | #log.msg(str(msg), level=log.INFO) 12 | log.info(str(msg)) 13 | 14 | 15 | def debug(msg): 16 | #log.msg(str(msg), level=log.DEBUG) 17 | log.debug(str(msg)) 18 | 19 | import pprint 20 | class MyPrettyPrinter(pprint.PrettyPrinter): 21 | def format(self, object, context, maxlevels, level): 22 | if isinstance(object, unicode): 23 | return (object.encode('utf8'), True, False) 24 | return pprint.PrettyPrinter.format(self, object, context, maxlevels, level) 25 | pu = MyPrettyPrinter() 26 | 27 | pp = pprint.PrettyPrinter() 28 | -------------------------------------------------------------------------------- /misc/middleware.py: -------------------------------------------------------------------------------- 1 | from proxy import PROXIES, FREE_PROXIES 2 | from agents import AGENTS 3 | import logging as log 4 | 5 | import random 6 | 7 | 8 | class CustomHttpProxyFromMysqlMiddleware(object): 9 | proxies = 
FREE_PROXIES 10 | 11 | def process_request(self, request, spider): 12 | # TODO implement complex proxy providing algorithm 13 | if self.use_proxy(request): 14 | p = random.choice(self.proxies) 15 | try: 16 | request.meta['proxy'] = "http://%s" % p['ip_port'] 17 | print(request.meta['proxy']) 18 | except Exception, e: 19 | #log.msg("Exception %s" % e, _level=log.CRITICAL) 20 | log.critical("Exception %s" % e) 21 | 22 | def use_proxy(self, request): 23 | """ 24 | using direct download for depth <= 2 25 | using proxy with probability 0.3 26 | """ 27 | #if "depth" in request.meta and int(request.meta['depth']) <= 2: 28 | # return False 29 | #i = random.randint(1, 10) 30 | #return i <= 2 31 | return True 32 | 33 | 34 | 35 | class CustomHttpProxyMiddleware(object): 36 | 37 | def process_request(self, request, spider): 38 | # TODO implement complex proxy providing algorithm 39 | if self.use_proxy(request): 40 | p = random.choice(PROXIES) 41 | try: 42 | request.meta['proxy'] = "http://%s" % p['ip_port'] 43 | except Exception, e: 44 | #log.msg("Exception %s" % e, _level=log.CRITICAL) 45 | log.critical("Exception %s" % e) 46 | 47 | def use_proxy(self, request): 48 | """ 49 | using direct download for depth <= 2 50 | using proxy with probability 0.3 51 | """ 52 | #if "depth" in request.meta and int(request.meta['depth']) <= 2: 53 | # return False 54 | #i = random.randint(1, 10) 55 | #return i <= 2 56 | return True 57 | 58 | 59 | class CustomUserAgentMiddleware(object): 60 | def process_request(self, request, spider): 61 | agent = random.choice(AGENTS) 62 | request.headers['User-Agent'] = agent 63 | -------------------------------------------------------------------------------- /misc/proxy.py: -------------------------------------------------------------------------------- 1 | ''' 2 | This proxy is goagent/wallproxy 3 | If you want to disable it, plz configure settings.py 4 | ''' 5 | PROXIES = [ 6 | #{"ip_port": "127.0.0.1:8087"}, #goagent 7 | #{"ip_port": "127.0.0.1:8118"}, #tor via privoxy 8 | {"ip_port": "127.0.0.1:1080"}, #tor via privoxy 9 | ] 10 | 11 | FREE_PROXIES = [ 12 | {"ip_port": "181.48.0.173:8081"}, 13 | {"ip_port": "82.43.21.165:3128"}, 14 | {"ip_port": "185.112.234.4:80"}, 15 | {"ip_port": "118.189.13.178:8080"}, 16 | {"ip_port": "37.187.117.157:3128"}, 17 | {"ip_port": "62.201.200.17:80"}, 18 | {"ip_port": "181.143.28.210:3128"}, 19 | {"ip_port": "216.190.97.3:3128"}, 20 | {"ip_port": "183.111.169.205:3128"}, 21 | ] 22 | -------------------------------------------------------------------------------- /pandatv/pandatv/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/geekan/scrapy-examples/edb1cb116bd6def65a6ef01f953b58eb43e54305/pandatv/pandatv/__init__.py -------------------------------------------------------------------------------- /pandatv/pandatv/items.py: -------------------------------------------------------------------------------- 1 | # Define here the models for your scraped items 2 | # 3 | # See documentation in: 4 | # http://doc.scrapy.org/en/latest/topics/items.html 5 | 6 | from scrapy.item import Item, Field 7 | 8 | class pandatvItem(Item): 9 | # define the fields for your item here like: 10 | name = Field() 11 | 12 | -------------------------------------------------------------------------------- /pandatv/pandatv/pipelines.py: -------------------------------------------------------------------------------- 1 | # Define your item pipelines here 2 | # 3 | # Don't forget to add your pipeline 
to the ITEM_PIPELINES setting 4 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html 5 | 6 | import redis 7 | 8 | 9 | from scrapy import signals 10 | 11 | 12 | import json 13 | import codecs 14 | from collections import OrderedDict 15 | 16 | 17 | class JsonWithEncodingPipeline(object): 18 | 19 | def __init__(self): 20 | self.file = codecs.open('data_utf8.json', 'w', encoding='utf-8') 21 | 22 | def process_item(self, item, spider): 23 | line = json.dumps(OrderedDict(item), ensure_ascii=False, sort_keys=False) + "\n" 24 | self.file.write(line) 25 | return item 26 | 27 | def close_spider(self, spider): 28 | self.file.close() 29 | 30 | 31 | class RedisPipeline(object): 32 | 33 | def __init__(self): 34 | self.r = redis.StrictRedis(host='localhost', port=6379) 35 | 36 | def process_item(self, item, spider): 37 | if not item['id']: 38 | print 'no id item!!' 39 | 40 | str_recorded_item = self.r.get(item['id']) 41 | final_item = None 42 | if str_recorded_item is None: 43 | final_item = item 44 | else: 45 | ritem = eval(self.r.get(item['id'])) 46 | final_item = dict(item.items() + ritem.items()) 47 | self.r.set(item['id'], final_item) 48 | 49 | def close_spider(self, spider): 50 | return 51 | -------------------------------------------------------------------------------- /pandatv/pandatv/settings.py: -------------------------------------------------------------------------------- 1 | # Scrapy settings for pandatv project 2 | # 3 | # For simplicity, this file contains only the most important settings by 4 | # default. All the other settings are documented here: 5 | # 6 | # http://doc.scrapy.org/en/latest/topics/settings.html 7 | # 8 | 9 | import sys 10 | import os 11 | from os.path import dirname 12 | path = dirname(dirname(os.path.abspath(os.path.dirname(__file__)))) 13 | sys.path.append(path) 14 | from misc.log import * 15 | 16 | BOT_NAME = 'pandatv' 17 | 18 | SPIDER_MODULES = ['pandatv.spiders'] 19 | NEWSPIDER_MODULE = 'pandatv.spiders' 20 | 21 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 22 | #USER_AGENT = 'pandatv (+http://www.yourdomain.com)' 23 | 24 | DOWNLOADER_MIDDLEWARES = { 25 | # 'misc.middleware.CustomHttpProxyMiddleware': 400, 26 | 'misc.middleware.CustomUserAgentMiddleware': 401, 27 | } 28 | 29 | ITEM_PIPELINES = { 30 | 'pandatv.pipelines.JsonWithEncodingPipeline': 300, 31 | #'pandatv.pipelines.RedisPipeline': 301, 32 | } 33 | 34 | LOG_LEVEL = 'INFO' 35 | 36 | DOWNLOAD_DELAY = 1 37 | -------------------------------------------------------------------------------- /pandatv/pandatv/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 
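Note: the RedisPipeline above (copied into most of the projects in this repo) relies on Python 2 behaviour: the bare `print` statement, list concatenation of `dict.items()`, and `eval()` of the stored value; it also never returns the item, unlike the other pipelines. A minimal Python 3 sketch of the same merge-and-store idea, using JSON instead of eval (the `id` field and the localhost Redis come from the original; the rest is an assumption):

```python
import json

import redis


class RedisPipeline(object):
    """Python 3 sketch: merge each item into what Redis already holds for its id."""

    def __init__(self):
        self.r = redis.StrictRedis(host='localhost', port=6379)

    def process_item(self, item, spider):
        if not item.get('id'):
            spider.logger.warning('item without id: %r', item)
            return item
        merged = dict(item)
        stored = self.r.get(item['id'])
        if stored is not None:
            # stored fields win, mirroring dict(item.items() + ritem.items())
            merged.update(json.loads(stored.decode('utf-8')))
        self.r.set(item['id'], json.dumps(merged, ensure_ascii=False))
        return item
```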
5 | -------------------------------------------------------------------------------- /pandatv/pandatv/spiders/spider.py: -------------------------------------------------------------------------------- 1 | import re 2 | import json 3 | from urlparse import urlparse 4 | import urllib 5 | import pdb 6 | 7 | 8 | from scrapy.selector import Selector 9 | try: 10 | from scrapy.spiders import Spider 11 | except: 12 | from scrapy.spiders import BaseSpider as Spider 13 | from scrapy.utils.response import get_base_url 14 | from scrapy.spiders import CrawlSpider, Rule 15 | from scrapy.linkextractors import LinkExtractor as sle 16 | 17 | 18 | from pandatv.items import * 19 | from misc.log import * 20 | from misc.spider import CommonSpider 21 | 22 | 23 | class pandatvSpider(CommonSpider): 24 | name = "pandatv" 25 | allowed_domains = ["panda.tv"] 26 | start_urls = [ 27 | "http://www.panda.tv/all", 28 | ] 29 | rules = [ 30 | Rule(sle(allow=("http://www.panda.tv/all")), callback='parse_1', follow=True), 31 | ] 32 | 33 | list_css_rules = { 34 | '.video-list-item.video-no-tag': { 35 | 'room_name': '.video-title::text', 36 | 'author': '.video-nickname::text', 37 | 'people_count': '.video-number::text', 38 | 'tag': '.video-cate::text', 39 | } 40 | } 41 | 42 | content_css_rules = { 43 | 'text': '#Cnt-Main-Article-QQ p *::text', 44 | 'images': '#Cnt-Main-Article-QQ img::attr(src)', 45 | 'images-desc': '#Cnt-Main-Article-QQ div p+ p::text', 46 | } 47 | 48 | def parse_1(self, response): 49 | info('Parse '+response.url) 50 | x = self.parse_with_rules(response, self.list_css_rules, dict) 51 | # x = self.parse_with_rules(response, self.content_css_rules, dict) 52 | print(json.dumps(x, ensure_ascii=False, indent=2)) 53 | # pp.pprint(x) 54 | # return self.parse_with_rules(response, self.css_rules, pandatvItem) 55 | -------------------------------------------------------------------------------- /pandatv/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # http://doc.scrapy.org/en/latest/topics/scrapyd.html 5 | 6 | [settings] 7 | default = pandatv.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = pandatv 12 | -------------------------------------------------------------------------------- /proxylist/proxylist/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/geekan/scrapy-examples/edb1cb116bd6def65a6ef01f953b58eb43e54305/proxylist/proxylist/__init__.py -------------------------------------------------------------------------------- /proxylist/proxylist/items.py: -------------------------------------------------------------------------------- 1 | # Define here the models for your scraped items 2 | # 3 | # See documentation in: 4 | # http://doc.scrapy.org/en/latest/topics/items.html 5 | 6 | from scrapy.item import Item, Field 7 | 8 | class freeProxyListItem(Item): 9 | # define the fields for your item here like: 10 | ip = Field() 11 | port = Field() 12 | code = Field() 13 | country = Field() 14 | anonymity = Field() 15 | google = Field() 16 | https = Field() 17 | last_checked = Field() 18 | 19 | -------------------------------------------------------------------------------- /proxylist/proxylist/pipelines.py: -------------------------------------------------------------------------------- 1 | # Define your item pipelines here 2 | # 3 | # Don't forget to add 
your pipeline to the ITEM_PIPELINES setting 4 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html 5 | 6 | import redis 7 | 8 | 9 | from scrapy import signals 10 | 11 | 12 | import json 13 | import codecs 14 | from collections import OrderedDict 15 | 16 | 17 | import sys 18 | import MySQLdb 19 | import hashlib 20 | from scrapy.exceptions import DropItem 21 | from scrapy.http import Request 22 | 23 | 24 | class JsonWithEncodingPipeline(object): 25 | 26 | def __init__(self): 27 | self.file = codecs.open('data_utf8.json', 'w', encoding='utf-8') 28 | 29 | def process_item(self, item, spider): 30 | line = json.dumps(OrderedDict(item), ensure_ascii=False, sort_keys=False) + "\n" 31 | self.file.write(line) 32 | return item 33 | 34 | def close_spider(self, spider): 35 | self.file.close() 36 | 37 | 38 | 39 | class MySQLStorePipeline(object): 40 | def __init__(self): 41 | # user, passwd, db 42 | self.conn = MySQLdb.connect(user='proxylist', passwd='proxylist', db='proxylist', host='localhost', charset="utf8", use_unicode=True) 43 | self.cursor = self.conn.cursor() 44 | # self.cursor.execute('create table free_proxy_list (ip varchar(32), port int, code varchar(16), country varchar(64), anoymity varchar(32), google varchar(4), https varchar(4), last_checked varchar(32));''') 45 | 46 | def process_item(self, item, spider): 47 | try: 48 | l = ['ip', 'port', 'code', 'country', 'anonymity', 'google', 'https', 'last_checked'] 49 | self.cursor.execute(""" 50 | INSERT INTO free_proxy_list 51 | VALUES (%s, %s, %s, %s, %s, %s, %s, %s)""", 52 | [item[i].encode('utf-8') for i in l] 53 | ) 54 | self.conn.commit() 55 | except MySQLdb.Error, e: 56 | print "Error %d: %s" % (e.args[0], e.args[1]) 57 | 58 | return item 59 | 60 | 61 | class RedisPipeline(object): 62 | 63 | def __init__(self): 64 | self.r = redis.StrictRedis(host='localhost', port=6379) 65 | 66 | def process_item(self, item, spider): 67 | if not item['id']: 68 | print 'no id item!!' 69 | 70 | str_recorded_item = self.r.get(item['id']) 71 | final_item = None 72 | if str_recorded_item is None: 73 | final_item = item 74 | else: 75 | ritem = eval(self.r.get(item['id'])) 76 | final_item = dict(item.items() + ritem.items()) 77 | self.r.set(item['id'], final_item) 78 | 79 | def close_spider(self, spider): 80 | return 81 | -------------------------------------------------------------------------------- /proxylist/proxylist/settings.py: -------------------------------------------------------------------------------- 1 | # Scrapy settings for proxylist project 2 | # 3 | # For simplicity, this file contains only the most important settings by 4 | # default. 
All the other settings are documented here: 5 | # 6 | # http://doc.scrapy.org/en/latest/topics/settings.html 7 | # 8 | 9 | import sys 10 | import os 11 | from os.path import dirname 12 | path = dirname(dirname(os.path.abspath(os.path.dirname(__file__)))) 13 | sys.path.append(path) 14 | from misc.log import * 15 | 16 | BOT_NAME = 'proxylist' 17 | 18 | SPIDER_MODULES = ['proxylist.spiders'] 19 | NEWSPIDER_MODULE = 'proxylist.spiders' 20 | 21 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 22 | #USER_AGENT = 'proxylist (+http://www.yourdomain.com)' 23 | 24 | DOWNLOADER_MIDDLEWARES = { 25 | # 'misc.middleware.CustomHttpProxyMiddleware': 400, 26 | 'misc.middleware.CustomUserAgentMiddleware': 401, 27 | } 28 | 29 | ITEM_PIPELINES = { 30 | 'proxylist.pipelines.JsonWithEncodingPipeline': 300, 31 | #'proxylist.pipelines.RedisPipeline': 301, 32 | 'proxylist.pipelines.MySQLStorePipeline': 302 33 | } 34 | 35 | LOG_LEVEL = 'INFO' 36 | 37 | DOWNLOAD_DELAY = 1 38 | -------------------------------------------------------------------------------- /proxylist/proxylist/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /proxylist/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # http://doc.scrapy.org/en/latest/topics/scrapyd.html 5 | 6 | [settings] 7 | default = proxylist.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = proxylist 12 | -------------------------------------------------------------------------------- /qqnews/qqnews/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/geekan/scrapy-examples/edb1cb116bd6def65a6ef01f953b58eb43e54305/qqnews/qqnews/__init__.py -------------------------------------------------------------------------------- /qqnews/qqnews/items.py: -------------------------------------------------------------------------------- 1 | # Define here the models for your scraped items 2 | # 3 | # See documentation in: 4 | # http://doc.scrapy.org/en/latest/topics/items.html 5 | 6 | from scrapy.item import Item, Field 7 | 8 | class qqnewsItem(Item): 9 | # define the fields for your item here like: 10 | name = Field() 11 | title = Field() 12 | url = Field() 13 | content = Field() 14 | 15 | class PositionDetailItem(Item): 16 | title = Field() 17 | link = Field() 18 | sharetitle = Field() 19 | bottomline = Field() 20 | duty = Field() 21 | xxx = Field() 22 | -------------------------------------------------------------------------------- /qqnews/qqnews/pipelines.py: -------------------------------------------------------------------------------- 1 | # Define your item pipelines here 2 | # 3 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 4 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html 5 | 6 | import redis 7 | 8 | 9 | from scrapy import signals 10 | 11 | 12 | import json 13 | import codecs 14 | from collections import OrderedDict 15 | 16 | 17 | class JsonWithEncodingPipeline(object): 18 | 19 | def __init__(self): 20 | self.file = 
codecs.open('data_utf8.json', 'w', encoding='utf-8') 21 | 22 | def process_item(self, item, spider): 23 | line = json.dumps(OrderedDict(item), ensure_ascii=False, sort_keys=False) + "\n" 24 | self.file.write(line) 25 | return item 26 | 27 | def close_spider(self, spider): 28 | self.file.close() 29 | 30 | 31 | class RedisPipeline(object): 32 | 33 | def __init__(self): 34 | self.r = redis.StrictRedis(host='localhost', port=6379) 35 | 36 | def process_item(self, item, spider): 37 | if not item['id']: 38 | print 'no id item!!' 39 | 40 | str_recorded_item = self.r.get(item['id']) 41 | final_item = None 42 | if str_recorded_item is None: 43 | final_item = item 44 | else: 45 | ritem = eval(self.r.get(item['id'])) 46 | final_item = dict(item.items() + ritem.items()) 47 | self.r.set(item['id'], final_item) 48 | 49 | def close_spider(self, spider): 50 | return 51 | -------------------------------------------------------------------------------- /qqnews/qqnews/settings.py: -------------------------------------------------------------------------------- 1 | # Scrapy settings for qqnews project 2 | # 3 | # For simplicity, this file contains only the most important settings by 4 | # default. All the other settings are documented here: 5 | # 6 | # http://doc.scrapy.org/en/latest/topics/settings.html 7 | # 8 | 9 | import sys 10 | import os 11 | from os.path import dirname 12 | path = dirname(dirname(os.path.abspath(os.path.dirname(__file__)))) 13 | sys.path.append(path) 14 | from misc.log import * 15 | 16 | BOT_NAME = 'qqnews' 17 | 18 | SPIDER_MODULES = ['qqnews.spiders'] 19 | NEWSPIDER_MODULE = 'qqnews.spiders' 20 | 21 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 22 | #USER_AGENT = 'qqnews (+http://www.yourdomain.com)' 23 | 24 | DOWNLOADER_MIDDLEWARES = { 25 | #'misc.middleware.CustomHttpProxyMiddleware': 400, 26 | 'misc.middleware.CustomUserAgentMiddleware': 401, 27 | } 28 | 29 | ITEM_PIPELINES = { 30 | 'qqnews.pipelines.JsonWithEncodingPipeline': 300, 31 | #'qqnews.pipelines.RedisPipeline': 301, 32 | } 33 | 34 | LOG_LEVEL = 'INFO' 35 | 36 | #DOWNLOAD_DELAY = 1 37 | -------------------------------------------------------------------------------- /qqnews/qqnews/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 
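Note: `MySQLStorePipeline` in proxylist/pipelines.py above uses the Python 2-only `except MySQLdb.Error, e` syntax and inserts positionally into `free_proxy_list` (the table layout comes from the commented-out CREATE TABLE in the original). A rough Python 3 sketch of the same insert, using PyMySQL as an assumed drop-in replacement for MySQLdb:

```python
import pymysql


class MySQLStorePipeline(object):
    def __init__(self):
        # same credentials as the original pipeline
        self.conn = pymysql.connect(user='proxylist', password='proxylist',
                                    database='proxylist', host='localhost',
                                    charset='utf8')
        self.cursor = self.conn.cursor()

    def process_item(self, item, spider):
        cols = ['ip', 'port', 'code', 'country', 'anonymity',
                'google', 'https', 'last_checked']
        try:
            self.cursor.execute(
                "INSERT INTO free_proxy_list VALUES (%s, %s, %s, %s, %s, %s, %s, %s)",
                [item[c] for c in cols])
            self.conn.commit()
        except pymysql.MySQLError as e:
            spider.logger.error("MySQL error %s: %s", e.args[0], e.args[1:])
        return item
```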
5 | -------------------------------------------------------------------------------- /qqnews/qqnews/spiders/spider.py: -------------------------------------------------------------------------------- 1 | import re 2 | import json 3 | 4 | 5 | from scrapy.selector import Selector 6 | try: 7 | from scrapy.spiders import Spider 8 | except: 9 | from scrapy.spiders import BaseSpider as Spider 10 | from scrapy.utils.response import get_base_url 11 | from scrapy.spiders import CrawlSpider, Rule 12 | from scrapy.linkextractors import LinkExtractor as sle 13 | 14 | 15 | from qqnews.items import * 16 | from misc.log import * 17 | from misc.spider import CommonSpider 18 | 19 | 20 | class qqnewsSpider(CommonSpider): 21 | name = "qqnews" 22 | allowed_domains = ["tencent.com", 'qq.com'] 23 | start_urls = [ 24 | 'http://news.qq.com/society_index.shtml' 25 | ] 26 | rules = [ 27 | Rule(sle(allow=('society_index.shtml')), callback='parse_0', follow=True), 28 | Rule(sle(allow=(".*[0-9]{8}.*htm$")), callback='parse_1', follow=True), 29 | ] 30 | 31 | list_css_rules = { 32 | '.linkto': { 33 | 'url': 'a::attr(href)', 34 | 'name': 'a::text', 35 | } 36 | } 37 | 38 | list_css_rules_2 = { 39 | '#listZone .Q-tpWrap': { 40 | 'url': '.linkto::attr(href)', 41 | 'name': '.linkto::text' 42 | } 43 | } 44 | 45 | content_css_rules = { 46 | 'text': '#Cnt-Main-Article-QQ p *::text', 47 | 'images': '#Cnt-Main-Article-QQ img::attr(src)', 48 | 'images-desc': '#Cnt-Main-Article-QQ div p+ p::text', 49 | } 50 | 51 | def parse_0(self, response): 52 | info('Parse0 '+response.url) 53 | x = self.parse_with_rules(response, self.list_css_rules, dict) 54 | pp.pprint(x) 55 | #return self.parse_with_rules(response, self.list_css_rules, qqnewsItem) 56 | 57 | def parse_1(self, response): 58 | info('Parse1 '+response.url) 59 | x = self.parse_with_rules(response, self.content_css_rules, dict) 60 | pp.pprint(x) 61 | #import pdb; pdb.set_trace() 62 | 63 | def parse_2(self, response): 64 | info('Parse2 '+response.url) 65 | 66 | -------------------------------------------------------------------------------- /qqnews/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # http://doc.scrapy.org/en/latest/topics/scrapyd.html 5 | 6 | [settings] 7 | default = qqnews.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = qqnews 12 | -------------------------------------------------------------------------------- /reddit/reddit/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/geekan/scrapy-examples/edb1cb116bd6def65a6ef01f953b58eb43e54305/reddit/reddit/__init__.py -------------------------------------------------------------------------------- /reddit/reddit/items.py: -------------------------------------------------------------------------------- 1 | # Define here the models for your scraped items 2 | # 3 | # See documentation in: 4 | # http://doc.scrapy.org/en/latest/topics/items.html 5 | 6 | from scrapy.item import Item, Field 7 | 8 | class redditItem(Item): 9 | # define the fields for your item here like: 10 | name = Field() 11 | 12 | -------------------------------------------------------------------------------- /reddit/reddit/pipelines.py: -------------------------------------------------------------------------------- 1 | # Define your item pipelines here 2 | # 3 | # Don't forget to add your 
pipeline to the ITEM_PIPELINES setting 4 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html 5 | 6 | import redis 7 | 8 | 9 | from scrapy import signals 10 | 11 | 12 | import json 13 | import codecs 14 | from collections import OrderedDict 15 | 16 | 17 | class JsonWithEncodingPipeline(object): 18 | 19 | def __init__(self): 20 | self.file = codecs.open('data_utf8.json', 'w', encoding='utf-8') 21 | 22 | def process_item(self, item, spider): 23 | line = json.dumps(OrderedDict(item), ensure_ascii=False, sort_keys=False) + "\n" 24 | self.file.write(line) 25 | return item 26 | 27 | def close_spider(self, spider): 28 | self.file.close() 29 | 30 | 31 | class RedisPipeline(object): 32 | 33 | def __init__(self): 34 | self.r = redis.StrictRedis(host='localhost', port=6379) 35 | 36 | def process_item(self, item, spider): 37 | if not item['id']: 38 | print 'no id item!!' 39 | 40 | str_recorded_item = self.r.get(item['id']) 41 | final_item = None 42 | if str_recorded_item is None: 43 | final_item = item 44 | else: 45 | ritem = eval(self.r.get(item['id'])) 46 | final_item = dict(item.items() + ritem.items()) 47 | self.r.set(item['id'], final_item) 48 | 49 | def close_spider(self, spider): 50 | return 51 | -------------------------------------------------------------------------------- /reddit/reddit/settings.py: -------------------------------------------------------------------------------- 1 | # Scrapy settings for reddit project 2 | # 3 | # For simplicity, this file contains only the most important settings by 4 | # default. All the other settings are documented here: 5 | # 6 | # http://doc.scrapy.org/en/latest/topics/settings.html 7 | # 8 | 9 | import sys 10 | import os 11 | from os.path import dirname 12 | path = dirname(dirname(os.path.abspath(os.path.dirname(__file__)))) 13 | sys.path.append(path) 14 | from misc.log import * 15 | 16 | BOT_NAME = 'reddit' 17 | 18 | SPIDER_MODULES = ['reddit.spiders'] 19 | NEWSPIDER_MODULE = 'reddit.spiders' 20 | 21 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 22 | #USER_AGENT = 'reddit (+http://www.yourdomain.com)' 23 | 24 | DOWNLOADER_MIDDLEWARES = { 25 | # 'misc.middleware.CustomHttpProxyMiddleware': 400, 26 | 'misc.middleware.CustomUserAgentMiddleware': 401, 27 | } 28 | 29 | ITEM_PIPELINES = { 30 | 'reddit.pipelines.JsonWithEncodingPipeline': 300, 31 | #'reddit.pipelines.RedisPipeline': 301, 32 | } 33 | 34 | LOG_LEVEL = 'INFO' 35 | 36 | DOWNLOAD_DELAY = 1 37 | -------------------------------------------------------------------------------- /reddit/reddit/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 
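Note: the nested `list_css_rules` dictionaries used by the qqnews spider above (and the reddit spider below) are interpreted by `CommonSpider.parse_with_rules` from misc/spider.py, which is not reproduced in this section. As a point of reference, the qqnews rule set corresponds roughly to this hand-written extraction (a sketch, not the actual CommonSpider code):

```python
def parse_0(self, response):
    # list_css_rules = {'.linkto': {'url': 'a::attr(href)', 'name': 'a::text'}}
    results = []
    for node in response.css('.linkto'):
        results.append({
            'url': node.css('a::attr(href)').extract_first(),
            'name': node.css('a::text').extract_first(),
        })
    return results
```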
5 | -------------------------------------------------------------------------------- /reddit/reddit/spiders/spider.py: -------------------------------------------------------------------------------- 1 | import re 2 | import json 3 | from urlparse import urlparse 4 | import urllib 5 | import pdb 6 | 7 | 8 | from scrapy.selector import Selector 9 | try: 10 | from scrapy.spiders import Spider 11 | except: 12 | from scrapy.spiders import BaseSpider as Spider 13 | from scrapy.utils.response import get_base_url 14 | from scrapy.spiders import CrawlSpider, Rule 15 | from scrapy.linkextractors import LinkExtractor as sle 16 | 17 | 18 | from reddit.items import * 19 | from misc.log import * 20 | from misc.spider import CommonSpider 21 | 22 | 23 | class redditSpider(CommonSpider): 24 | name = "reddit" 25 | allowed_domains = ["reddit.com"] 26 | start_urls = [ 27 | "https://www.reddit.com/", 28 | ] 29 | rules = [ 30 | Rule(sle(allow=("https://www.reddit.com/")), callback='parse_1', follow=True), 31 | ] 32 | 33 | list_css_rules = { 34 | '.link': { 35 | 'title': '.title a::text', 36 | 'domain': '.domain a::text', 37 | 'author': '.author::text', 38 | 'comment_count': '.comments::text', 39 | 'score': '.score::text', 40 | } 41 | } 42 | 43 | content_css_rules = { 44 | 'text': '#Cnt-Main-Article-QQ p *::text', 45 | 'images': '#Cnt-Main-Article-QQ img::attr(src)', 46 | 'images-desc': '#Cnt-Main-Article-QQ div p+ p::text', 47 | } 48 | 49 | def parse_1(self, response): 50 | info('Parse '+response.url) 51 | x = self.parse_with_rules(response, self.list_css_rules, dict) 52 | # x = self.parse_with_rules(response, self.content_css_rules, dict) 53 | print(json.dumps(x, ensure_ascii=False, indent=2)) 54 | # pp.pprint(x) 55 | # return self.parse_with_rules(response, self.css_rules, redditItem) 56 | -------------------------------------------------------------------------------- /reddit/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # http://doc.scrapy.org/en/latest/topics/scrapyd.html 5 | 6 | [settings] 7 | default = reddit.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = reddit 12 | -------------------------------------------------------------------------------- /sinanews/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # http://doc.scrapy.org/en/latest/topics/scrapyd.html 5 | 6 | [settings] 7 | default = sinanews.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = sinanews 12 | -------------------------------------------------------------------------------- /sinanews/sinanews/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/geekan/scrapy-examples/edb1cb116bd6def65a6ef01f953b58eb43e54305/sinanews/sinanews/__init__.py -------------------------------------------------------------------------------- /sinanews/sinanews/items.py: -------------------------------------------------------------------------------- 1 | # Define here the models for your scraped items 2 | # 3 | # See documentation in: 4 | # http://doc.scrapy.org/en/latest/topics/items.html 5 | 6 | from scrapy.item import Item, Field 7 | 8 | class sinanewsItem(Item): 9 | # define the fields for your item here 
like: 10 | name = Field() 11 | content = Field() 12 | url = Field() 13 | 14 | 15 | -------------------------------------------------------------------------------- /sinanews/sinanews/pipelines.py: -------------------------------------------------------------------------------- 1 | # Define your item pipelines here 2 | # 3 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 4 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html 5 | 6 | import redis 7 | 8 | 9 | from scrapy import signals 10 | 11 | 12 | import json 13 | import codecs 14 | from collections import OrderedDict 15 | 16 | 17 | class JsonWithEncodingPipeline(object): 18 | 19 | def __init__(self): 20 | self.file = codecs.open('data_utf8.json', 'w', encoding='utf-8') 21 | 22 | def process_item(self, item, spider): 23 | line = json.dumps(OrderedDict(item), ensure_ascii=False, sort_keys=False) + "\n" 24 | self.file.write(line) 25 | return item 26 | 27 | def close_spider(self, spider): 28 | self.file.close() 29 | 30 | 31 | class RedisPipeline(object): 32 | 33 | def __init__(self): 34 | self.r = redis.StrictRedis(host='localhost', port=6379) 35 | 36 | def process_item(self, item, spider): 37 | if not item['id']: 38 | print 'no id item!!' 39 | 40 | str_recorded_item = self.r.get(item['id']) 41 | final_item = None 42 | if str_recorded_item is None: 43 | final_item = item 44 | else: 45 | ritem = eval(self.r.get(item['id'])) 46 | final_item = dict(item.items() + ritem.items()) 47 | self.r.set(item['id'], final_item) 48 | 49 | def close_spider(self, spider): 50 | return 51 | -------------------------------------------------------------------------------- /sinanews/sinanews/settings.py: -------------------------------------------------------------------------------- 1 | # Scrapy settings for sinanews project 2 | # 3 | # For simplicity, this file contains only the most important settings by 4 | # default. All the other settings are documented here: 5 | # 6 | # http://doc.scrapy.org/en/latest/topics/settings.html 7 | # 8 | 9 | import sys 10 | import os 11 | from os.path import dirname 12 | path = dirname(dirname(os.path.abspath(os.path.dirname(__file__)))) 13 | sys.path.append(path) 14 | from misc.log import * 15 | 16 | BOT_NAME = 'sinanews' 17 | 18 | SPIDER_MODULES = ['sinanews.spiders'] 19 | NEWSPIDER_MODULE = 'sinanews.spiders' 20 | 21 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 22 | #USER_AGENT = 'sinanews (+http://www.yourdomain.com)' 23 | 24 | DOWNLOADER_MIDDLEWARES = { 25 | # 'misc.middleware.CustomHttpProxyMiddleware': 400, 26 | 'misc.middleware.CustomHttpProxyFromMysqlMiddleware': 400, 27 | 'misc.middleware.CustomUserAgentMiddleware': 401, 28 | } 29 | 30 | ITEM_PIPELINES = { 31 | 'sinanews.pipelines.JsonWithEncodingPipeline': 300, 32 | #'sinanews.pipelines.RedisPipeline': 301, 33 | } 34 | 35 | LOG_LEVEL = 'INFO' 36 | 37 | DOWNLOAD_DELAY = 1 38 | -------------------------------------------------------------------------------- /sinanews/sinanews/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 
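Note: `JsonWithEncodingPipeline`, repeated in almost every project above, exists mainly to write UTF-8 JSON lines instead of ASCII-escaped output. Newer Scrapy versions can do the same with the built-in feed exports and no custom pipeline; a sketch, assuming Scrapy 1.2 or later for `FEED_EXPORT_ENCODING`:

```python
# settings.py: keep non-ASCII text readable in exported feeds
FEED_EXPORT_ENCODING = 'utf-8'
```

Running `scrapy crawl sinanews -o data_utf8.jl` then produces one JSON object per line, much like the pipeline's output file.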
5 | -------------------------------------------------------------------------------- /sinanews/sinanews/spiders/spider.py: -------------------------------------------------------------------------------- 1 | import re 2 | import json 3 | from urlparse import urlparse 4 | import urllib 5 | import pdb 6 | 7 | 8 | from scrapy.selector import Selector 9 | try: 10 | from scrapy.spiders import Spider 11 | except: 12 | from scrapy.spiders import BaseSpider as Spider 13 | from scrapy.utils.response import get_base_url 14 | from scrapy.spiders import CrawlSpider, Rule 15 | from scrapy.linkextractors import LinkExtractor as sle 16 | 17 | 18 | from sinanews.items import * 19 | from misc.log import * 20 | from misc.spider import CommonSpider 21 | 22 | 23 | import pprint 24 | class MyPrettyPrinter(pprint.PrettyPrinter): 25 | def format(self, object, context, maxlevels, level): 26 | if isinstance(object, unicode): 27 | return (object.encode('utf8'), True, False) 28 | return pprint.PrettyPrinter.format(self, object, context, maxlevels, level) 29 | pp = MyPrettyPrinter() 30 | 31 | 32 | class sinanewsSpider(CommonSpider): 33 | name = "sinanews" 34 | allowed_domains = ["news.sina.com.cn"] 35 | start_urls = [ 36 | "http://news.sina.com.cn/", 37 | ] 38 | rules = [ 39 | Rule(sle(allow=("http://news.sina.com.cn/$")), callback='parse_0'), 40 | Rule(sle(allow=(".*doc[^/]*shtml$")), callback='parse_1'), #, follow=True), 41 | #Rule(sle(allow=('/c/2015-11-19/doc-ifxkszhk0386278.shtml')), callback='parse_1', follow=True, process_request='process_request'), 42 | ] 43 | 44 | list_css_rules = { 45 | '#blk_yw_01 a': { 46 | 'url': 'a::attr(href)', 47 | 'name': 'a::text', 48 | } 49 | } 50 | 51 | content_css_rules = { 52 | 'text': 'p::text', 53 | 'images': 'img::attr(src)', 54 | 'images-desc': '.img_descr::text', 55 | # need url analysis for video 56 | #'video': '#J_Article_Player', 57 | } 58 | 59 | def process_request(self, r): 60 | info('process '+str(r)) 61 | return r 62 | 63 | def parse_0(self, response): 64 | info('Parse 0 '+response.url) 65 | x = self.parse_with_rules(response, self.list_css_rules, dict) 66 | pp.pprint(x) 67 | #pdb.set_trace() 68 | #return self.parse_with_rules(response, self.list_css_rules, sinanewsItem) 69 | 70 | def parse_1(self, response): 71 | info('Parse 1 '+response.url) 72 | x = self.parse_with_rules(response, self.content_css_rules, dict) 73 | pp.pprint(x) 74 | #self.parse_with_rules(response, self.css_rules, sinanewsItem) 75 | -------------------------------------------------------------------------------- /sis/README.md: -------------------------------------------------------------------------------- 1 | usage: 2 | 3 | ```sh 4 | scrapy crawl sis -a forum_id=230 -a digit=2 5 | sort -t"\"" -nk12 data_utf8.json 6 | ``` 7 | -------------------------------------------------------------------------------- /sis/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # http://doc.scrapy.org/en/latest/topics/scrapyd.html 5 | 6 | [settings] 7 | default = sis.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = sis 12 | -------------------------------------------------------------------------------- /sis/sis/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/geekan/scrapy-examples/edb1cb116bd6def65a6ef01f953b58eb43e54305/sis/sis/__init__.py 
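Note: `MyPrettyPrinter` (defined in misc/log.py earlier and again in the sinanews spider above) exists only to print Chinese text readably on Python 2; it references the `unicode` type and so fails on Python 3. On Python 3 the stock printer already shows non-ASCII characters, so the subclass can be dropped:

```python
# Python 3: repr() renders non-ASCII characters directly, no re-encoding needed
import pprint

pp = pprint.PrettyPrinter()
pp.pprint({'name': '新闻', 'url': 'http://news.sina.com.cn/'})
```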
-------------------------------------------------------------------------------- /sis/sis/items.py: -------------------------------------------------------------------------------- 1 | # Define here the models for your scraped items 2 | # 3 | # See documentation in: 4 | # http://doc.scrapy.org/en/latest/topics/items.html 5 | 6 | from scrapy.item import Item, Field 7 | 8 | class SisItem(Item): 9 | title = Field() 10 | link = Field() 11 | imgs = Field() 12 | torrents = Field() 13 | sharetitle = Field() 14 | bottomline = Field() 15 | duty = Field() 16 | xxx = Field() 17 | 18 | class SisForumListItem(Item): 19 | content = Field() # raw content with all html 20 | title = Field() 21 | thread_type = Field() 22 | author = Field() 23 | post_time = Field() 24 | link = Field() 25 | star = Field() 26 | comment = Field() 27 | view = Field() 28 | size = Field() 29 | video_type = Field() 30 | last_post_time = Field() 31 | -------------------------------------------------------------------------------- /sis/sis/pipelines.py: -------------------------------------------------------------------------------- 1 | # Define your item pipelines here 2 | # 3 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 4 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html 5 | 6 | from scrapy import signals 7 | 8 | 9 | import json 10 | import codecs 11 | from collections import OrderedDict 12 | 13 | 14 | class JsonWithEncodingPipeline(object): 15 | 16 | def __init__(self): 17 | self.file = codecs.open('data_utf8.json', 'w', encoding='utf-8') 18 | 19 | def process_item(self, item, spider): 20 | line = json.dumps(OrderedDict(item), ensure_ascii=False, sort_keys=False) + "\n" 21 | self.file.write(line) 22 | return item 23 | 24 | def close_spider(self, spider): 25 | self.file.close() 26 | -------------------------------------------------------------------------------- /sis/sis/settings.py: -------------------------------------------------------------------------------- 1 | # Scrapy settings for sis project 2 | # 3 | # For simplicity, this file contains only the most important settings by 4 | # default. All the other settings are documented here: 5 | # 6 | # http://doc.scrapy.org/en/latest/topics/settings.html 7 | # 8 | 9 | import sys 10 | import os 11 | from os.path import dirname 12 | path = dirname(dirname(os.path.abspath(os.path.dirname(__file__)))) 13 | sys.path.append(path) 14 | from misc.log import * 15 | 16 | BOT_NAME = 'sis' 17 | 18 | SPIDER_MODULES = ['sis.spiders'] 19 | NEWSPIDER_MODULE = 'sis.spiders' 20 | 21 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 22 | #USER_AGENT = 'sis (+http://www.yourdomain.com)' 23 | DOWNLOADER_MIDDLEWARES = { 24 | # 'misc.middleware.CustomHttpProxyMiddleware': 400, 25 | 'misc.middleware.CustomUserAgentMiddleware': 401, 26 | } 27 | 28 | ITEM_PIPELINES = { 29 | 'sis.pipelines.JsonWithEncodingPipeline': 300, 30 | } 31 | 32 | LOG_LEVEL = 'INFO' 33 | 34 | DOWNLOAD_DELAY = 1 35 | -------------------------------------------------------------------------------- /sis/sis/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 
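Note: the sis README above starts the crawl with `-a forum_id=230 -a digit=2`. Scrapy passes each `-a name=value` pair to the spider's `__init__` as a keyword argument, always as a string, which is why the sis spider below casts the values before building its `start_urls` and `rules`. A minimal illustration of the mechanism (the spider name and attributes here are invented):

```python
import scrapy


class ArgsDemoSpider(scrapy.Spider):   # hypothetical, not part of this repo
    name = "argsdemo"

    def __init__(self, forum_id=58, digit=1, *args, **kwargs):
        super(ArgsDemoSpider, self).__init__(*args, **kwargs)
        # "-a forum_id=230 -a digit=2" arrives here as forum_id="230", digit="2"
        self.forum_id = int(forum_id)
        self.digit = int(digit)
```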
5 | -------------------------------------------------------------------------------- /sis/sis/spiders/sis_spider.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | import re 4 | import json 5 | import sys 6 | from urlparse import urljoin 7 | 8 | 9 | from scrapy.selector import Selector 10 | try: 11 | from scrapy.spiders import Spider 12 | except: 13 | from scrapy.spiders import BaseSpider as Spider 14 | from scrapy.utils.response import get_base_url 15 | from scrapy.spiders import CrawlSpider, Rule 16 | from scrapy.linkextractors import LinkExtractor as sle 17 | 18 | from sis.items import * 19 | from misc.log import * 20 | 21 | 22 | class sisSpider(CrawlSpider): 23 | name = "sis" 24 | ip = "38.103.161.187" 25 | allowed_domains = [ip] 26 | ip_format = 'http://' + ip + '/forum/forum-%d-1.html' 27 | ''' 28 | start_urls = [ 29 | # ip_format % d for d in [230] #[143, 230, 58] 30 | ] 31 | rules = [ 32 | # Rule(sle(allow=("/forum/thread-\d*-1-1\.html")), callback='parse_2'), 33 | # Rule(sle(allow=("/forum/forum-(143|230|58)-[0-9]{,2}\.html")), follow=True, callback='parse_1'), 34 | Rule(sle(allow=("/forum/forum-230-[0-9]{,4}\.html")), follow=True, callback='parse_1'), 35 | ] 36 | ''' 37 | 38 | def __init__(self, forum_id=58, digit=1, *args, **kwargs): 39 | self.start_urls = [self.ip_format % d for d in [int(forum_id)]] 40 | self.rules = [Rule(sle(allow=("/forum/forum-" + str(forum_id) + "-[0-9]{," + str(digit) + "}\.html")), follow=True, callback='parse_1'),] 41 | super(sisSpider, self).__init__(*args, **kwargs) 42 | 43 | def parse_2(self, response): 44 | items = [] 45 | sel = Selector(response) 46 | sites = sel.css('.postcontent')[0:1] 47 | for site in sites: 48 | item = SisItem() 49 | item['title'] = site.css('.postmessage h2::text').extract() 50 | imgs = site.css('.postmessage img::attr(src)').extract() 51 | item['imgs'] = filter(lambda x: not x.endswith('.gif'), imgs) 52 | item['torrents'] = [urljoin(response.url, x) for x in site.css('.t_attachlist a[href*=attachment]::attr(href)').extract()] 53 | # item['duty'] = site.css('.c .l2::text').extract() 54 | item['link'] = response.url 55 | items.append(item) 56 | # print repr(item).decode("unicode-escape") + '\n' 57 | # info('parsed ' + str(response)) 58 | self.parse_1(response) 59 | return items 60 | 61 | def parse_1(self, response): 62 | items = [] 63 | # url cannot encode to Chinese easily.. 
XXX 64 | info('parsed ' + str(response)) 65 | sel = Selector(response) 66 | threads = sel.css('tbody[id*=normalthread_]') 67 | for thread in threads: 68 | item = SisForumListItem() 69 | # filter some thread 70 | inner_thread = thread.css('span[id*=thread_]') 71 | url = urljoin(response.url, inner_thread.css('a[href]::attr(href)').extract()[0]) 72 | thread_content = re.sub(r"\s\s+", " ", thread.extract()) 73 | # if re.search(u"(奸|姦)", thread_content): 74 | item['title'] = inner_thread.css('a::text').extract()[0] 75 | item['link'] = url 76 | item['star'] = re.sub(r'\s+', '', thread.css('td[class=author] cite::text').extract()[1]) 77 | item['comment'] = thread.css('td[class=nums] strong::text').extract()[0] 78 | item['view'] = thread.css('td[class=nums] em::text').extract()[0] 79 | item['post_time'] = thread.css('td[class=author] em::text').extract()[0] 80 | print ' ', item['post_time'], item['star'], '|', item['title'], item['link'], item['comment'], item['view'] 81 | 82 | # NOTE: content is only for debug purpose 83 | # item['content'] = thread_content 84 | 85 | items.append(item) 86 | # yield Request(url, callback=parse_2) 87 | return items 88 | 89 | def _process_request(self, request): 90 | info('process ' + str(request)) 91 | return request 92 | -------------------------------------------------------------------------------- /startproject.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | 4 | usage() { 5 | echo "\n usage:\n ./startproject.sh \n" 6 | } 7 | 8 | if [ -z "$1" ]; then 9 | usage 10 | exit 11 | fi 12 | 13 | echo "Starting project $1." 14 | 15 | cp -r template $1 16 | if [ "$(uname)" == "Darwin" ]; then 17 | #alias sed='sed -i' 18 | find $1 -type f | xargs sed -i '' "s/template/$1/" 19 | elif [ "$(expr substr $(uname -s) 1 5)" == "Linux" ]; then 20 | find $1 -type f | xargs sed -i "s/template/$1/" 21 | elif [ "$(expr substr $(uname -s) 1 10)" == "MINGW32_NT" ]; then 22 | find $1 -type f | xargs sed -i "s/template/$1/" 23 | fi 24 | mv $1/template $1/$1 25 | 26 | echo "Create $1 succeed!" 
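Note: startproject.sh above copies the `template` project and uses sed to replace every occurrence of the word "template" with the new project name, so the new spider, settings and scrapy.cfg all pick up that name. Typical usage (the project name is arbitrary):

```sh
./startproject.sh mysite    # copies template/ to mysite/ and renames everything inside
cd mysite
# edit mysite/spiders/spider.py (start_urls, rules, css rules), then:
scrapy crawl mysite
```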
27 | -------------------------------------------------------------------------------- /template/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # http://doc.scrapy.org/en/latest/topics/scrapyd.html 5 | 6 | [settings] 7 | default = template.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = template 12 | -------------------------------------------------------------------------------- /template/template/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/geekan/scrapy-examples/edb1cb116bd6def65a6ef01f953b58eb43e54305/template/template/__init__.py -------------------------------------------------------------------------------- /template/template/items.py: -------------------------------------------------------------------------------- 1 | # Define here the models for your scraped items 2 | # 3 | # See documentation in: 4 | # http://doc.scrapy.org/en/latest/topics/items.html 5 | 6 | from scrapy.item import Item, Field 7 | 8 | class templateItem(Item): 9 | # define the fields for your item here like: 10 | name = Field() 11 | 12 | -------------------------------------------------------------------------------- /template/template/pipelines.py: -------------------------------------------------------------------------------- 1 | # Define your item pipelines here 2 | # 3 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 4 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html 5 | 6 | import redis 7 | 8 | 9 | from scrapy import signals 10 | 11 | 12 | import json 13 | import codecs 14 | from collections import OrderedDict 15 | 16 | 17 | class JsonWithEncodingPipeline(object): 18 | 19 | def __init__(self): 20 | self.file = codecs.open('data_utf8.json', 'w', encoding='utf-8') 21 | 22 | def process_item(self, item, spider): 23 | line = json.dumps(OrderedDict(item), ensure_ascii=False, sort_keys=False) + "\n" 24 | self.file.write(line) 25 | return item 26 | 27 | def close_spider(self, spider): 28 | self.file.close() 29 | 30 | 31 | class RedisPipeline(object): 32 | 33 | def __init__(self): 34 | self.r = redis.StrictRedis(host='localhost', port=6379) 35 | 36 | def process_item(self, item, spider): 37 | if not item['id']: 38 | print 'no id item!!' 39 | 40 | str_recorded_item = self.r.get(item['id']) 41 | final_item = None 42 | if str_recorded_item is None: 43 | final_item = item 44 | else: 45 | ritem = eval(self.r.get(item['id'])) 46 | final_item = dict(item.items() + ritem.items()) 47 | self.r.set(item['id'], final_item) 48 | 49 | def close_spider(self, spider): 50 | return 51 | -------------------------------------------------------------------------------- /template/template/settings.py: -------------------------------------------------------------------------------- 1 | # Scrapy settings for template project 2 | # 3 | # For simplicity, this file contains only the most important settings by 4 | # default. 
All the other settings are documented here: 5 | # 6 | # http://doc.scrapy.org/en/latest/topics/settings.html 7 | # 8 | 9 | import sys 10 | import os 11 | from os.path import dirname 12 | path = dirname(dirname(os.path.abspath(os.path.dirname(__file__)))) 13 | sys.path.append(path) 14 | from misc.log import * 15 | 16 | BOT_NAME = 'template' 17 | 18 | SPIDER_MODULES = ['template.spiders'] 19 | NEWSPIDER_MODULE = 'template.spiders' 20 | 21 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 22 | #USER_AGENT = 'template (+http://www.yourdomain.com)' 23 | 24 | DOWNLOADER_MIDDLEWARES = { 25 | # 'misc.middleware.CustomHttpProxyMiddleware': 400, 26 | 'misc.middleware.CustomUserAgentMiddleware': 401, 27 | } 28 | 29 | ITEM_PIPELINES = { 30 | 'template.pipelines.JsonWithEncodingPipeline': 300, 31 | #'template.pipelines.RedisPipeline': 301, 32 | } 33 | 34 | LOG_LEVEL = 'INFO' 35 | 36 | DOWNLOAD_DELAY = 1 37 | -------------------------------------------------------------------------------- /template/template/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /template/template/spiders/spider.py: -------------------------------------------------------------------------------- 1 | import re 2 | import json 3 | from urlparse import urlparse 4 | import urllib 5 | import pdb 6 | 7 | 8 | from scrapy.selector import Selector 9 | try: 10 | from scrapy.spiders import Spider 11 | except: 12 | from scrapy.spiders import BaseSpider as Spider 13 | from scrapy.utils.response import get_base_url 14 | from scrapy.spiders import CrawlSpider, Rule 15 | from scrapy.linkextractors import LinkExtractor as sle 16 | 17 | 18 | from template.items import * 19 | from misc.log import * 20 | from misc.spider import CommonSpider 21 | 22 | 23 | class templateSpider(CommonSpider): 24 | name = "template" 25 | allowed_domains = ["template.com"] 26 | start_urls = [ 27 | "http://www.template.com/", 28 | ] 29 | rules = [ 30 | Rule(sle(allow=("/topsites/category;?[0-9]*/Top/World/Chinese_Simplified_CN/.*$")), callback='parse_1', follow=True), 31 | ] 32 | 33 | list_css_rules = { 34 | '.linkto': { 35 | 'url': 'a::attr(href)', 36 | 'name': 'a::text', 37 | } 38 | } 39 | 40 | content_css_rules = { 41 | 'text': '#Cnt-Main-Article-QQ p *::text', 42 | 'images': '#Cnt-Main-Article-QQ img::attr(src)', 43 | 'images-desc': '#Cnt-Main-Article-QQ div p+ p::text', 44 | } 45 | 46 | def parse_1(self, response): 47 | info('Parse '+response.url) 48 | # x = self.parse_with_rules(response, self.list_css_rules, dict) 49 | # x = self.parse_with_rules(response, self.content_css_rules, dict) 50 | # print(json.dumps(x, ensure_ascii=False, indent=2)) 51 | # pp.pprint(x) 52 | # return self.parse_with_rules(response, self.css_rules, templateItem) 53 | -------------------------------------------------------------------------------- /tutorial/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # http://doc.scrapy.org/en/latest/topics/scrapyd.html 5 | 6 | [settings] 7 | default = tutorial.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = tutorial 
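Note: the template spider a few files above ships with its extraction calls commented out. Filling it in for a real site usually means changing only the domains, start URLs, crawl rules and the CSS rule dict, then returning whatever `parse_with_rules` (from misc/spider.py) produces. A sketch of a filled-in copy; example.com and its selectors are invented for illustration:

```python
class mysiteSpider(CommonSpider):
    name = "mysite"                                  # hypothetical project name
    allowed_domains = ["example.com"]                # invented for illustration
    start_urls = ["http://www.example.com/list"]
    rules = [
        Rule(sle(allow=("/list")), callback='parse_1', follow=True),
    ]

    list_css_rules = {
        '.entry': {                                  # selectors are invented
            'url': 'a::attr(href)',
            'name': 'a::text',
        }
    }

    def parse_1(self, response):
        info('Parse ' + response.url)
        return self.parse_with_rules(response, self.list_css_rules, dict)
```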
12 | -------------------------------------------------------------------------------- /tutorial/tutorial/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/geekan/scrapy-examples/edb1cb116bd6def65a6ef01f953b58eb43e54305/tutorial/tutorial/__init__.py -------------------------------------------------------------------------------- /tutorial/tutorial/items.py: -------------------------------------------------------------------------------- 1 | # Define here the models for your scraped items 2 | # 3 | # See documentation in: 4 | # http://doc.scrapy.org/en/latest/topics/items.html 5 | 6 | from scrapy.item import Item, Field 7 | 8 | class TutorialItem(Item): 9 | # define the fields for your item here like: 10 | # name = Field() 11 | title = Field() 12 | link = Field() 13 | desc = Field() 14 | num = Field() 15 | -------------------------------------------------------------------------------- /tutorial/tutorial/misc/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/geekan/scrapy-examples/edb1cb116bd6def65a6ef01f953b58eb43e54305/tutorial/tutorial/misc/__init__.py -------------------------------------------------------------------------------- /tutorial/tutorial/misc/log.py: -------------------------------------------------------------------------------- 1 | 2 | from scrapy import log 3 | 4 | def warn(msg): 5 | log.msg(str(msg), level=log.WARNING) 6 | 7 | 8 | def info(msg): 9 | log.msg(str(msg), level=log.INFO) 10 | 11 | 12 | def debug(msg): 13 | log.msg(str(msg), level=log.DEBUG) 14 | 15 | -------------------------------------------------------------------------------- /tutorial/tutorial/misc/middleware.py: -------------------------------------------------------------------------------- 1 | from scrapy import log 2 | from proxy import PROXIES 3 | from agents import AGENTS 4 | 5 | import random 6 | 7 | 8 | class CustomHttpProxyMiddleware(object): 9 | 10 | def process_request(self, request, spider): 11 | # TODO implement complex proxy providing algorithm 12 | if self.use_proxy(request): 13 | p = random.choice(PROXIES) 14 | try: 15 | request.meta['proxy'] = "http://%s" % p['ip_port'] 16 | except Exception, e: 17 | log.msg("Exception %s" % e, _level=log.CRITICAL) 18 | 19 | def use_proxy(self, request): 20 | """ 21 | using direct download for depth <= 2 22 | using proxy with probability 0.3 23 | """ 24 | if "depth" in request.meta and int(request.meta['depth']) <= 2: 25 | return False 26 | i = random.randint(1, 10) 27 | return i <= 2 28 | 29 | 30 | class CustomUserAgentMiddleware(object): 31 | def process_request(self, request, spider): 32 | agent = random.choice(AGENTS) 33 | request.headers['User-Agent'] = agent 34 | -------------------------------------------------------------------------------- /tutorial/tutorial/pipelines.py: -------------------------------------------------------------------------------- 1 | # Define your item pipelines here 2 | # 3 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 4 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html 5 | 6 | from scrapy import signals 7 | from scrapy.contrib.exporter import XmlItemExporter 8 | 9 | 10 | class TutorialPipeline(object): 11 | def process_item(self, item, spider): 12 | for field in item: 13 | print field + ': ' + item[field][0] 14 | return item 15 | 16 | 17 | class XmlExportPipeline(object): 18 | 19 | def __init__(self): 20 | self.files = {} 21 | 22 | 
@classmethod 23 | def from_crawler(cls, crawler): 24 | pipeline = cls() 25 | crawler.signals.connect(pipeline.spider_opened, signals.spider_opened) 26 | crawler.signals.connect(pipeline.close_spider, signals.close_spider) 27 | return pipeline 28 | 29 | def spider_opened(self, spider): 30 | file = open('%s_products.xml' % spider.name, 'w+b') 31 | self.files[spider] = file 32 | self.exporter = XmlItemExporter(file) 33 | self.exporter.start_exporting() 34 | 35 | def close_spider(self, spider): 36 | self.exporter.finish_exporting() 37 | file = self.files.pop(spider) 38 | file.close() 39 | 40 | def process_item(self, item, spider): 41 | self.exporter.export_item(item) 42 | return item 43 | 44 | 45 | import json 46 | import codecs 47 | 48 | class JsonWithEncodingPipeline(object): 49 | 50 | def __init__(self): 51 | self.file = codecs.open('data_utf8.json', 'w', encoding='utf-8') 52 | 53 | def process_item(self, item, spider): 54 | line = json.dumps(dict(item), ensure_ascii=False) + "\n" 55 | self.file.write(line) 56 | return item 57 | 58 | def close_spider(self, spider): 59 | self.file.close() 60 | -------------------------------------------------------------------------------- /tutorial/tutorial/settings.py: -------------------------------------------------------------------------------- 1 | # Scrapy settings for tutorial project 2 | # 3 | # For simplicity, this file contains only the most important settings by 4 | # default. All the other settings are documented here: 5 | # 6 | # http://doc.scrapy.org/en/latest/topics/settings.html 7 | # 8 | 9 | BOT_NAME = 'tutorial' 10 | 11 | SPIDER_MODULES = ['tutorial.spiders'] 12 | NEWSPIDER_MODULE = 'tutorial.spiders' 13 | ITEM_PIPELINES = { 14 | #'tutorial.pipelines.JsonWithEncodingPipeline': 300, 15 | } 16 | #Crawl responsibly by identifying yourself (and your website) on the user-agent 17 | #USER_AGENT = 'tutorial (+http://www.yourdomain.com)' 18 | 19 | DOWNLOADER_MIDDLEWARES = { 20 | #'tutorial.misc.middleware.CustomHttpProxyMiddleware': 400, 21 | 'tutorial.misc.middleware.CustomUserAgentMiddleware': 401, 22 | } 23 | 24 | LOG_LEVEL = 'INFO' 25 | -------------------------------------------------------------------------------- /tutorial/tutorial/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /underdev/README: -------------------------------------------------------------------------------- 1 | Some crawlers are under dev because related pages should be rendered first. 2 | Consider to use selenium or similar tool. 
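Note: as the underdev README above suggests, pages that are rendered client-side need a browser step before Scrapy's selectors see any content. One common pattern is a downloader middleware that fetches the page with Selenium and hands Scrapy the rendered HTML; a rough sketch, where Selenium and a local chromedriver are assumptions and not part of this repo:

```python
from scrapy.http import HtmlResponse
from selenium import webdriver


class SeleniumMiddleware(object):
    """Render JS-heavy pages in a real browser before parsing."""

    def __init__(self):
        self.driver = webdriver.Chrome()   # assumes chromedriver on PATH

    def process_request(self, request, spider):
        self.driver.get(request.url)
        # Returning a Response here short-circuits Scrapy's own download.
        return HtmlResponse(url=self.driver.current_url,
                            body=self.driver.page_source,
                            encoding='utf-8',
                            request=request)
```

It would be enabled through DOWNLOADER_MIDDLEWARES, the same way the misc.middleware entries are registered in the settings files above.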
3 | -------------------------------------------------------------------------------- /underdev/meijutt/meijutt/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/geekan/scrapy-examples/edb1cb116bd6def65a6ef01f953b58eb43e54305/underdev/meijutt/meijutt/__init__.py -------------------------------------------------------------------------------- /underdev/meijutt/meijutt/items.py: -------------------------------------------------------------------------------- 1 | # Define here the models for your scraped items 2 | # 3 | # See documentation in: 4 | # http://doc.scrapy.org/en/latest/topics/items.html 5 | 6 | from scrapy.item import Item, Field 7 | 8 | class meijuttItem(Item): 9 | # define the fields for your item here like: 10 | name = Field() 11 | 12 | -------------------------------------------------------------------------------- /underdev/meijutt/meijutt/pipelines.py: -------------------------------------------------------------------------------- 1 | # Define your item pipelines here 2 | # 3 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 4 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html 5 | 6 | import redis 7 | 8 | 9 | from scrapy import signals 10 | 11 | 12 | import json 13 | import codecs 14 | from collections import OrderedDict 15 | 16 | 17 | class JsonWithEncodingPipeline(object): 18 | 19 | def __init__(self): 20 | self.file = codecs.open('data_utf8.json', 'w', encoding='utf-8') 21 | 22 | def process_item(self, item, spider): 23 | line = json.dumps(OrderedDict(item), ensure_ascii=False, sort_keys=False) + "\n" 24 | self.file.write(line) 25 | return item 26 | 27 | def close_spider(self, spider): 28 | self.file.close() 29 | 30 | 31 | class RedisPipeline(object): 32 | 33 | def __init__(self): 34 | self.r = redis.StrictRedis(host='localhost', port=6379) 35 | 36 | def process_item(self, item, spider): 37 | if not item['id']: 38 | print 'no id item!!' 39 | 40 | str_recorded_item = self.r.get(item['id']) 41 | final_item = None 42 | if str_recorded_item is None: 43 | final_item = item 44 | else: 45 | ritem = eval(self.r.get(item['id'])) 46 | final_item = dict(item.items() + ritem.items()) 47 | self.r.set(item['id'], final_item) 48 | 49 | def close_spider(self, spider): 50 | return 51 | -------------------------------------------------------------------------------- /underdev/meijutt/meijutt/settings.py: -------------------------------------------------------------------------------- 1 | # Scrapy settings for meijutt project 2 | # 3 | # For simplicity, this file contains only the most important settings by 4 | # default. 
All the other settings are documented here: 5 | # 6 | # http://doc.scrapy.org/en/latest/topics/settings.html 7 | # 8 | 9 | import sys 10 | import os 11 | from os.path import dirname 12 | path = dirname(dirname(os.path.abspath(os.path.dirname(__file__)))) 13 | sys.path.append(path) 14 | from misc.log import * 15 | 16 | BOT_NAME = 'meijutt' 17 | 18 | SPIDER_MODULES = ['meijutt.spiders'] 19 | NEWSPIDER_MODULE = 'meijutt.spiders' 20 | 21 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 22 | #USER_AGENT = 'meijutt (+http://www.yourdomain.com)' 23 | 24 | DOWNLOADER_MIDDLEWARES = { 25 | # 'misc.middleware.CustomHttpProxyMiddleware': 400, 26 | 'misc.middleware.CustomUserAgentMiddleware': 401, 27 | } 28 | 29 | ITEM_PIPELINES = { 30 | 'meijutt.pipelines.JsonWithEncodingPipeline': 300, 31 | #'meijutt.pipelines.RedisPipeline': 301, 32 | } 33 | 34 | LOG_LEVEL = 'INFO' 35 | 36 | DOWNLOAD_DELAY = 1 37 | -------------------------------------------------------------------------------- /underdev/meijutt/meijutt/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /underdev/meijutt/meijutt/spiders/spider.py: -------------------------------------------------------------------------------- 1 | import re 2 | import json 3 | from urlparse import urlparse 4 | import urllib 5 | import pdb 6 | 7 | 8 | from scrapy.selector import Selector 9 | try: 10 | from scrapy.spiders import Spider 11 | except: 12 | from scrapy.spiders import BaseSpider as Spider 13 | from scrapy.utils.response import get_base_url 14 | from scrapy.spiders import CrawlSpider, Rule 15 | from scrapy.linkextractors import LinkExtractor as sle 16 | 17 | 18 | from meijutt.items import * 19 | from misc.log import * 20 | from misc.spider import CommonSpider 21 | 22 | 23 | class meijuttSpider(CommonSpider): 24 | name = "meijutt" 25 | allowed_domains = ["meijutt.com"] 26 | start_urls = [ 27 | "http://www.meijutt.com/content/meiju117.html", # 3 28 | "http://www.meijutt.com/content/meiju116.html", # 4 29 | ] 30 | rules = [ 31 | Rule(sle(allow=(".*meiju11[67]\.html$")), callback='parse_1', follow=False), 32 | ] 33 | 34 | content_css_rules = { 35 | '.downurl .adds': { 36 | 'links': 'input::attr(value)' 37 | } 38 | } 39 | 40 | def parse_1(self, response): 41 | info('Parse '+response.url) 42 | x = self.parse_with_rules(response, self.content_css_rules, dict) 43 | pp.pprint(x) 44 | # return self.parse_with_rules(response, self.css_rules, meijuttItem) 45 | -------------------------------------------------------------------------------- /underdev/meijutt/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # http://doc.scrapy.org/en/latest/topics/scrapyd.html 5 | 6 | [settings] 7 | default = meijutt.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = meijutt 12 | -------------------------------------------------------------------------------- /underdev/twitch/README: -------------------------------------------------------------------------------- 1 | AJAX part. Need render engine. 
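An alternative to a render engine is to hit the AJAX endpoint itself; the kraken URL noted just below returns JSON. A hedged sketch of a spider consuming such an endpoint (the endpoint is taken from the note below; any required auth headers and the response structure are not assumed here):

# Hedged sketch: query the JSON endpoint noted below directly instead of
# rendering the HTML page. Endpoint and auth requirements are assumptions;
# the callback only inspects whatever JSON comes back.
import json
import scrapy

class TwitchTopVideosJsonSpider(scrapy.Spider):
    name = "twitch_top_videos_json"
    start_urls = [
        "https://api.twitch.tv/kraken/videos/top?limit=20&offset=0&period=week",
    ]

    def parse(self, response):
        data = json.loads(response.body)
        # Response structure is not assumed; just report the top-level keys
        self.logger.info("top-level keys: %s", list(data.keys()))
        yield {"url": response.url, "keys": list(data.keys())}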
2 | 3 | Data format may be JSON: 4 | - https://api.twitch.tv/kraken/videos/top?limit=20&offset=0&period=week&broadcast_type=all&on_site=1 5 | -------------------------------------------------------------------------------- /underdev/twitch/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # http://doc.scrapy.org/en/latest/topics/scrapyd.html 5 | 6 | [settings] 7 | default = twitch.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = twitch 12 | -------------------------------------------------------------------------------- /underdev/twitch/twitch/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/geekan/scrapy-examples/edb1cb116bd6def65a6ef01f953b58eb43e54305/underdev/twitch/twitch/__init__.py -------------------------------------------------------------------------------- /underdev/twitch/twitch/items.py: -------------------------------------------------------------------------------- 1 | # Define here the models for your scraped items 2 | # 3 | # See documentation in: 4 | # http://doc.scrapy.org/en/latest/topics/items.html 5 | 6 | from scrapy.item import Item, Field 7 | 8 | class twitchItem(Item): 9 | # define the fields for your item here like: 10 | name = Field() 11 | 12 | -------------------------------------------------------------------------------- /underdev/twitch/twitch/pipelines.py: -------------------------------------------------------------------------------- 1 | # Define your item pipelines here 2 | # 3 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 4 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html 5 | 6 | import redis 7 | 8 | 9 | from scrapy import signals 10 | 11 | 12 | import json 13 | import codecs 14 | from collections import OrderedDict 15 | 16 | 17 | class JsonWithEncodingPipeline(object): 18 | 19 | def __init__(self): 20 | self.file = codecs.open('data_utf8.json', 'w', encoding='utf-8') 21 | 22 | def process_item(self, item, spider): 23 | line = json.dumps(OrderedDict(item), ensure_ascii=False, sort_keys=False) + "\n" 24 | self.file.write(line) 25 | return item 26 | 27 | def close_spider(self, spider): 28 | self.file.close() 29 | 30 | 31 | class RedisPipeline(object): 32 | 33 | def __init__(self): 34 | self.r = redis.StrictRedis(host='localhost', port=6379) 35 | 36 | def process_item(self, item, spider): 37 | if not item['id']: 38 | print 'no id item!!' 39 | 40 | str_recorded_item = self.r.get(item['id']) 41 | final_item = None 42 | if str_recorded_item is None: 43 | final_item = item 44 | else: 45 | ritem = eval(self.r.get(item['id'])) 46 | final_item = dict(item.items() + ritem.items()) 47 | self.r.set(item['id'], final_item) 48 | 49 | def close_spider(self, spider): 50 | return 51 | -------------------------------------------------------------------------------- /underdev/twitch/twitch/settings.py: -------------------------------------------------------------------------------- 1 | # Scrapy settings for twitch project 2 | # 3 | # For simplicity, this file contains only the most important settings by 4 | # default. 
All the other settings are documented here: 5 | # 6 | # http://doc.scrapy.org/en/latest/topics/settings.html 7 | # 8 | 9 | import sys 10 | import os 11 | from os.path import dirname 12 | path = dirname(dirname(os.path.abspath(os.path.dirname(__file__)))) 13 | sys.path.append(path) 14 | from misc.log import * 15 | 16 | BOT_NAME = 'twitch' 17 | 18 | SPIDER_MODULES = ['twitch.spiders'] 19 | NEWSPIDER_MODULE = 'twitch.spiders' 20 | 21 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 22 | #USER_AGENT = 'twitch (+http://www.yourdomain.com)' 23 | 24 | DOWNLOADER_MIDDLEWARES = { 25 | # 'misc.middleware.CustomHttpProxyMiddleware': 400, 26 | 'misc.middleware.CustomUserAgentMiddleware': 401, 27 | } 28 | 29 | ITEM_PIPELINES = { 30 | 'twitch.pipelines.JsonWithEncodingPipeline': 300, 31 | #'twitch.pipelines.RedisPipeline': 301, 32 | } 33 | 34 | LOG_LEVEL = 'INFO' 35 | 36 | DOWNLOAD_DELAY = 1 37 | -------------------------------------------------------------------------------- /underdev/twitch/twitch/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /underdev/twitch/twitch/spiders/spider.py: -------------------------------------------------------------------------------- 1 | import re 2 | import json 3 | from urlparse import urlparse 4 | import urllib 5 | import pdb 6 | 7 | 8 | from scrapy.selector import Selector 9 | try: 10 | from scrapy.spiders import Spider 11 | except: 12 | from scrapy.spiders import BaseSpider as Spider 13 | from scrapy.utils.response import get_base_url 14 | from scrapy.spiders import CrawlSpider, Rule 15 | from scrapy.linkextractors import LinkExtractor as sle 16 | 17 | 18 | from twitch.items import * 19 | from misc.log import * 20 | from misc.spider import CommonSpider 21 | 22 | 23 | class twitchSpider(CommonSpider): 24 | name = "twitch" 25 | allowed_domains = ["twitch.tv"] 26 | start_urls = [ 27 | "https://www.twitch.tv/directory/game/Hearthstone%3A%20Heroes%20of%20Warcraft" 28 | ] 29 | rules = [ 30 | Rule(sle(allow=("https://www.twitch.tv/directory/game/Hearthstone%3A%20Heroes%20of%20Warcraft")), callback='parse_1', follow=True), 31 | ] 32 | 33 | list_css_rules = { 34 | '.content': { 35 | 'room_name': '.meta .title a::text', 36 | 'author': '.meta .info a::text', 37 | 'people_count': '.meta .info a::attr(data-ember-action)' 38 | } 39 | } 40 | 41 | content_css_rules = { 42 | 'text': '#Cnt-Main-Article-QQ p *::text', 43 | 'images': '#Cnt-Main-Article-QQ img::attr(src)', 44 | 'images-desc': '#Cnt-Main-Article-QQ div p+ p::text', 45 | } 46 | 47 | def parse_1(self, response): 48 | info('Parse '+response.url) 49 | x = self.parse_with_rules(response, self.list_css_rules, dict) 50 | # x = self.parse_with_rules(response, self.content_css_rules, dict) 51 | import pdb; pdb.set_trace() 52 | print(json.dumps(x, ensure_ascii=False, indent=2)) 53 | # pp.pprint(x) 54 | # return self.parse_with_rules(response, self.css_rules, twitchItem) 55 | -------------------------------------------------------------------------------- /v2ex/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # 
http://doc.scrapy.org/en/latest/topics/scrapyd.html 5 | 6 | [settings] 7 | default = v2ex.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = v2ex 12 | -------------------------------------------------------------------------------- /v2ex/v2ex/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/geekan/scrapy-examples/edb1cb116bd6def65a6ef01f953b58eb43e54305/v2ex/v2ex/__init__.py -------------------------------------------------------------------------------- /v2ex/v2ex/items.py: -------------------------------------------------------------------------------- 1 | # Define here the models for your scraped items 2 | # 3 | # See documentation in: 4 | # http://doc.scrapy.org/en/latest/topics/items.html 5 | 6 | from scrapy.item import Item, Field 7 | 8 | class v2exItem(Item): 9 | # define the fields for your item here like: 10 | name = Field() 11 | 12 | -------------------------------------------------------------------------------- /v2ex/v2ex/pipelines.py: -------------------------------------------------------------------------------- 1 | # Define your item pipelines here 2 | # 3 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 4 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html 5 | 6 | import redis 7 | 8 | 9 | from scrapy import signals 10 | 11 | 12 | import json 13 | import codecs 14 | from collections import OrderedDict 15 | 16 | 17 | class JsonWithEncodingPipeline(object): 18 | 19 | def __init__(self): 20 | self.file = codecs.open('data_utf8.json', 'w', encoding='utf-8') 21 | 22 | def process_item(self, item, spider): 23 | line = json.dumps(OrderedDict(item), ensure_ascii=False, sort_keys=False) + "\n" 24 | self.file.write(line) 25 | return item 26 | 27 | def close_spider(self, spider): 28 | self.file.close() 29 | 30 | 31 | class RedisPipeline(object): 32 | 33 | def __init__(self): 34 | self.r = redis.StrictRedis(host='localhost', port=6379) 35 | 36 | def process_item(self, item, spider): 37 | if not item['id']: 38 | print 'no id item!!' 39 | 40 | str_recorded_item = self.r.get(item['id']) 41 | final_item = None 42 | if str_recorded_item is None: 43 | final_item = item 44 | else: 45 | ritem = eval(self.r.get(item['id'])) 46 | final_item = dict(item.items() + ritem.items()) 47 | self.r.set(item['id'], final_item) 48 | 49 | def close_spider(self, spider): 50 | return 51 | -------------------------------------------------------------------------------- /v2ex/v2ex/settings.py: -------------------------------------------------------------------------------- 1 | # Scrapy settings for v2ex project 2 | # 3 | # For simplicity, this file contains only the most important settings by 4 | # default. 
All the other settings are documented here: 5 | # 6 | # http://doc.scrapy.org/en/latest/topics/settings.html 7 | # 8 | 9 | import sys 10 | import os 11 | from os.path import dirname 12 | path = dirname(dirname(os.path.abspath(os.path.dirname(__file__)))) 13 | sys.path.append(path) 14 | from misc.log import * 15 | 16 | BOT_NAME = 'v2ex' 17 | 18 | SPIDER_MODULES = ['v2ex.spiders'] 19 | NEWSPIDER_MODULE = 'v2ex.spiders' 20 | 21 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 22 | #USER_AGENT = 'v2ex (+http://www.yourdomain.com)' 23 | 24 | DOWNLOADER_MIDDLEWARES = { 25 | # 'misc.middleware.CustomHttpProxyMiddleware': 400, 26 | 'misc.middleware.CustomUserAgentMiddleware': 401, 27 | } 28 | 29 | ITEM_PIPELINES = { 30 | 'v2ex.pipelines.JsonWithEncodingPipeline': 300, 31 | #'v2ex.pipelines.RedisPipeline': 301, 32 | } 33 | 34 | LOG_LEVEL = 'INFO' 35 | 36 | DOWNLOAD_DELAY = 1 37 | -------------------------------------------------------------------------------- /v2ex/v2ex/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /v2ex/v2ex/spiders/spider.py: -------------------------------------------------------------------------------- 1 | import re 2 | import json 3 | from urlparse import urlparse 4 | import urllib 5 | import pdb 6 | 7 | 8 | from scrapy.selector import Selector 9 | try: 10 | from scrapy.spiders import Spider 11 | except: 12 | from scrapy.spiders import BaseSpider as Spider 13 | from scrapy.utils.response import get_base_url 14 | from scrapy.spiders import CrawlSpider, Rule 15 | from scrapy.linkextractors import LinkExtractor as sle 16 | 17 | 18 | from v2ex.items import * 19 | from misc.log import * 20 | from misc.spider import CommonSpider 21 | 22 | 23 | class v2exSpider(CommonSpider): 24 | name = "v2ex" 25 | allowed_domains = ["v2ex.com"] 26 | start_urls = [ 27 | "http://www.v2ex.com/", 28 | ] 29 | rules = [ 30 | Rule(sle(allow=("http://www.v2ex.com/$")), callback='parse_1', follow=True), 31 | ] 32 | 33 | list_css_rules = { 34 | '.cell.item': { 35 | 'title': '.item_title a::text', 36 | 'node': '.node::text', 37 | 'author': '.node+ strong a::text', 38 | 'reply_count': '.count_livid::text' 39 | } 40 | } 41 | 42 | def parse_1(self, response): 43 | info('Parse '+response.url) 44 | # import pdb; pdb.set_trace() 45 | x = self.parse_with_rules(response, self.list_css_rules, dict) 46 | print(json.dumps(x, ensure_ascii=False, indent=2)) 47 | #pp.pprint(x) 48 | # return self.parse_with_rules(response, self.css_rules, v2exItem) 49 | -------------------------------------------------------------------------------- /youtube_trending/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # http://doc.scrapy.org/en/latest/topics/scrapyd.html 5 | 6 | [settings] 7 | default = youtube_trending.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = youtube_trending 12 | -------------------------------------------------------------------------------- /youtube_trending/youtube_trending/__init__.py: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/geekan/scrapy-examples/edb1cb116bd6def65a6ef01f953b58eb43e54305/youtube_trending/youtube_trending/__init__.py -------------------------------------------------------------------------------- /youtube_trending/youtube_trending/items.py: -------------------------------------------------------------------------------- 1 | # Define here the models for your scraped items 2 | # 3 | # See documentation in: 4 | # http://doc.scrapy.org/en/latest/topics/items.html 5 | 6 | from scrapy.item import Item, Field 7 | 8 | class youtube_trendingItem(Item): 9 | # define the fields for your item here like: 10 | name = Field() 11 | 12 | -------------------------------------------------------------------------------- /youtube_trending/youtube_trending/pipelines.py: -------------------------------------------------------------------------------- 1 | # Define your item pipelines here 2 | # 3 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 4 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html 5 | 6 | import redis 7 | 8 | 9 | from scrapy import signals 10 | 11 | 12 | import json 13 | import codecs 14 | from collections import OrderedDict 15 | 16 | 17 | class JsonWithEncodingPipeline(object): 18 | 19 | def __init__(self): 20 | self.file = codecs.open('data_utf8.json', 'w', encoding='utf-8') 21 | 22 | def process_item(self, item, spider): 23 | line = json.dumps(OrderedDict(item), ensure_ascii=False, sort_keys=False) + "\n" 24 | self.file.write(line) 25 | return item 26 | 27 | def close_spider(self, spider): 28 | self.file.close() 29 | 30 | 31 | class RedisPipeline(object): 32 | 33 | def __init__(self): 34 | self.r = redis.StrictRedis(host='localhost', port=6379) 35 | 36 | def process_item(self, item, spider): 37 | if not item['id']: 38 | print 'no id item!!' 39 | 40 | str_recorded_item = self.r.get(item['id']) 41 | final_item = None 42 | if str_recorded_item is None: 43 | final_item = item 44 | else: 45 | ritem = eval(self.r.get(item['id'])) 46 | final_item = dict(item.items() + ritem.items()) 47 | self.r.set(item['id'], final_item) 48 | 49 | def close_spider(self, spider): 50 | return 51 | -------------------------------------------------------------------------------- /youtube_trending/youtube_trending/settings.py: -------------------------------------------------------------------------------- 1 | # Scrapy settings for youtube_trending project 2 | # 3 | # For simplicity, this file contains only the most important settings by 4 | # default. 
All the other settings are documented here: 5 | # 6 | # http://doc.scrapy.org/en/latest/topics/settings.html 7 | # 8 | 9 | import sys 10 | import os 11 | from os.path import dirname 12 | path = dirname(dirname(os.path.abspath(os.path.dirname(__file__)))) 13 | sys.path.append(path) 14 | from misc.log import * 15 | 16 | BOT_NAME = 'youtube_trending' 17 | 18 | SPIDER_MODULES = ['youtube_trending.spiders'] 19 | NEWSPIDER_MODULE = 'youtube_trending.spiders' 20 | 21 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 22 | #USER_AGENT = 'youtube_trending (+http://www.yourdomain.com)' 23 | 24 | DOWNLOADER_MIDDLEWARES = { 25 | # 'misc.middleware.CustomHttpProxyMiddleware': 400, 26 | 'misc.middleware.CustomUserAgentMiddleware': 401, 27 | } 28 | 29 | ITEM_PIPELINES = { 30 | 'youtube_trending.pipelines.JsonWithEncodingPipeline': 300, 31 | #'youtube_trending.pipelines.RedisPipeline': 301, 32 | } 33 | 34 | LOG_LEVEL = 'INFO' 35 | 36 | DOWNLOAD_DELAY = 1 37 | -------------------------------------------------------------------------------- /youtube_trending/youtube_trending/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /youtube_trending/youtube_trending/spiders/spider.py: -------------------------------------------------------------------------------- 1 | import re 2 | import json 3 | from urlparse import urlparse 4 | import urllib 5 | import pdb 6 | 7 | 8 | from scrapy.selector import Selector 9 | try: 10 | from scrapy.spiders import Spider 11 | except: 12 | from scrapy.spiders import BaseSpider as Spider 13 | from scrapy.utils.response import get_base_url 14 | from scrapy.spiders import CrawlSpider, Rule 15 | from scrapy.linkextractors import LinkExtractor as sle 16 | 17 | 18 | from youtube_trending.items import * 19 | from misc.log import * 20 | from misc.spider import CommonSpider 21 | 22 | 23 | class youtube_trendingSpider(CommonSpider): 24 | name = "youtube_trending" 25 | allowed_domains = ["youtube.com"] 26 | start_urls = [ 27 | "https://www.youtube.com/feed/trending", 28 | ] 29 | rules = [ 30 | Rule(sle(allow=("feed/trending$")), callback='parse_1', follow=True), 31 | ] 32 | 33 | list_css_rules = { 34 | '.yt-lockup-content': { 35 | 'video_title': '.yt-lockup-title a::text', 36 | 'author': '.yt-lockup-byline a::text', 37 | } 38 | } 39 | 40 | content_css_rules = { 41 | 'text': '#Cnt-Main-Article-QQ p *::text', 42 | 'images': '#Cnt-Main-Article-QQ img::attr(src)', 43 | 'images-desc': '#Cnt-Main-Article-QQ div p+ p::text', 44 | } 45 | 46 | def parse_1(self, response): 47 | info('Parse '+response.url) 48 | x = self.parse_with_rules(response, self.list_css_rules, dict) 49 | # x = self.parse_with_rules(response, self.content_css_rules, dict) 50 | print(json.dumps(x, ensure_ascii=False, indent=2)) 51 | # pp.pprint(x) 52 | # return self.parse_with_rules(response, self.css_rules, youtube_trendingItem) 53 | -------------------------------------------------------------------------------- /zhibo8/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -x 4 | 5 | scrapy crawl zhibo8_schedule 6 | -------------------------------------------------------------------------------- /zhibo8/scrapy.cfg: 
-------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html 5 | 6 | [settings] 7 | default = zhibo8.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = zhibo8 12 | -------------------------------------------------------------------------------- /zhibo8/zhibo8/README.md: -------------------------------------------------------------------------------- 1 | Crawls Hupu news and zhibo8 match schedules 2 | Based on scrapy [http://scrapy.org/] 3 | -------------------------------------------------------------------------------- /zhibo8/zhibo8/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/geekan/scrapy-examples/edb1cb116bd6def65a6ef01f953b58eb43e54305/zhibo8/zhibo8/__init__.py -------------------------------------------------------------------------------- /zhibo8/zhibo8/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | 10 | 11 | class SportItem(scrapy.Item): 12 | # define the fields for your item here like: 13 | #title = scrapy.Field() 14 | #url = scrapy.Field() 15 | start_time = scrapy.Field() 16 | home_team = scrapy.Field() 17 | guest_team = scrapy.Field() 18 | match_date = scrapy.Field() 19 | game_type = scrapy.Field() 20 | home_logo = scrapy.Field() 21 | guest_logo = scrapy.Field() 22 | -------------------------------------------------------------------------------- /zhibo8/zhibo8/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | 8 | import MySQLdb 9 | from utils.mysqldriver import MySQL 10 | 11 | class SportPipeline(object): 12 | _db = None 13 | 14 | def __init__(self): 15 | dbconfig = { 16 | 'host':'localhost', 17 | 'port': 3306, 18 | 'user':'root', 19 | 'passwd':'111111', 20 | 'db':'sport', 21 | 'charset':'utf8' 22 | } 23 | 24 | self._db = MySQL(dbconfig) 25 | 26 | def process_item(self, item, spider): 27 | insert_sql = "INSERT INTO sport_schedule(home_team,guest_team,home_logo,guest_logo,match_date,game_type) values \ 28 | ('%s','%s','%s','%s','%s','%s')" % (item['home_team'], item['guest_team'], item['home_logo'],\ 29 | item['guest_logo'], item['match_date'], item['game_type']) 30 | 31 | self._db.insert(insert_sql) 32 | return item 33 | -------------------------------------------------------------------------------- /zhibo8/zhibo8/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for zhibo8 project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used.
You can find more settings consulting the documentation: 7 | # 8 | # http://doc.scrapy.org/en/latest/topics/settings.html 9 | # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 10 | # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'zhibo8' 13 | 14 | SPIDER_MODULES = ['zhibo8.spiders'] 15 | NEWSPIDER_MODULE = 'zhibo8.spiders' 16 | 17 | 18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 19 | #USER_AGENT = 'zhibo8 (+http://www.yourdomain.com)' 20 | 21 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 22 | #CONCURRENT_REQUESTS=32 23 | 24 | # Configure a delay for requests for the same website (default: 0) 25 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay 26 | # See also autothrottle settings and docs 27 | #DOWNLOAD_DELAY=3 28 | # The download delay setting will honor only one of: 29 | #CONCURRENT_REQUESTS_PER_DOMAIN=16 30 | #CONCURRENT_REQUESTS_PER_IP=16 31 | 32 | # Disable cookies (enabled by default) 33 | #COOKIES_ENABLED=False 34 | 35 | # Disable Telnet Console (enabled by default) 36 | #TELNETCONSOLE_ENABLED=False 37 | 38 | # Override the default request headers: 39 | #DEFAULT_REQUEST_HEADERS = { 40 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 41 | # 'Accept-Language': 'en', 42 | #} 43 | 44 | # Enable or disable spider middlewares 45 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 46 | #SPIDER_MIDDLEWARES = { 47 | # 'zhibo8.middlewares.MyCustomSpiderMiddleware': 543, 48 | #} 49 | 50 | # Enable or disable downloader middlewares 51 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 52 | #DOWNLOADER_MIDDLEWARES = { 53 | # 'zhibo8.middlewares.MyCustomDownloaderMiddleware': 543, 54 | #} 55 | 56 | # Enable or disable extensions 57 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html 58 | #EXTENSIONS = { 59 | # 'scrapy.telnet.TelnetConsole': None, 60 | #} 61 | 62 | # Configure item pipelines 63 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html 64 | ITEM_PIPELINES = { 65 | 'zhibo8.pipelines.SportPipeline': 300, 66 | } 67 | 68 | # Enable and configure the AutoThrottle extension (disabled by default) 69 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html 70 | # NOTE: AutoThrottle will honour the standard settings for concurrency and delay 71 | #AUTOTHROTTLE_ENABLED=True 72 | # The initial download delay 73 | #AUTOTHROTTLE_START_DELAY=5 74 | # The maximum download delay to be set in case of high latencies 75 | #AUTOTHROTTLE_MAX_DELAY=60 76 | # Enable showing throttling stats for every response received: 77 | #AUTOTHROTTLE_DEBUG=False 78 | 79 | # Enable and configure HTTP caching (disabled by default) 80 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 81 | #HTTPCACHE_ENABLED=True 82 | #HTTPCACHE_EXPIRATION_SECS=0 83 | #HTTPCACHE_DIR='httpcache' 84 | #HTTPCACHE_IGNORE_HTTP_CODES=[] 85 | #HTTPCACHE_STORAGE='scrapy.extensions.httpcache.FilesystemCacheStorage' 86 | 87 | # start MySQL database configure setting 88 | MYSQL_HOST = 'localhost' 89 | MYSQL_DBNAME = 'zhibo8' 90 | MYSQL_USER = 'root' 91 | MYSQL_PASSWD = '111111' 92 | # end of MySQL database configure setting 93 | -------------------------------------------------------------------------------- /zhibo8/zhibo8/spiders/__init__.py: 
-------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /zhibo8/zhibo8/spiders/example.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import scrapy 3 | 4 | 5 | class ExampleSpider(scrapy.Spider): 6 | name = "example" 7 | allowed_domains = ["example.com"] 8 | start_urls = ( 9 | 'http://www.example.com/', 10 | ) 11 | 12 | def parse(self, response): 13 | pass 14 | -------------------------------------------------------------------------------- /zhibo8/zhibo8/spiders/hupu_news_spider.py: -------------------------------------------------------------------------------- 1 | # encoding:utf8 2 | """ 3 | 虎扑新闻抓取 4 | """ 5 | import scrapy 6 | 7 | class HupuNewsSpider(scrapy.Spider): 8 | name = "hupu_news" 9 | allowed_domains = ["hupu.com"] 10 | start_urls = ["http://voice.hupu.com/nba/newslist"] 11 | 12 | def parse(self, response): 13 | # refer : http://scrapy-chs.readthedocs.org/zh_CN/0.24/topics/selectors.html#topics-selectors-relative-xpaths 14 | li_list = response.xpath('//div[@class="news-list"]/ul/li') 15 | for li in li_list: 16 | #print li.extract() 17 | a = li.xpath('div/h4/a[1]') 18 | #print a.extract() 19 | title = li.xpath('div/h4/a[1]/text()').extract() 20 | link = li.xpath('div/h4/a[1]/@href').extract() 21 | print title[0],link[0] 22 | -------------------------------------------------------------------------------- /zhibo8/zhibo8/spiders/zhibo8_decrypt.py: -------------------------------------------------------------------------------- 1 | # encoding:utf8 2 | """ 3 | zhibo8直播源解密 4 | """ 5 | import scrapy 6 | import sys 7 | import re 8 | import urllib2 9 | reload(sys) 10 | sys.setdefaultencoding( "utf-8" ) 11 | 12 | test_url = 'http://zhibo8.cc/zhibo/zuqiu/2016/0203laisitechengvsliwupu.htm' 13 | 14 | class Zhibo8Decrypt(): 15 | 16 | def get_content(self, url, send_headers=''): 17 | send_headers = {'User-Agent' : 'Mozilla/5.0 (Windows NT 6.2; rv:16.0) Gecko/20100101 Firefox/16.0'} 18 | req = urllib2.Request(url, headers=send_headers) 19 | ret = urllib2.urlopen(req) 20 | html = ret.read() 21 | return html 22 | 23 | def decrypt(self, content): 24 | if '' == content: 25 | return '' 26 | pattern = re.compile(r'C0ha0ne0l(.*?)') 27 | ch = pattern.findall(content) 28 | return ch 29 | 30 | 31 | zd = Zhibo8Decrypt() 32 | content = zd.get_content(test_url) 33 | zd.decrypt(content) 34 | -------------------------------------------------------------------------------- /zhibo8/zhibo8/spiders/zhibo8_schedule_spider.py: -------------------------------------------------------------------------------- 1 | # encoding:utf8 2 | """ 3 | zhibo8比分 4 | """ 5 | import scrapy 6 | import sys 7 | reload(sys) 8 | sys.setdefaultencoding( "utf-8" ) 9 | 10 | game_type_list = [ 11 | u'CBA', 12 | u'NBA', 13 | u'法甲', 14 | u'英超', 15 | u'西甲', 16 | u'意甲', 17 | u'德甲', 18 | u'足总杯', 19 | u'国王杯', 20 | u'德国杯', 21 | u'解放者杯', 22 | u'意大利杯', 23 | ] 24 | 25 | 26 | class Zhibo8ScheduleSpider(scrapy.Spider): 27 | name = "zhibo8_schedule" 28 | allowed_domains = ["zhibo8.cc"] 29 | start_urls = ["http://zhibo8.cc/"] 30 | 31 | def parse(self, response): 32 | arr_matches = [] 33 | # refer : http://scrapy-chs.readthedocs.org/zh_CN/0.24/topics/selectors.html#topics-selectors-relative-xpaths 34 | 
div_list = response.xpath('//div[@class="schedule_container left"]/div[@class="box"]') 35 | for div in div_list: 36 | match_date = div.xpath('div[@class="titlebar"]/h2[1]/@title').extract()[0] 37 | li_list = div.xpath('div[@class="content"]/ul/li') 38 | ymd = match_date.replace('-', '') 39 | for li in li_list: 40 | match = {} 41 | match['tags'] = li.xpath('./@label').extract()[0] 42 | text_content = li.xpath('string(.)').extract()[0] 43 | text_content = text_content.replace(' ', ' ') 44 | arr_content = text_content.split(' ') 45 | #print 0,arr_content[0],1,arr_content[1],2,arr_content[2],3,arr_content[3],4,arr_content[4],\ 46 | # 5,arr_content[5],6,arr_content[6],7,arr_content[7] 47 | if len(arr_content) < 5 or '-' != arr_content[3]: 48 | continue 49 | match['start_time'] = match_date + ' ' + arr_content[0] + ':00' 50 | match['home_team'] = arr_content[2] 51 | match['guest_team'] = arr_content[4] 52 | match['match_date'] = ymd 53 | match['game_type'] = self.get_gametype(arr_content[1]) 54 | match['home_logo'] = self.get_home_logo(li) 55 | match['guest_logo'] = self.get_guest_logo(li) 56 | #print match['start_time'],match['home_team'],match['guest_team'],match['home_logo'],match['guest_logo'] 57 | arr_matches.append(match) 58 | 59 | return arr_matches 60 | 61 | 62 | def get_gametype(self, s): 63 | for game_type in game_type_list: 64 | if game_type in s: 65 | return game_type 66 | return s 67 | 68 | 69 | def get_home_logo(self, li): 70 | if li.xpath('./img[1]'): 71 | return li.xpath('./img[1]/@src').extract()[0] 72 | elif li.xpath('./b/img[1]'): 73 | return li.xpath('./b/img[1]/@src').extract()[0] 74 | else: 75 | return '' 76 | 77 | def get_guest_logo(self, li): 78 | if li.xpath('./img[2]'): 79 | return li.xpath('./img[2]/@src').extract()[0] 80 | elif li.xpath('./b/img[2]'): 81 | return li.xpath('./b/img[2]/@src').extract()[0] 82 | else: 83 | return '' 84 | 85 | -------------------------------------------------------------------------------- /zhibo8/zhibo8/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/geekan/scrapy-examples/edb1cb116bd6def65a6ef01f953b58eb43e54305/zhibo8/zhibo8/utils/__init__.py -------------------------------------------------------------------------------- /zhibo8/zhibo8/utils/mysqldriver.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | u'''对MySQLdb常用函数进行封装的类 4 | 5 | 整理者:兔大侠和他的朋友们(http://www.tudaxia.com) 6 | 日期:2014-04-22 7 | 出处:源自互联网,共享于互联网:-) 8 | 9 | 注意:使用这个类的前提是正确安装 MySQL-Python模块。 10 | 官方网站:http://mysql-python.sourceforge.net/ 11 | ''' 12 | 13 | import MySQLdb 14 | import time 15 | 16 | class MySQL: 17 | u'''对MySQLdb常用函数进行封装的类''' 18 | 19 | error_code = '' #MySQL错误号码 20 | 21 | _instance = None #本类的实例 22 | _conn = None # 数据库conn 23 | _cur = None #游标 24 | 25 | _TIMEOUT = 30 #默认超时30秒 26 | _timecount = 0 27 | 28 | def __init__(self, dbconfig): 29 | u'构造器:根据数据库连接参数,创建MySQL连接' 30 | try: 31 | self._conn = MySQLdb.connect(host=dbconfig['host'], 32 | port=dbconfig['port'], 33 | user=dbconfig['user'], 34 | passwd=dbconfig['passwd'], 35 | db=dbconfig['db'], 36 | charset=dbconfig['charset']) 37 | except MySQLdb.Error, e: 38 | self.error_code = e.args[0] 39 | error_msg = 'MySQL error! 
', e.args[0], e.args[1] 40 | print error_msg 41 | 42 | # 如果没有超过预设超时时间,则再次尝试连接, 43 | if self._timecount < self._TIMEOUT: 44 | interval = 5 45 | self._timecount += interval 46 | time.sleep(interval) 47 | return self.__init__(dbconfig) 48 | else: 49 | raise Exception(error_msg) 50 | 51 | self._cur = self._conn.cursor() 52 | self._instance = MySQLdb 53 | 54 | def query(self,sql): 55 | u'执行 SELECT 语句' 56 | try: 57 | self._cur.execute("SET NAMES utf8") 58 | result = self._cur.execute(sql) 59 | except MySQLdb.Error, e: 60 | self.error_code = e.args[0] 61 | print "数据库错误代码:",e.args[0],e.args[1] 62 | result = False 63 | return result 64 | 65 | def update(self,sql): 66 | u'执行 UPDATE 及 DELETE 语句' 67 | try: 68 | self._cur.execute("SET NAMES utf8") 69 | result = self._cur.execute(sql) 70 | self._conn.commit() 71 | except MySQLdb.Error, e: 72 | self.error_code = e.args[0] 73 | print "数据库错误代码:",e.args[0],e.args[1] 74 | result = False 75 | return result 76 | 77 | def insert(self,sql): 78 | u'执行 INSERT 语句。如主键为自增长int,则返回新生成的ID' 79 | try: 80 | self._cur.execute("SET NAMES utf8") 81 | self._cur.execute(sql) 82 | self._conn.commit() 83 | return self._conn.insert_id() 84 | except MySQLdb.Error, e: 85 | self.error_code = e.args[0] 86 | return False 87 | 88 | def fetchAllRows(self): 89 | u'返回结果列表' 90 | return self._cur.fetchall() 91 | 92 | def fetchOneRow(self): 93 | u'返回一行结果,然后游标指向下一行。到达最后一行以后,返回None' 94 | return self._cur.fetchone() 95 | 96 | def getRowCount(self): 97 | u'获取结果行数' 98 | return self._cur.rowcount 99 | 100 | def commit(self): 101 | u'数据库commit操作' 102 | self._conn.commit() 103 | 104 | def rollback(self): 105 | u'数据库回滚操作' 106 | self._conn.rollback() 107 | 108 | def __del__(self): 109 | u'释放资源(系统GC自动调用)' 110 | try: 111 | self._cur.close() 112 | self._conn.close() 113 | except: 114 | pass 115 | 116 | def close(self): 117 | u'关闭数据库连接' 118 | self.__del__() 119 | 120 | """ 121 | if __name__ == '__main__': 122 | '''使用样例''' 123 | 124 | #数据库连接参数 125 | dbconfig = {'host':'localhost', 126 | 'port': 3306, 127 | 'user':'dbuser', 128 | 'passwd':'dbpassword', 129 | 'db':'testdb', 130 | 'charset':'utf8'} 131 | 132 | #连接数据库,创建这个类的实例 133 | db = MySQL(dbconfig) 134 | 135 | #操作数据库 136 | sql = "SELECT * FROM `sample_table`" 137 | db.query(sql); 138 | 139 | #获取结果列表 140 | result = db.fetchAllRows(); 141 | 142 | #相当于php里面的var_dump 143 | print result 144 | 145 | #对行进行循环 146 | for row in result: 147 | #使用下标进行取值 148 | #print row[0] 149 | 150 | #对列进行循环 151 | for colum in row: 152 | print colum 153 | 154 | #关闭数据库 155 | db.close() 156 | """ 157 | -------------------------------------------------------------------------------- /zhihu/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # http://doc.scrapy.org/en/latest/topics/scrapyd.html 5 | 6 | [settings] 7 | default = zhihu.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = zhihu 12 | -------------------------------------------------------------------------------- /zhihu/zhihu/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/geekan/scrapy-examples/edb1cb116bd6def65a6ef01f953b58eb43e54305/zhihu/zhihu/__init__.py -------------------------------------------------------------------------------- /zhihu/zhihu/items.py: -------------------------------------------------------------------------------- 1 | # Define here the 
models for your scraped items 2 | # 3 | # See documentation in: 4 | # http://doc.scrapy.org/en/latest/topics/items.html 5 | 6 | from scrapy.item import Item, Field 7 | 8 | class ZhihuPeopleItem(Item): 9 | # define the fields for your item here like: 10 | id = Field() 11 | name = Field() 12 | sign = Field() 13 | location = Field() 14 | business = Field() 15 | employment = Field() 16 | position = Field() 17 | education = Field() 18 | education_extra = Field() 19 | description = Field() 20 | agree = Field() 21 | thanks = Field() 22 | asks = Field() 23 | answers = Field() 24 | posts = Field() 25 | collections = Field() 26 | logs = Field() 27 | followees = Field() 28 | followers = Field() 29 | follow_topics = Field() 30 | 31 | -------------------------------------------------------------------------------- /zhihu/zhihu/pipelines.py: -------------------------------------------------------------------------------- 1 | # Define your item pipelines here 2 | # 3 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 4 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html 5 | 6 | import redis 7 | 8 | 9 | from scrapy import signals 10 | 11 | 12 | import json 13 | import codecs 14 | from collections import OrderedDict 15 | 16 | 17 | from misc.log import * 18 | 19 | 20 | class JsonWithEncodingPipeline(object): 21 | 22 | def __init__(self): 23 | self.file = codecs.open('data_utf8.json', 'w', encoding='utf-8') 24 | 25 | def process_item(self, item, spider): 26 | line = json.dumps(OrderedDict(item), ensure_ascii=False, sort_keys=False) + "\n" 27 | self.file.write(line) 28 | return item 29 | 30 | def close_spider(self, spider): 31 | self.file.close() 32 | 33 | 34 | class RedisPipeline(object): 35 | 36 | def __init__(self): 37 | self.r = redis.StrictRedis(host='localhost', port=6379) 38 | 39 | def process_item(self, item, spider): 40 | if not item['id']: 41 | print 'no id item!!' 
42 | 43 | str_recorded_item = self.r.get(item['id']) 44 | final_item = None 45 | if str_recorded_item is None: 46 | final_item = item 47 | else: 48 | ritem = eval(self.r.get(item['id'])) 49 | if ritem == item: 50 | debug('item '+item['id']+' equal') 51 | else: 52 | # info('item '+item['id']+' merge\n'+str(item)+'\n'+str(ritem)) 53 | info('item '+item['id']+' use new item') 54 | # final_item = dict(item.items() + ritem.items()) 55 | final_item = item 56 | self.r.set(item['id'], final_item) 57 | 58 | def close_spider(self, spider): 59 | return 60 | -------------------------------------------------------------------------------- /zhihu/zhihu/redis-test.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import redis 4 | import json 5 | 6 | 7 | r = redis.StrictRedis(host='localhost', port=6379) 8 | 9 | 10 | def dump_all(redis=r): 11 | keys = redis.keys('*') 12 | pairs = {} 13 | for key in keys: 14 | type = redis.type(key) 15 | val = redis.get(key) 16 | try: 17 | pairs[key] = eval(val) 18 | except Exception as e: 19 | print pairs, key, val, e 20 | return pairs 21 | 22 | def del_all(redis=r): 23 | keys = redis.keys('*') 24 | for k in keys: 25 | print 'Deleting:', k, 'result is', redis.delete(k) 26 | 27 | def main(): 28 | # del_all() 29 | # print json.dumps(dump_all(), indent=4) 30 | keys = r.keys('*') 31 | print keys 32 | print len(keys) 33 | 34 | if __name__ == '__main__': 35 | main() 36 | -------------------------------------------------------------------------------- /zhihu/zhihu/settings.py: -------------------------------------------------------------------------------- 1 | # Scrapy settings for zhihu project 2 | # 3 | # For simplicity, this file contains only the most important settings by 4 | # default. All the other settings are documented here: 5 | # 6 | # http://doc.scrapy.org/en/latest/topics/settings.html 7 | # 8 | 9 | import sys 10 | import os 11 | from os.path import dirname 12 | path = dirname(dirname(os.path.abspath(os.path.dirname(__file__)))) 13 | sys.path.append(path) 14 | from misc.log import * 15 | 16 | BOT_NAME = 'zhihu' 17 | 18 | SPIDER_MODULES = ['zhihu.spiders'] 19 | NEWSPIDER_MODULE = 'zhihu.spiders' 20 | 21 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 22 | #USER_AGENT = 'zhihu (+http://www.yourdomain.com)' 23 | 24 | DOWNLOADER_MIDDLEWARES = { 25 | # 'misc.middleware.CustomHttpProxyMiddleware': 400, 26 | 'misc.middleware.CustomUserAgentMiddleware': 401, 27 | } 28 | 29 | ITEM_PIPELINES = { 30 | 'zhihu.pipelines.JsonWithEncodingPipeline': 300, 31 | 'zhihu.pipelines.RedisPipeline': 301, 32 | } 33 | 34 | LOG_LEVEL = 'INFO' 35 | 36 | DOWNLOAD_DELAY = 1 37 | -------------------------------------------------------------------------------- /zhihu/zhihu/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 
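A note on the RedisPipeline above: it stores the Python repr of each item under item['id'] and reads it back with eval(). A hedged sketch of the same merge-by-id idea using JSON serialization instead (the class name and key layout are illustrative, not part of this project):

# Illustrative alternative to the eval()-based RedisPipeline: keep each item
# as JSON under its id and merge it with any previously recorded copy.
import json
import redis

class JsonRedisPipeline(object):

    def __init__(self):
        self.r = redis.StrictRedis(host='localhost', port=6379)

    def process_item(self, item, spider):
        key = item['id']
        recorded = self.r.get(key)
        merged = dict(item)
        if recorded is not None:
            previous = json.loads(recorded)
            previous.update(merged)          # newly scraped values override recorded ones
            merged = previous
        self.r.set(key, json.dumps(merged, ensure_ascii=False))
        return item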
5 | -------------------------------------------------------------------------------- /zhihu/zhihu/spiders/zhihu_spider.py: -------------------------------------------------------------------------------- 1 | #coding: utf-8 2 | 3 | import re 4 | import json 5 | from urlparse import urlparse 6 | 7 | 8 | from scrapy.selector import Selector 9 | try: 10 | from scrapy.spiders import Spider 11 | except: 12 | from scrapy.spiders import BaseSpider as Spider 13 | from scrapy.utils.response import get_base_url 14 | from scrapy.spiders import CrawlSpider, Rule 15 | from scrapy.linkextractors import LinkExtractor as sle 16 | 17 | 18 | from zhihu.items import * 19 | from misc.log import * 20 | 21 | ''' 22 | 1. 默认取sel.css()[0],如否则需要'__unique':false 23 | 2. 默认字典均为css解析,如否则需要'__use':'dump'表明是用于dump数据 24 | ''' 25 | 26 | class ZhihuSpider(CrawlSpider): 27 | name = "zhihu" 28 | allowed_domains = ["zhihu.com"] 29 | start_urls = [ 30 | "http://www.zhihu.com/", 31 | "http://www.zhihu.com/people/jia-yang-qing-74", 32 | ] 33 | rules = [ 34 | Rule(sle(allow=("/people/[^/]+/followees$")), callback='parse_followees'), 35 | Rule(sle(allow=("/people/[^/]+/followers$", )), callback='parse_followers'), 36 | Rule(sle(allow=("/people/[^/]+$", )), callback='parse_people_with_rules', follow=True), 37 | ] 38 | 39 | # need dfs/bfs 40 | all_css_rules = { 41 | '.zm-profile-header': { 42 | '.zm-profile-header-main': { 43 | '__use':'dump', 44 | 'name':'.title-section .name::text', 45 | 'sign':'.title-section .bio::text', 46 | 'location':'.location.item::text', 47 | 'business':'.business.item::text', 48 | 'employment':'.employment.item::text', 49 | 'position':'.position.item::text', 50 | 'education':'.education.item::text', 51 | 'education_extra':'.education-extra.item::text', 52 | }, '.zm-profile-header-operation': { 53 | '__use':'dump', 54 | 'agree':'.zm-profile-header-user-agree strong::text', 55 | 'thanks':'.zm-profile-header-user-thanks strong::text', 56 | }, '.profile-navbar': { 57 | '__use':'dump', 58 | 'asks':'a[href*=asks] .num::text', 59 | 'answers':'a[href*=answers] .num::text', 60 | 'posts':'a[href*=posts] .num::text', 61 | 'collections':'a[href*=collections] .num::text', 62 | 'logs':'a[href*=logs] .num::text', 63 | }, 64 | }, '.zm-profile-side-following': { 65 | '__use':'dump', 66 | 'followees':'a.item[href*=followees] strong::text', 67 | 'followers':'a.item[href*=followers] strong::text', 68 | } 69 | } 70 | 71 | def traversal(self, sel, rules, item): 72 | # print 'traversal:', sel, rules.keys() 73 | if '__use' in rules: 74 | for nk, nv in rules.items(): 75 | if nk == '__use': 76 | continue 77 | if nk not in item: 78 | item[nk] = [] 79 | if sel.css(nv): 80 | item[nk] += [i.extract() for i in sel.css(nv)] 81 | else: 82 | item[nk] = [] 83 | else: 84 | for nk, nv in rules.items(): 85 | for i in sel.css(nk): 86 | self.traversal(i, nv, item) 87 | 88 | def dfs(self, sel, rules, item_class): 89 | if sel is None: 90 | return [] 91 | item = item_class() 92 | self.traversal(sel, rules, item) 93 | return item 94 | 95 | def parse_with_rules(self, response, rules, item_class): 96 | return self.dfs(Selector(response), rules, item_class) 97 | 98 | def parse_people_with_rules(self, response): 99 | item = self.parse_with_rules(response, self.all_css_rules, ZhihuPeopleItem) 100 | item['id'] = urlparse(response.url).path.split('/')[-1] 101 | info('Parsed '+response.url) # +' to '+str(item)) 102 | return item 103 | 104 | def parse_followers(self, response): 105 | return self.parse_people_with_rules(response) 106 | 107 | def 
parse_followees(self, response): 108 | return self.parse_people_with_rules(response) 109 | -------------------------------------------------------------------------------- /ziroom/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html 5 | 6 | [settings] 7 | default = ziroom.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = ziroom 12 | -------------------------------------------------------------------------------- /ziroom/ziroom/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/geekan/scrapy-examples/edb1cb116bd6def65a6ef01f953b58eb43e54305/ziroom/ziroom/__init__.py -------------------------------------------------------------------------------- /ziroom/ziroom/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | 10 | 11 | class ZiroomItem(scrapy.Item): 12 | # define the fields for your item here like: 13 | # name = scrapy.Field() 14 | room_id = scrapy.Field() 15 | room_price = scrapy.Field() 16 | room_name = scrapy.Field() 17 | modifyDate = scrapy.Field() 18 | -------------------------------------------------------------------------------- /ziroom/ziroom/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Define your item pipelines here 3 | # 4 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 5 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html 6 | 7 | import redis 8 | 9 | 10 | from scrapy import signals 11 | 12 | 13 | import json 14 | import codecs 15 | from collections import OrderedDict 16 | 17 | 18 | class JsonWithEncodingPipeline(object): 19 | 20 | def __init__(self): 21 | self.file = codecs.open('data_utf8.json', 'w', encoding='utf-8') 22 | 23 | def process_item(self, item, spider): 24 | line = json.dumps(OrderedDict(item), ensure_ascii=False, sort_keys=False) + "\n" 25 | self.file.write(line) 26 | return item 27 | 28 | def close_spider(self, spider): 29 | self.file.close() 30 | 31 | 32 | class RedisPipeline(object): 33 | 34 | def __init__(self): 35 | self.r = redis.StrictRedis(host='localhost', port=6379) 36 | 37 | def process_item(self, item, spider): 38 | if not item['id']: 39 | print 'no id item!!' 40 | 41 | str_recorded_item = self.r.get(item['id']) 42 | final_item = None 43 | if str_recorded_item is None: 44 | final_item = item 45 | else: 46 | ritem = eval(self.r.get(item['id'])) 47 | final_item = dict(item.items() + ritem.items()) 48 | self.r.set(item['id'], final_item) 49 | 50 | def spider_closed(self, spider): 51 | return 52 | -------------------------------------------------------------------------------- /ziroom/ziroom/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Scrapy settings for ziroom project 3 | # 4 | # For simplicity, this file contains only the most important settings by 5 | # default. 
All the other settings are documented here: 6 | # 7 | # http://doc.scrapy.org/en/latest/topics/settings.html 8 | # 9 | 10 | import sys 11 | import os 12 | from os.path import dirname 13 | path = dirname(dirname(os.path.abspath(os.path.dirname(__file__)))) 14 | sys.path.append(path) 15 | 16 | BOT_NAME = 'ziroom' 17 | 18 | SPIDER_MODULES = ['ziroom.spiders'] 19 | NEWSPIDER_MODULE = 'ziroom.spiders' 20 | 21 | 22 | 23 | DOWNLOADER_MIDDLEWARES = { 24 | #'misc.middleware.CustomHttpProxyMiddleware': 400, 25 | 'misc.middleware.CustomUserAgentMiddleware': 401, 26 | } 27 | 28 | ITEM_PIPELINES = { 29 | 'ziroom.pipelines.JsonWithEncodingPipeline': 300, 30 | #'template.pipelines.RedisPipeline': 301, 31 | } 32 | 33 | LOG_LEVEL = 'INFO' 34 | -------------------------------------------------------------------------------- /ziroom/ziroom/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /ziroom/ziroom/spiders/spider.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from scrapy.spiders import Spider 3 | from scrapy import Request 4 | import re 5 | import time 6 | 7 | from ziroom.items import ZiroomItem 8 | 9 | class Parse(): 10 | def __init__(self, response): 11 | self.response = response 12 | self.room_detail = response.xpath('//div[@class="room_detail_right"]')[0] 13 | self.room_info = ' '.join(self.room_detail.xpath('.//ul[@class="detail_room"]/li/text()').extract()) 14 | self.metro_info = ''.join(self.room_detail.xpath('.//span[@id="lineList"]/text()').extract()).replace(' ', '').replace('\n', 15 | '') 16 | def getID(self): 17 | return int(re.findall('\d+', self.response.url)[0]) 18 | def getName(self): 19 | return self.room_detail.xpath('.//h2/text()').extract()[0].replace(' ', '').replace('\n', '') 20 | def getPrice(self): 21 | room_price = int(self.room_detail.xpath('.//span[@class="room_price"]/text()').extract()[0][1:]) 22 | if room_price < 500: 23 | room_price *= 30 24 | return room_price 25 | 26 | 27 | 28 | class PagesSpider(Spider): 29 | name = "ziroom" 30 | start_urls = ['http://www.ziroom.com/z/nl/z3.html?p=1'] 31 | 32 | def parse(self, response): 33 | print response.url 34 | page = re.findall('p=(\d+)', response.url)[0] 35 | 36 | houseList = response.xpath('//ul[@id="houseList"]/li') 37 | for each in houseList: 38 | url = each.xpath('div/h3/a/@href').extract()[0][2:].encode('utf-8') 39 | yield Request('http://' + url, self.parseItem) 40 | 41 | url = response.url 42 | url_new = url.replace(page, str(int(page) + 1)) 43 | # yield Request(url_new, self.parse) 44 | 45 | def parseItem(self, response): 46 | p = Parse(response) 47 | item = ZiroomItem() 48 | item['modifyDate'] = int(time.time()) 49 | item['room_id'] = p.getID() 50 | item['room_price'] = p.getPrice() 51 | item['room_name'] = p.getName() 52 | yield item --------------------------------------------------------------------------------
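Across the projects above, the JsonWithEncodingPipeline variants all write one JSON object per line to data_utf8.json. A minimal hedged sketch of reading that output back (the file name is taken from those pipelines; everything else is illustrative):

# Minimal sketch: read the JSON-lines file written by the JsonWithEncodingPipeline
# classes above (one JSON object per line, UTF-8 encoded).
import codecs
import json

def read_items(path='data_utf8.json'):
    items = []
    with codecs.open(path, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if line:
                items.append(json.loads(line))
    return items

for item in read_items():
    print(json.dumps(item, ensure_ascii=False, indent=2))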