├── .gitignore
├── README.md
├── alexa
    ├── alexa
    │   ├── __init__.py
    │   ├── cn.json
    │   ├── items.py
    │   ├── pipelines.py
    │   ├── settings.py
    │   ├── spiders
    │   │   ├── __init__.py
    │   │   └── alexa_spider.py
    │   └── universal.json
    ├── read_from_json.ipynb
    └── scrapy.cfg
├── alexa_topsites
    ├── alexa_topsites
    │   ├── __init__.py
    │   ├── items.py
    │   ├── pipelines.py
    │   ├── settings.py
    │   └── spiders
    │   │   ├── __init__.py
    │   │   └── spider.py
    └── scrapy.cfg
├── amazonbook
    ├── amazonbook
    │   ├── __init__.py
    │   ├── items.py
    │   ├── pipelines.py
    │   ├── settings.py
    │   └── spiders
    │   │   ├── __init__.py
    │   │   └── spider.py
    └── scrapy.cfg
├── clean.sh
├── delay.sh
├── dianping
    ├── dianping
    │   ├── __init__.py
    │   ├── items.py
    │   ├── pipelines.py
    │   ├── settings.py
    │   └── spiders
    │   │   ├── __init__.py
    │   │   └── spider.py
    └── scrapy.cfg
├── dmoz
    ├── dmoz
    │   ├── __init__.py
    │   ├── items.py
    │   ├── pipelines.py
    │   ├── settings.py
    │   └── spiders
    │   │   ├── __init__.py
    │   │   └── spider.py
    └── scrapy.cfg
├── doubanbook
    ├── doubanbook
    │   ├── __init__.py
    │   ├── items.py
    │   ├── pipelines.py
    │   ├── settings.py
    │   └── spiders
    │   │   ├── __init__.py
    │   │   └── douban_spider.py
    ├── sample.jpg
    └── scrapy.cfg
├── doubanmovie
    ├── doubanmovie
    │   ├── __init__.py
    │   ├── items.py
    │   ├── pipelines.py
    │   ├── settings.py
    │   └── spiders
    │   │   ├── __init__.py
    │   │   └── spider.py
    └── scrapy.cfg
├── douyu
    ├── douyu
    │   ├── __init__.py
    │   ├── items.py
    │   ├── pipelines.py
    │   ├── settings.py
    │   └── spiders
    │   │   ├── __init__.py
    │   │   └── spider.py
    └── scrapy.cfg
├── general_spider
    ├── general_spider
    │   ├── __init__.py
    │   ├── items.py
    │   ├── pipelines.py
    │   ├── settings.py
    │   └── spiders
    │   │   ├── BasicSpiderConfig.py
    │   │   ├── __init__.py
    │   │   ├── run.sh
    │   │   ├── scrapy_examples.py
    │   │   ├── spider.py
    │   │   └── v2ex.py
    └── scrapy.cfg
├── github_trending
    ├── github_trending
    │   ├── __init__.py
    │   ├── items.py
    │   ├── pipelines.py
    │   ├── settings.py
    │   └── spiders
    │   │   ├── __init__.py
    │   │   └── spider.py
    └── scrapy.cfg
├── googlescholar
    ├── README.md
    ├── googlescholar
    │   ├── __init__.py
    │   ├── items.py
    │   ├── pipelines.py
    │   ├── settings.py
    │   └── spiders
    │   │   ├── __init__.py
    │   │   └── spider.py
    └── scrapy.cfg
├── hacker_news
    ├── hacker_news
    │   ├── __init__.py
    │   ├── items.py
    │   ├── pipelines.py
    │   ├── settings.py
    │   └── spiders
    │   │   ├── __init__.py
    │   │   └── spider.py
    └── scrapy.cfg
├── hrtencent
    ├── hrtencent
    │   ├── __init__.py
    │   ├── data_utf8.json
    │   ├── items.py
    │   ├── pipelines.py
    │   ├── settings.py
    │   └── spiders
    │   │   ├── __init__.py
    │   │   └── hrtencent_spider.py
    └── scrapy.cfg
├── linkedin
    ├── README.md
    ├── doc
    │   └── db-scheme.md
    └── linkedin
    │   ├── linkedin
    │       ├── Rakefile
    │       ├── __init__.py
    │       ├── agents.py
    │       ├── db.py
    │       ├── items.py
    │       ├── middleware.py
    │       ├── parser
    │       │   ├── HtmlParser.py
    │       │   ├── LinkedinParser.py
    │       │   └── __init__.py
    │       ├── pipelines.py
    │       ├── proxy.py
    │       ├── reload_proxy.py
    │       ├── settings.py
    │       └── spiders
    │       │   ├── LinkedinSpider.py
    │       │   └── __init__.py
    │   └── scrapy.cfg
├── misc
    ├── __init__.py
    ├── agents.py
    ├── log.py
    ├── middleware.py
    ├── proxy.py
    └── spider.py
├── pandatv
    ├── pandatv
    │   ├── __init__.py
    │   ├── items.py
    │   ├── pipelines.py
    │   ├── settings.py
    │   └── spiders
    │   │   ├── __init__.py
    │   │   └── spider.py
    └── scrapy.cfg
├── proxylist
    ├── proxylist
    │   ├── __init__.py
    │   ├── items.py
    │   ├── pipelines.py
    │   ├── settings.py
    │   └── spiders
    │   │   ├── __init__.py
    │   │   ├── log
    │   │       ├── free-proxy-list.net
    │   │       └── proxy-list.org
    │   │   └── spider.py
    └── scrapy.cfg
├── qqnews
    ├── qqnews
    │   ├── __init__.py
    │   ├── items.py
    │   ├── pipelines.py
    │   ├── settings.py
    │   └── spiders
    │   │   ├── __init__.py
    │   │   └── spider.py
    └── scrapy.cfg
├── reddit
    ├── reddit
    │   ├── __init__.py
    │   ├── items.py
    │   ├── pipelines.py
    │   ├── settings.py
    │   └── spiders
    │   │   ├── __init__.py
    │   │   └── spider.py
    └── scrapy.cfg
├── sinanews
    ├── scrapy.cfg
    └── sinanews
    │   ├── __init__.py
    │   ├── items.py
    │   ├── pipelines.py
    │   ├── settings.py
    │   └── spiders
    │       ├── __init__.py
    │       └── spider.py
├── sis
    ├── README.md
    ├── forum-230.json
    ├── forum-58.json
    ├── index.html
    ├── scrapy.cfg
    └── sis
    │   ├── __init__.py
    │   ├── items.py
    │   ├── pipelines.py
    │   ├── settings.py
    │   └── spiders
    │       ├── __init__.py
    │       └── sis_spider.py
├── startproject.sh
├── template
    ├── scrapy.cfg
    └── template
    │   ├── __init__.py
    │   ├── items.py
    │   ├── pipelines.py
    │   ├── settings.py
    │   └── spiders
    │       ├── __init__.py
    │       └── spider.py
├── tutorial
    ├── Books
    ├── Resources
    ├── data_utf8.json
    ├── scrapy.cfg
    └── tutorial
    │   ├── __init__.py
    │   ├── data_utf8.json
    │   ├── items.py
    │   ├── misc
    │       ├── __init__.py
    │       ├── agents.py
    │       ├── log.py
    │       ├── middleware.py
    │       └── proxy.py
    │   ├── pipelines.py
    │   ├── settings.py
    │   └── spiders
    │       ├── __init__.py
    │       └── naive_spider.py
├── underdev
    ├── README
    ├── meijutt
    │   ├── meijutt
    │   │   ├── __init__.py
    │   │   ├── items.py
    │   │   ├── pipelines.py
    │   │   ├── settings.py
    │   │   └── spiders
    │   │   │   ├── __init__.py
    │   │   │   └── spider.py
    │   └── scrapy.cfg
    └── twitch
    │   ├── README
    │   ├── scrapy.cfg
    │   └── twitch
    │       ├── __init__.py
    │       ├── items.py
    │       ├── pipelines.py
    │       ├── settings.py
    │       └── spiders
    │           ├── __init__.py
    │           └── spider.py
├── v2ex
    ├── scrapy.cfg
    └── v2ex
    │   ├── __init__.py
    │   ├── items.py
    │   ├── pipelines.py
    │   ├── settings.py
    │   └── spiders
    │       ├── __init__.py
    │       └── spider.py
├── youtube_trending
    ├── scrapy.cfg
    └── youtube_trending
    │   ├── __init__.py
    │   ├── items.py
    │   ├── pipelines.py
    │   ├── settings.py
    │   └── spiders
    │       ├── __init__.py
    │       └── spider.py
├── zhibo8
    ├── run.sh
    ├── scrapy.cfg
    └── zhibo8
    │   ├── README.md
    │   ├── __init__.py
    │   ├── items.py
    │   ├── pipelines.py
    │   ├── settings.py
    │   ├── spiders
    │       ├── __init__.py
    │       ├── example.py
    │       ├── hupu_news_spider.py
    │       ├── zhibo8_decrypt.py
    │       └── zhibo8_schedule_spider.py
    │   ├── utils
    │       ├── __init__.py
    │       └── mysqldriver.py
    │   └── zhibo8
├── zhihu
    ├── scrapy.cfg
    └── zhihu
    │   ├── __init__.py
    │   ├── items.py
    │   ├── pipelines.py
    │   ├── redis-test.py
    │   ├── settings.py
    │   └── spiders
    │       ├── __init__.py
    │       └── zhihu_spider.py
└── ziroom
    ├── scrapy.cfg
    └── ziroom
        ├── __init__.py
        ├── items.py
        ├── pipelines.py
        ├── settings.py
        └── spiders
            ├── __init__.py
            └── spider.py


/.gitignore:
--------------------------------------------------------------------------------
  1 | .idea/
  2 | *.swp
  3 | .ipynb_checkpoints/
  4 | 
  5 | cscope.out
  6 | download_file/
  7 | data_utf8.json
  8 | # *.json
  9 | .DS_Store
 10 | dump.rdb
 11 | 
 12 | 
 13 | # Byte-compiled / optimized / DLL files
 14 | __pycache__/
 15 | *.py[cod]
 16 | 
 17 | # C extensions
 18 | *.so
 19 | 
 20 | # Distribution / packaging
 21 | bin/
 22 | build/
 23 | develop-eggs/
 24 | dist/
 25 | eggs/
 26 | lib/
 27 | lib64/
 28 | parts/
 29 | sdist/
 30 | var/
 31 | *.egg-info/
 32 | .installed.cfg
 33 | *.egg
 34 | 
 35 | # Installer logs
 36 | pip-log.txt
 37 | pip-delete-this-directory.txt
 38 | 
 39 | # Unit test / coverage reports
 40 | .tox/
 41 | .coverage
 42 | .cache
 43 | nosetests.xml
 44 | coverage.xml
 45 | 
 46 | # Translations
 47 | *.mo
 48 | 
 49 | # Mr Developer
 50 | .mr.developer.cfg
 51 | .project
 52 | .pydevproject
 53 | 
 54 | # Rope
 55 | .ropeproject
 56 | 
 57 | # Django stuff:
 58 | *.log
 59 | *.pot
 60 | 
 61 | # Sphinx documentation
 62 | docs/_build/# Byte-compiled / optimized / DLL files
 63 | __pycache__/
 64 | *.py[cod]
 65 | 
 66 | # C extensions
 67 | *.so
 68 | 
 69 | # Distribution / packaging
 70 | bin/
 71 | build/
 72 | develop-eggs/
 73 | dist/
 74 | eggs/
 75 | lib/
 76 | lib64/
 77 | parts/
 78 | sdist/
 79 | var/
 80 | *.egg-info/
 81 | .installed.cfg
 82 | *.egg
 83 | 
 84 | # Installer logs
 85 | pip-log.txt
 86 | pip-delete-this-directory.txt
 87 | 
 88 | # Unit test / coverage reports
 89 | .tox/
 90 | .coverage
 91 | .cache
 92 | nosetests.xml
 93 | coverage.xml
 94 | 
 95 | # Translations
 96 | *.mo
 97 | 
 98 | # Mr Developer
 99 | .mr.developer.cfg
100 | .project
101 | .pydevproject
102 | 
103 | # Rope
104 | .ropeproject
105 | 
106 | # Django stuff:
107 | *.log
108 | *.pot
109 | 
110 | # Sphinx documentation
111 | docs/_build/
112 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | scrapy-examples
  2 | ==============
  3 | 
  4 | Multifarious scrapy examples with integrated proxies and agents, which make you comfy to write a spider.
  5 | 
  6 | Don't use it to do anything illegal!
  7 | 
  8 | ***
  9 | 
 10 | ## Real spider example: doubanbook
 11 | 
 12 | #### Tutorial
 13 | 
 14 |     git clone https://github.com/geekan/scrapy-examples
 15 |     cd scrapy-examples/doubanbook
 16 |     scrapy crawl doubanbook
 17 | 
 18 | #### Depth
 19 | 
 20 | There are several depths in the spider, and the spider gets
 21 | real data from depth2.
 22 | 
 23 | - Depth0: The entrance is `http://book.douban.com/tag/`
 24 | - Depth1: Urls like `http://book.douban.com/tag/外国文学` from depth0
 25 | - Depth2: Urls like `http://book.douban.com/subject/1770782/` from depth1
 26 | 
 27 | #### Example image
 28 | ![douban book](https://raw.githubusercontent.com/geekan/scrapy-examples/master/doubanbook/sample.jpg)
 29 | 
 30 | ***
 31 | 
 32 | ## Avaiable Spiders
 33 | 
 34 | * tutorial
 35 |   * dmoz_item
 36 |   * douban_book
 37 |   * page_recorder
 38 |   * douban_tag_book
 39 | * doubanbook
 40 | * linkedin
 41 | * hrtencent
 42 | * sis
 43 | * zhihu
 44 | * alexa
 45 |   * alexa
 46 |   * alexa.cn
 47 | 
 48 | ## Advanced
 49 | 
 50 | * Use `parse_with_rules` to write a spider quickly.  
 51 |   See dmoz spider for more details.
 52 | 
 53 | * Proxies
 54 |   * If you don't want to use proxy, just comment the proxy middleware in settings.  
 55 |   * If you want to custom it, hack `misc/proxy.py` by yourself.  
 56 | 
 57 | * Notice
 58 |   * Don't use `parse` as your method name, it's an inner method of CrawlSpider.
 59 | 
 60 | ### Advanced Usage
 61 | 
 62 | * Run `./startproject.sh <PROJECT>` to start a new project.  
 63 |   It will automatically generate most things, the only left things are:
 64 |   * `PROJECT/PROJECT/items.py`
 65 |   * `PROJECT/PROJECT/spider/spider.py`
 66 | 
 67 | #### Example to hack `items.py` and `spider.py`
 68 | 
 69 | Hacked `items.py` with additional fields `url` and `description`:  
 70 | ```
 71 | from scrapy.item import Item, Field
 72 | 
 73 | class exampleItem(Item):
 74 |     url = Field()
 75 |     name = Field()
 76 |     description = Field()
 77 | ```
 78 | 
 79 | Hacked `spider.py` with start rules and css rules (here only display the class exampleSpider):  
 80 | ```
 81 | class exampleSpider(CommonSpider):
 82 |     name = "dmoz"
 83 |     allowed_domains = ["dmoz.org"]
 84 |     start_urls = [
 85 |         "http://www.dmoz.com/",
 86 |     ]
 87 |     # Crawler would start on start_urls, and follow the valid urls allowed by below rules.
 88 |     rules = [
 89 |         Rule(sle(allow=["/Arts/", "/Games/"]), callback='parse', follow=True),
 90 |     ]
 91 | 
 92 |     css_rules = {
 93 |         '.directory-url li': {
 94 |             '__use': 'dump', # dump data directly
 95 |             '__list': True, # it's a list
 96 |             'url': 'li > a::attr(href)',
 97 |             'name': 'a::text',
 98 |             'description': 'li::text',
 99 |         }
100 |     }
101 | 
102 |     def parse(self, response):
103 |         info('Parse '+response.url)
104 |         # parse_with_rules is implemented here:
105 |         #   https://github.com/geekan/scrapy-examples/blob/master/misc/spider.py
106 |         self.parse_with_rules(response, self.css_rules, exampleItem)
107 | ```
108 | 
109 | 


--------------------------------------------------------------------------------
/alexa/alexa/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/geekan/scrapy-examples/edb1cb116bd6def65a6ef01f953b58eb43e54305/alexa/alexa/__init__.py


--------------------------------------------------------------------------------
/alexa/alexa/items.py:
--------------------------------------------------------------------------------
 1 | # Define here the models for your scraped items
 2 | #
 3 | # See documentation in:
 4 | # http://doc.scrapy.org/en/latest/topics/items.html
 5 | 
 6 | from scrapy.item import Item, Field
 7 | 
 8 | class alexaSiteInfoItem(Item):
 9 |     # define the fields for your item here like:
10 |     name = Field()
11 |     url = Field()
12 |     description = Field()
13 |     category = Field()
14 | 
15 | class alexaCategoryItem(Item):
16 |     name = Field()
17 |     url = Field()
18 | 


--------------------------------------------------------------------------------
/alexa/alexa/pipelines.py:
--------------------------------------------------------------------------------
 1 | # Define your item pipelines here
 2 | #
 3 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting
 4 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
 5 | 
 6 | import redis
 7 | 
 8 | 
 9 | from scrapy import signals
10 | 
11 | 
12 | import json
13 | import codecs
14 | from collections import OrderedDict
15 | 
16 | 
17 | class JsonWithEncodingPipeline(object):
18 | 
19 |     def __init__(self):
20 |         self.file = codecs.open('data_utf8.json', 'w', encoding='utf-8')
21 | 
22 |     def process_item(self, item, spider):
23 |         line = json.dumps(OrderedDict(item), ensure_ascii=False, sort_keys=False) + "\n"
24 |         self.file.write(line)
25 |         return item
26 | 
27 |     def close_spider(self, spider):
28 |         self.file.close()
29 | 
30 | 
31 | class RedisPipeline(object):
32 | 
33 |     def __init__(self):
34 |         self.r = redis.StrictRedis(host='localhost', port=6379)
35 | 
36 |     def process_item(self, item, spider):
37 |         if not item['id']:
38 |             print 'no id item!!'
39 | 
40 |         str_recorded_item = self.r.get(item['id'])
41 |         final_item = None
42 |         if str_recorded_item is None:
43 |             final_item = item
44 |         else:
45 |             ritem = eval(self.r.get(item['id']))
46 |             final_item = dict(item.items() + ritem.items())
47 |         self.r.set(item['id'], final_item)
48 | 
49 |     def close_spider(self, spider):
50 |         return
51 | 


--------------------------------------------------------------------------------
/alexa/alexa/settings.py:
--------------------------------------------------------------------------------
 1 | # Scrapy settings for alexa project
 2 | #
 3 | # For simplicity, this file contains only the most important settings by
 4 | # default. All the other settings are documented here:
 5 | #
 6 | #     http://doc.scrapy.org/en/latest/topics/settings.html
 7 | #
 8 | 
 9 | import sys
10 | import os
11 | from os.path import dirname
12 | path = dirname(dirname(os.path.abspath(os.path.dirname(__file__))))
13 | sys.path.append(path)
14 | from misc.log import *
15 | 
16 | BOT_NAME = 'alexa'
17 | 
18 | SPIDER_MODULES = ['alexa.spiders']
19 | NEWSPIDER_MODULE = 'alexa.spiders'
20 | 
21 | # Crawl responsibly by identifying yourself (and your website) on the user-agent
22 | #USER_AGENT = 'alexa (+http://www.yourdomain.com)'
23 | 
24 | DOWNLOADER_MIDDLEWARES = {
25 |    # 'misc.middleware.CustomHttpProxyMiddleware': 400,
26 |     'misc.middleware.CustomUserAgentMiddleware': 401,
27 | }
28 | 
29 | ITEM_PIPELINES = {
30 |     'alexa.pipelines.JsonWithEncodingPipeline': 300,
31 |     #'alexa.pipelines.RedisPipeline': 301,
32 | }
33 | 
34 | LOG_LEVEL = 'INFO'
35 | 
36 | DOWNLOAD_DELAY = 1
37 | 


--------------------------------------------------------------------------------
/alexa/alexa/spiders/__init__.py:
--------------------------------------------------------------------------------
1 | # This package will contain the spiders of your Scrapy project
2 | #
3 | # Please refer to the documentation for information on how to create and manage
4 | # your spiders.
5 | 


--------------------------------------------------------------------------------
/alexa/scrapy.cfg:
--------------------------------------------------------------------------------
 1 | # Automatically created by: scrapy startproject
 2 | #
 3 | # For more information about the [deploy] section see:
 4 | # http://doc.scrapy.org/en/latest/topics/scrapyd.html
 5 | 
 6 | [settings]
 7 | default = alexa.settings
 8 | 
 9 | [deploy]
10 | #url = http://localhost:6800/
11 | project = alexa
12 | 


--------------------------------------------------------------------------------
/alexa_topsites/alexa_topsites/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/geekan/scrapy-examples/edb1cb116bd6def65a6ef01f953b58eb43e54305/alexa_topsites/alexa_topsites/__init__.py


--------------------------------------------------------------------------------
/alexa_topsites/alexa_topsites/items.py:
--------------------------------------------------------------------------------
 1 | # Define here the models for your scraped items
 2 | #
 3 | # See documentation in:
 4 | # http://doc.scrapy.org/en/latest/topics/items.html
 5 | 
 6 | from scrapy.item import Item, Field
 7 | 
 8 | class alexa_topsitesItem(Item):
 9 |     # define the fields for your item here like:
10 |     name = Field()
11 | 
12 | 


--------------------------------------------------------------------------------
/alexa_topsites/alexa_topsites/pipelines.py:
--------------------------------------------------------------------------------
 1 | # Define your item pipelines here
 2 | #
 3 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting
 4 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
 5 | 
 6 | import redis
 7 | 
 8 | 
 9 | from scrapy import signals
10 | 
11 | 
12 | import json
13 | import codecs
14 | from collections import OrderedDict
15 | 
16 | 
17 | class JsonWithEncodingPipeline(object):
18 | 
19 |     def __init__(self):
20 |         self.file = codecs.open('data_utf8.json', 'w', encoding='utf-8')
21 | 
22 |     def process_item(self, item, spider):
23 |         line = json.dumps(OrderedDict(item), ensure_ascii=False, sort_keys=False) + "\n"
24 |         self.file.write(line)
25 |         return item
26 | 
27 |     def close_spider(self, spider):
28 |         self.file.close()
29 | 
30 | 
31 | class RedisPipeline(object):
32 | 
33 |     def __init__(self):
34 |         self.r = redis.StrictRedis(host='localhost', port=6379)
35 | 
36 |     def process_item(self, item, spider):
37 |         if not item['id']:
38 |             print 'no id item!!'
39 | 
40 |         str_recorded_item = self.r.get(item['id'])
41 |         final_item = None
42 |         if str_recorded_item is None:
43 |             final_item = item
44 |         else:
45 |             ritem = eval(self.r.get(item['id']))
46 |             final_item = dict(item.items() + ritem.items())
47 |         self.r.set(item['id'], final_item)
48 | 
49 |     def close_spider(self, spider):
50 |         return
51 | 


--------------------------------------------------------------------------------
/alexa_topsites/alexa_topsites/settings.py:
--------------------------------------------------------------------------------
 1 | # Scrapy settings for alexa_topsites project
 2 | #
 3 | # For simplicity, this file contains only the most important settings by
 4 | # default. All the other settings are documented here:
 5 | #
 6 | #     http://doc.scrapy.org/en/latest/topics/settings.html
 7 | #
 8 | 
 9 | import sys
10 | import os
11 | from os.path import dirname
12 | path = dirname(dirname(os.path.abspath(os.path.dirname(__file__))))
13 | sys.path.append(path)
14 | from misc.log import *
15 | 
16 | BOT_NAME = 'alexa_topsites'
17 | 
18 | SPIDER_MODULES = ['alexa_topsites.spiders']
19 | NEWSPIDER_MODULE = 'alexa_topsites.spiders'
20 | 
21 | # Crawl responsibly by identifying yourself (and your website) on the user-agent
22 | #USER_AGENT = 'alexa_topsites (+http://www.yourdomain.com)'
23 | 
24 | DOWNLOADER_MIDDLEWARES = {
25 |    # 'misc.middleware.CustomHttpProxyMiddleware': 400,
26 |     'misc.middleware.CustomUserAgentMiddleware': 401,
27 | }
28 | 
29 | ITEM_PIPELINES = {
30 |     'alexa_topsites.pipelines.JsonWithEncodingPipeline': 300,
31 |     #'alexa_topsites.pipelines.RedisPipeline': 301,
32 | }
33 | 
34 | LOG_LEVEL = 'INFO'
35 | 
36 | DOWNLOAD_DELAY = 1
37 | 


--------------------------------------------------------------------------------
/alexa_topsites/alexa_topsites/spiders/__init__.py:
--------------------------------------------------------------------------------
1 | # This package will contain the spiders of your Scrapy project
2 | #
3 | # Please refer to the documentation for information on how to create and manage
4 | # your spiders.
5 | 


--------------------------------------------------------------------------------
/alexa_topsites/alexa_topsites/spiders/spider.py:
--------------------------------------------------------------------------------
 1 | import re
 2 | import json
 3 | from urlparse import urlparse
 4 | import urllib
 5 | import pdb
 6 | 
 7 | 
 8 | from scrapy.selector import Selector
 9 | try:
10 |     from scrapy.spiders import Spider
11 | except:
12 |     from scrapy.spiders import BaseSpider as Spider
13 | from scrapy.utils.response import get_base_url
14 | from scrapy.spiders import CrawlSpider, Rule
15 | from scrapy.linkextractors import LinkExtractor as sle
16 | 
17 | 
18 | from alexa_topsites.items import *
19 | from misc.log import *
20 | from misc.spider import CommonSpider
21 | 
22 | 
23 | class alexa_topsitesSpider(CommonSpider):
24 |     name = "alexa_topsites"
25 |     allowed_domains = ["alexa.com"]
26 |     start_urls = [
27 |         "http://www.alexa.com/topsites",
28 |     ]
29 |     rules = [
30 |         Rule(sle(allow=("http://www.alexa.com/topsites")), callback='parse_1', follow=True),
31 |     ]
32 | 
33 |     list_css_rules = { 
34 |         '.site-listing': {
35 |             'rank': '.count::text',
36 |             'name': '.desc-paragraph a::text',
37 |             'desc': '.description::text'
38 |         }   
39 |     }   
40 | 
41 |     content_css_rules = { 
42 |         'text': '#Cnt-Main-Article-QQ p *::text',
43 |         'images': '#Cnt-Main-Article-QQ img::attr(src)',
44 |         'images-desc': '#Cnt-Main-Article-QQ div p+ p::text',
45 |     }
46 | 
47 |     def parse_1(self, response):
48 |         info('Parse '+response.url)
49 |         x = self.parse_with_rules(response, self.list_css_rules, dict)
50 |         # x = self.parse_with_rules(response, self.content_css_rules, dict)
51 |         print(json.dumps(x, ensure_ascii=False, indent=2))
52 |         # pp.pprint(x)
53 |         # return self.parse_with_rules(response, self.css_rules, alexa_topsitesItem)
54 | 


--------------------------------------------------------------------------------
/alexa_topsites/scrapy.cfg:
--------------------------------------------------------------------------------
 1 | # Automatically created by: scrapy startproject
 2 | #
 3 | # For more information about the [deploy] section see:
 4 | # http://doc.scrapy.org/en/latest/topics/scrapyd.html
 5 | 
 6 | [settings]
 7 | default = alexa_topsites.settings
 8 | 
 9 | [deploy]
10 | #url = http://localhost:6800/
11 | project = alexa_topsites
12 | 


--------------------------------------------------------------------------------
/amazonbook/amazonbook/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/geekan/scrapy-examples/edb1cb116bd6def65a6ef01f953b58eb43e54305/amazonbook/amazonbook/__init__.py


--------------------------------------------------------------------------------
/amazonbook/amazonbook/items.py:
--------------------------------------------------------------------------------
 1 | # Define here the models for your scraped items
 2 | #
 3 | # See documentation in:
 4 | # http://doc.scrapy.org/en/latest/topics/items.html
 5 | 
 6 | from scrapy.item import Item, Field
 7 | 
 8 | class amazonbookItem(Item):
 9 |     # define the fields for your item here like:
10 |     name = Field()
11 | 
12 | 


--------------------------------------------------------------------------------
/amazonbook/amazonbook/pipelines.py:
--------------------------------------------------------------------------------
 1 | # Define your item pipelines here
 2 | #
 3 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting
 4 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
 5 | 
 6 | import redis
 7 | 
 8 | 
 9 | from scrapy import signals
10 | 
11 | 
12 | import json
13 | import codecs
14 | from collections import OrderedDict
15 | 
16 | 
17 | class JsonWithEncodingPipeline(object):
18 | 
19 |     def __init__(self):
20 |         self.file = codecs.open('data_utf8.json', 'w', encoding='utf-8')
21 | 
22 |     def process_item(self, item, spider):
23 |         line = json.dumps(OrderedDict(item), ensure_ascii=False, sort_keys=False) + "\n"
24 |         self.file.write(line)
25 |         return item
26 | 
27 |     def close_spider(self, spider):
28 |         self.file.close()
29 | 
30 | 
31 | class RedisPipeline(object):
32 | 
33 |     def __init__(self):
34 |         self.r = redis.StrictRedis(host='localhost', port=6379)
35 | 
36 |     def process_item(self, item, spider):
37 |         if not item['id']:
38 |             print 'no id item!!'
39 | 
40 |         str_recorded_item = self.r.get(item['id'])
41 |         final_item = None
42 |         if str_recorded_item is None:
43 |             final_item = item
44 |         else:
45 |             ritem = eval(self.r.get(item['id']))
46 |             final_item = dict(item.items() + ritem.items())
47 |         self.r.set(item['id'], final_item)
48 | 
49 |     def close_spider(self, spider):
50 |         return
51 | 


--------------------------------------------------------------------------------
/amazonbook/amazonbook/settings.py:
--------------------------------------------------------------------------------
 1 | # Scrapy settings for amazonbook project
 2 | #
 3 | # For simplicity, this file contains only the most important settings by
 4 | # default. All the other settings are documented here:
 5 | #
 6 | #     http://doc.scrapy.org/en/latest/topics/settings.html
 7 | #
 8 | 
 9 | import sys
10 | import os
11 | from os.path import dirname
12 | path = dirname(dirname(os.path.abspath(os.path.dirname(__file__))))
13 | sys.path.append(path)
14 | from misc.log import *
15 | 
16 | BOT_NAME = 'amazonbook'
17 | 
18 | SPIDER_MODULES = ['amazonbook.spiders']
19 | NEWSPIDER_MODULE = 'amazonbook.spiders'
20 | 
21 | # Crawl responsibly by identifying yourself (and your website) on the user-agent
22 | #USER_AGENT = 'amazonbook (+http://www.yourdomain.com)'
23 | 
24 | DOWNLOADER_MIDDLEWARES = {
25 |    # 'misc.middleware.CustomHttpProxyMiddleware': 400,
26 |     'misc.middleware.CustomUserAgentMiddleware': 401,
27 | }
28 | 
29 | ITEM_PIPELINES = {
30 |     'amazonbook.pipelines.JsonWithEncodingPipeline': 300,
31 |     #'amazonbook.pipelines.RedisPipeline': 301,
32 | }
33 | 
34 | LOG_LEVEL = 'INFO'
35 | 
36 | DOWNLOAD_DELAY = 1
37 | 


--------------------------------------------------------------------------------
/amazonbook/amazonbook/spiders/__init__.py:
--------------------------------------------------------------------------------
1 | # This package will contain the spiders of your Scrapy project
2 | #
3 | # Please refer to the documentation for information on how to create and manage
4 | # your spiders.
5 | 


--------------------------------------------------------------------------------
/amazonbook/amazonbook/spiders/spider.py:
--------------------------------------------------------------------------------
 1 | import re
 2 | import json
 3 | from urlparse import urlparse
 4 | import urllib
 5 | 
 6 | 
 7 | from scrapy.selector import Selector
 8 | try:
 9 |     from scrapy.spiders import Spider
10 | except:
11 |     from scrapy.spiders import BaseSpider as Spider
12 | from scrapy.utils.response import get_base_url
13 | from scrapy.spiders import CrawlSpider, Rule
14 | from scrapy.linkextractors import LinkExtractor as sle
15 | 
16 | 
17 | from amazonbook.items import *
18 | from misc.log import *
19 | from misc.spider import CommonSpider
20 | 
21 | 
22 | import pprint
23 | class MyPrettyPrinter(pprint.PrettyPrinter):
24 |     def format(self, object, context, maxlevels, level):
25 |         if isinstance(object, unicode):
26 |             return (object.encode('utf8'), True, False)
27 |         return pprint.PrettyPrinter.format(self, object, context, maxlevels, level)
28 | pp = MyPrettyPrinter()
29 | 
30 | 
31 | class amazonbookSpider(CommonSpider):
32 |     name = "amazonbook"
33 |     allowed_domains = ["amazon.com", "www.amazon.com"]
34 |     start_urls = [
35 |         #"http://www.amazon.com/b/ref=s9_acss_bw_en_BGG15eve_d_1_6?_encoding=UTF8&node=17&pf_rd_m=ATVPDKIKX0DER&pf_rd_s=merchandised-search-top-3&pf_rd_r=0XCRZV6SDKBTKDPH8SFR&pf_rd_t=101&pf_rd_p=2293718502&pf_rd_i=283155",
36 |         "http://www.amazon.com/books-used-books-textbooks/b?node=283155",
37 |     ]
38 |     rules = [
39 |         #Rule(sle(allow=("/gp/product/.*")), callback='parse_1', follow=True),
40 |         Rule(sle(allow=("/books-used-books-textbooks/.*")), callback='parse_0', follow=True),
41 |     ]
42 | 
43 |     css_rules = {
44 |         ".inner .a-row": {
45 |             "url": ".title::attr(href)",
46 |             #"desc": "span::text"
47 |             "title": ".s9TitleText::text",
48 |             "comments": ".a-icon-row .a-size-small::text",
49 |         }
50 |     }
51 | 
52 |     def parse_0(self, response):
53 |         info('Parse 0 '+response.url)
54 |         pp.pprint(self.parse_with_rules(response, self.css_rules, dict))
55 | 
56 |     #.inner .a-row
57 |     def parse_1(self, response):
58 |         info('Parse 1 '+response.url)
59 |         #pp.pprint(self.parse_with_rules(response, self.css_rules, dict))
60 |         # return self.parse_with_rules(response, self.css_rules, amazonbookItem)
61 | 


--------------------------------------------------------------------------------
/amazonbook/scrapy.cfg:
--------------------------------------------------------------------------------
 1 | # Automatically created by: scrapy startproject
 2 | #
 3 | # For more information about the [deploy] section see:
 4 | # http://doc.scrapy.org/en/latest/topics/scrapyd.html
 5 | 
 6 | [settings]
 7 | default = amazonbook.settings
 8 | 
 9 | [deploy]
10 | #url = http://localhost:6800/
11 | project = amazonbook
12 | 


--------------------------------------------------------------------------------
/clean.sh:
--------------------------------------------------------------------------------
1 | find . -name '*.pyc' | xargs rm
2 | 


--------------------------------------------------------------------------------
/delay.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | delay=${1:-1}
 4 | 
 5 | #find . | grep -E "settings.py$" | xargs sed -i '' "s/DOWNLOAD_DEALY = [0-9]+/DOWNLOAD_DELAY=$1/g"
 6 | if [[ "$OSTYPE" == "darwin"* ]]; then
 7 | find . | grep -E "settings.py$" | xargs sed -E -i '' "s/DOWNLOAD_DELAY = [0-9]+/DOWNLOAD_DELAY = $delay/g"
 8 | else
 9 | find . | grep -E "settings.py$" | xargs sed -E -i "s/DOWNLOAD_DELAY = [0-9]+/DOWNLOAD_DELAY = $delay/g"
10 | fi
11 | 


--------------------------------------------------------------------------------
/dianping/dianping/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/geekan/scrapy-examples/edb1cb116bd6def65a6ef01f953b58eb43e54305/dianping/dianping/__init__.py


--------------------------------------------------------------------------------
/dianping/dianping/items.py:
--------------------------------------------------------------------------------
 1 | # Define here the models for your scraped items
 2 | #
 3 | # See documentation in:
 4 | # http://doc.scrapy.org/en/latest/topics/items.html
 5 | 
 6 | from scrapy.item import Item, Field
 7 | 
 8 | class dianpingItem(Item):
 9 |     # define the fields for your item here like:
10 |     name = Field()
11 | 
12 | 


--------------------------------------------------------------------------------
/dianping/dianping/pipelines.py:
--------------------------------------------------------------------------------
 1 | # Define your item pipelines here
 2 | #
 3 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting
 4 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
 5 | 
 6 | import redis
 7 | 
 8 | 
 9 | from scrapy import signals
10 | 
11 | 
12 | import json
13 | import codecs
14 | from collections import OrderedDict
15 | 
16 | 
17 | class JsonWithEncodingPipeline(object):
18 | 
19 |     def __init__(self):
20 |         self.file = codecs.open('data_utf8.json', 'w', encoding='utf-8')
21 | 
22 |     def process_item(self, item, spider):
23 |         line = json.dumps(OrderedDict(item), ensure_ascii=False, sort_keys=False) + "\n"
24 |         self.file.write(line)
25 |         return item
26 | 
27 |     def close_spider(self, spider):
28 |         self.file.close()
29 | 
30 | 
31 | class RedisPipeline(object):
32 | 
33 |     def __init__(self):
34 |         self.r = redis.StrictRedis(host='localhost', port=6379)
35 | 
36 |     def process_item(self, item, spider):
37 |         if not item['id']:
38 |             print 'no id item!!'
39 | 
40 |         str_recorded_item = self.r.get(item['id'])
41 |         final_item = None
42 |         if str_recorded_item is None:
43 |             final_item = item
44 |         else:
45 |             ritem = eval(self.r.get(item['id']))
46 |             final_item = dict(item.items() + ritem.items())
47 |         self.r.set(item['id'], final_item)
48 | 
49 |     def close_spider(self, spider):
50 |         return
51 | 


--------------------------------------------------------------------------------
/dianping/dianping/settings.py:
--------------------------------------------------------------------------------
 1 | # Scrapy settings for dianping project
 2 | #
 3 | # For simplicity, this file contains only the most important settings by
 4 | # default. All the other settings are documented here:
 5 | #
 6 | #     http://doc.scrapy.org/en/latest/topics/settings.html
 7 | #
 8 | 
 9 | import sys
10 | import os
11 | from os.path import dirname
12 | path = dirname(dirname(os.path.abspath(os.path.dirname(__file__))))
13 | sys.path.append(path)
14 | from misc.log import *
15 | 
16 | BOT_NAME = 'dianping'
17 | 
18 | SPIDER_MODULES = ['dianping.spiders']
19 | NEWSPIDER_MODULE = 'dianping.spiders'
20 | 
21 | # Crawl responsibly by identifying yourself (and your website) on the user-agent
22 | #USER_AGENT = 'dianping (+http://www.yourdomain.com)'
23 | 
24 | DOWNLOADER_MIDDLEWARES = {
25 |    # 'misc.middleware.CustomHttpProxyMiddleware': 400,
26 |     'misc.middleware.CustomUserAgentMiddleware': 401,
27 | }
28 | 
29 | ITEM_PIPELINES = {
30 |     'dianping.pipelines.JsonWithEncodingPipeline': 300,
31 |     #'dianping.pipelines.RedisPipeline': 301,
32 | }
33 | 
34 | LOG_LEVEL = 'INFO'
35 | 
36 | DOWNLOAD_DELAY = 1
37 | 


--------------------------------------------------------------------------------
/dianping/dianping/spiders/__init__.py:
--------------------------------------------------------------------------------
1 | # This package will contain the spiders of your Scrapy project
2 | #
3 | # Please refer to the documentation for information on how to create and manage
4 | # your spiders.
5 | 


--------------------------------------------------------------------------------
/dianping/dianping/spiders/spider.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | import requests
  3 | from json import loads
  4 | from scrapy.http import Request
  5 | from scrapy.selector import Selector
  6 | 
  7 | try:
  8 |     from scrapy.spiders import Spider
  9 | except:
 10 |     from scrapy.spiders import BaseSpider as Spider
 11 | 
 12 | from misc.spider import CommonSpider
 13 | 
 14 | BAIDU_GEO = u'http://api.map.baidu.com/geocoder/v2/?address={}&output=json&ak=gQsCAgCrWsuN99ggSIjGn5nO'
 15 | 
 16 | base_category_url = "http://www.dianping.com/search/category"
 17 | 
 18 | start_url_dict = {
 19 |     u"足疗按摩": "/2/30/g141r1471",
 20 |     u"中医养生": "/2/30/g2827r1471",
 21 |     u"健康体检": "/2/80/g612",
 22 |     u"妇幼保健": "/2/70/g258",
 23 |     u"美容Spa": "/2/50/g158",
 24 |     u"整形塑体": "/2/85/g183",
 25 |     u"运动健身": "/2/45/g147",
 26 |     u"口腔健康": "/2/85/g182",
 27 |     u"药店": "/2/85/g235"
 28 | }
 29 | 
 30 | 
 31 | def clean_string(string):
 32 |     return string.replace(' ', '').replace('\n', '') if string else ''
 33 | 
 34 | 
 35 | def address_to_geo(address):
 36 |     data = requests.get(BAIDU_GEO.format(address)).json()
 37 |     longitude = data['result']['location']['lng'] if 'result' in data else 120.260569
 38 |     latitude = data['result']['location']['lat'] if 'result' in data else 30.242865
 39 |     return {'longitude': longitude, 'latitude': latitude}
 40 | 
 41 | 
 42 | class dianpingSpider(CommonSpider):
 43 |     name = "dianping"
 44 |     allowed_domains = ["dianping.com"]
 45 | 
 46 |     def start_requests(self):
 47 |         for k, v in start_url_dict.items():
 48 |             for i in range(1, 3):
 49 |                 url = base_category_url + v + 'p{}'.format(i)
 50 |                 yield Request(url, callback=self.parse, meta={'category': k})
 51 | 
 52 |     def parse(self, response):
 53 |         hxs = Selector(response)
 54 |         shops = hxs.xpath('//div[@class="tit"]/a/@href').extract()
 55 |         for shop in shops:
 56 |             if shop.startswith('/shop/'):
 57 |                 yield Request("http://www.dianping.com{}".format(shop), callback=self.parse_shop,
 58 |                               meta=response.request.meta)
 59 | 
 60 |     def parse_shop(self, response):
 61 |         shop = {}
 62 |         hxs = Selector(response)
 63 |         shop_name = hxs.css('.shop-name::text').extract_first()
 64 |         shop['name'] = clean_string(shop_name)
 65 |         address = hxs.css('.address span.item::text').extract_first()
 66 |         shop['address'] = clean_string(address)
 67 |         phone_number = hxs.css('.tel span.item::text').extract_first()
 68 |         shop['phone_number'] = clean_string(phone_number)
 69 |         path = u'//span[contains(text(), "营业时间：")]/following-sibling::span/text()'
 70 |         opening_hours = hxs.xpath(path).extract_first()
 71 |         shop['opening_hours'] = clean_string(opening_hours)
 72 |         geo = address_to_geo(address)
 73 |         shop.update(geo)
 74 |         store_images = hxs.xpath("//div[@class='photos-container']//img/@src").extract()
 75 |         shop['store_images'] = ','.join(store_images[:2])
 76 |         deals = hxs.xpath("//div[@id='sales']//a/@href").extract()
 77 |         shop['deals'] = deals
 78 |         shop['category'] = response.request.meta['category']
 79 |         return shop
 80 | 
 81 | 
 82 | class dianpingDealSpider(CommonSpider):
 83 |     name = "dianping-deal"
 84 |     allowed_domains = ["dianping.com"]
 85 | 
 86 |     def start_requests(self):
 87 |         with open('partner.json', 'rb') as f:
 88 |             for line in f:
 89 |                 data = loads(line)
 90 |                 for url in data['deals']:
 91 |                     yield Request(url, callback=self.parse, meta={'category': data['category'],
 92 |                                                                   'partner': data['name']})
 93 |                     break
 94 | 
 95 |     def parse(self, response):
 96 |         deal = {}
 97 |         hxs = Selector(response)
 98 |         bd = hxs.css('.bd')
 99 |         name = bd.css('.title::text').extract_first()
100 |         deal['name'] = clean_string(name)
101 |         description = bd.css('.sub-title span::text').extract_first()
102 |         deal['description'] = clean_string(description)
103 |         price = bd.css('.price-display::text').extract_first()
104 |         deal['price'] = clean_string(price)
105 |         # it's dynamic
106 |         # images = hxs.xpath('//div[@class="img-area"]//img/@src').extract()
107 |         # deal['images'] = ','.join(images[:2])
108 |         deal['category'] = response.request.meta['category']
109 |         deal['partner'] = response.request.meta['partner']
110 |         return deal
111 | 


--------------------------------------------------------------------------------
/dianping/scrapy.cfg:
--------------------------------------------------------------------------------
 1 | # Automatically created by: scrapy startproject
 2 | #
 3 | # For more information about the [deploy] section see:
 4 | # http://doc.scrapy.org/en/latest/topics/scrapyd.html
 5 | 
 6 | [settings]
 7 | default = dianping.settings
 8 | 
 9 | [deploy]
10 | #url = http://localhost:6800/
11 | project = dianping
12 | 


--------------------------------------------------------------------------------
/dmoz/dmoz/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/geekan/scrapy-examples/edb1cb116bd6def65a6ef01f953b58eb43e54305/dmoz/dmoz/__init__.py


--------------------------------------------------------------------------------
/dmoz/dmoz/items.py:
--------------------------------------------------------------------------------
 1 | # Define here the models for your scraped items
 2 | #
 3 | # See documentation in:
 4 | # http://doc.scrapy.org/en/latest/topics/items.html
 5 | 
 6 | from scrapy.item import Item, Field
 7 | 
 8 | class dmozItem(Item):
 9 |     # define the fields for your item here like:
10 |     url = Field()
11 |     name = Field()
12 |     description = Field()
13 | 
14 | 


--------------------------------------------------------------------------------
/dmoz/dmoz/pipelines.py:
--------------------------------------------------------------------------------
 1 | # Define your item pipelines here
 2 | #
 3 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting
 4 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
 5 | 
 6 | import redis
 7 | 
 8 | 
 9 | from scrapy import signals
10 | 
11 | 
12 | import json
13 | import codecs
14 | from collections import OrderedDict
15 | 
16 | 
17 | class JsonWithEncodingPipeline(object):
18 | 
19 |     def __init__(self):
20 |         self.file = codecs.open('data_utf8.json', 'w', encoding='utf-8')
21 | 
22 |     def process_item(self, item, spider):
23 |         line = json.dumps(OrderedDict(item), ensure_ascii=False, sort_keys=False) + "\n"
24 |         self.file.write(line)
25 |         return item
26 | 
27 |     def close_spider(self, spider):
28 |         self.file.close()
29 | 
30 | 
31 | class RedisPipeline(object):
32 | 
33 |     def __init__(self):
34 |         self.r = redis.StrictRedis(host='localhost', port=6379)
35 | 
36 |     def process_item(self, item, spider):
37 |         if not item['id']:
38 |             print 'no id item!!'
39 | 
40 |         str_recorded_item = self.r.get(item['id'])
41 |         final_item = None
42 |         if str_recorded_item is None:
43 |             final_item = item
44 |         else:
45 |             ritem = eval(self.r.get(item['id']))
46 |             final_item = dict(item.items() + ritem.items())
47 |         self.r.set(item['id'], final_item)
48 | 
49 |     def close_spider(self, spider):
50 |         return
51 | 


--------------------------------------------------------------------------------
/dmoz/dmoz/settings.py:
--------------------------------------------------------------------------------
 1 | # Scrapy settings for dmoz project
 2 | #
 3 | # For simplicity, this file contains only the most important settings by
 4 | # default. All the other settings are documented here:
 5 | #
 6 | #     http://doc.scrapy.org/en/latest/topics/settings.html
 7 | #
 8 | 
 9 | import sys
10 | import os
11 | from os.path import dirname
12 | path = dirname(dirname(os.path.abspath(os.path.dirname(__file__))))
13 | sys.path.append(path)
14 | from misc.log import *
15 | 
16 | BOT_NAME = 'dmoz'
17 | 
18 | SPIDER_MODULES = ['dmoz.spiders']
19 | NEWSPIDER_MODULE = 'dmoz.spiders'
20 | 
21 | # Crawl responsibly by identifying yourself (and your website) on the user-agent
22 | #USER_AGENT = 'dmoz (+http://www.yourdomain.com)'
23 | 
24 | DOWNLOADER_MIDDLEWARES = {
25 |    # 'misc.middleware.CustomHttpProxyMiddleware': 400,
26 |     'misc.middleware.CustomUserAgentMiddleware': 401,
27 | }
28 | 
29 | ITEM_PIPELINES = {
30 |     'dmoz.pipelines.JsonWithEncodingPipeline': 300,
31 |     #'dmoz.pipelines.RedisPipeline': 301,
32 | }
33 | 
34 | LOG_LEVEL = 'INFO'
35 | 
36 | DOWNLOAD_DELAY = 1
37 | 


--------------------------------------------------------------------------------
/dmoz/dmoz/spiders/__init__.py:
--------------------------------------------------------------------------------
1 | # This package will contain the spiders of your Scrapy project
2 | #
3 | # Please refer to the documentation for information on how to create and manage
4 | # your spiders.
5 | 


--------------------------------------------------------------------------------
/dmoz/dmoz/spiders/spider.py:
--------------------------------------------------------------------------------
 1 | import re
 2 | import json
 3 | from urlparse import urlparse
 4 | import urllib
 5 | 
 6 | 
 7 | from scrapy.selector import Selector
 8 | try:
 9 |     from scrapy.spiders import Spider
10 | except:
11 |     from scrapy.spiders import BaseSpider as Spider
12 | from scrapy.utils.response import get_base_url
13 | from scrapy.spiders import CrawlSpider, Rule
14 | from scrapy.linkextractors import LinkExtractor as sle
15 | 
16 | 
17 | from dmoz.items import *
18 | from misc.log import *
19 | from misc.spider import CommonSpider
20 | 
21 | 
22 | class dmozSpider(CommonSpider):
23 |     name = "dmoz"
24 |     allowed_domains = ["dmoz.org"]
25 |     start_urls = [
26 |         "http://www.dmoz.org/",
27 |     ]
28 |     valid_categories = [
29 |         'Arts', 'Business', 'Computers', 'Games', 'Health', 'Home',
30 |         'Kids_and_Teens', 'News', 'Recreation', 'Reference', 'Regional', 'Science',
31 |         'Shopping', 'Society', 'Sports',
32 |     ]
33 |     allow_rules = ['/'+i+'/' for i in valid_categories]
34 |     rules = [
35 |         Rule(sle(allow=allow_rules), callback='parse_1', follow=True),
36 |     ]
37 | 
38 |     item_rules = {
39 |         '.directory-url li': {
40 |             '__use': 'dump',
41 |             '__list': True,
42 |             'url': 'li > a::attr(href)',
43 |             'name': 'a::text',
44 |             'description': 'li::text',
45 |         }
46 |     }
47 | 
48 |     def parse_1(self, response):
49 |         info('Parse depth 1 '+response.url)
50 |         items = self.parse_with_rules(response, self.item_rules, dmozItem)
51 |         return items
52 | 


--------------------------------------------------------------------------------
/dmoz/scrapy.cfg:
--------------------------------------------------------------------------------
 1 | # Automatically created by: scrapy startproject
 2 | #
 3 | # For more information about the [deploy] section see:
 4 | # http://doc.scrapy.org/en/latest/topics/scrapyd.html
 5 | 
 6 | [settings]
 7 | default = dmoz.settings
 8 | 
 9 | [deploy]
10 | #url = http://localhost:6800/
11 | project = dmoz
12 | 


--------------------------------------------------------------------------------
/doubanbook/doubanbook/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/geekan/scrapy-examples/edb1cb116bd6def65a6ef01f953b58eb43e54305/doubanbook/doubanbook/__init__.py


--------------------------------------------------------------------------------
/doubanbook/doubanbook/items.py:
--------------------------------------------------------------------------------
 1 | # Define here the models for your scraped items
 2 | #
 3 | # See documentation in:
 4 | # http://doc.scrapy.org/en/latest/topics/items.html
 5 | 
 6 | from scrapy.item import Item, Field
 7 | 
 8 | class DoubanbookItem(Item):
 9 |     # define the fields for your item here like:
10 |     # name = Field()
11 |     title = Field()
12 |     link = Field()
13 |     desc = Field()
14 |     num = Field()
15 | 
16 | 
17 | class DoubanSubjectItem(Item):
18 |     title = Field()
19 |     link = Field()
20 |     info = Field()
21 |     rate = Field()
22 |     votes = Field()
23 |     content_intro = Field()
24 |     author_intro = Field()
25 |     tags = Field()
26 | 


--------------------------------------------------------------------------------
/doubanbook/doubanbook/pipelines.py:
--------------------------------------------------------------------------------
 1 | # Define your item pipelines here
 2 | #
 3 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting
 4 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
 5 | 
 6 | import redis
 7 | 
 8 | 
 9 | from scrapy import signals
10 | 
11 | 
12 | import json
13 | import codecs
14 | from collections import OrderedDict
15 | 
16 | 
17 | class JsonWithEncodingPipeline(object):
18 | 
19 |     def __init__(self):
20 |         self.file = codecs.open('data_utf8.json', 'w', encoding='utf-8')
21 | 
22 |     def process_item(self, item, spider):
23 |         line = json.dumps(OrderedDict(item), ensure_ascii=False, sort_keys=False) + "\n"
24 |         self.file.write(line)
25 |         return item
26 | 
27 |     def close_spider(self, spider):
28 |         self.file.close()
29 | 
30 | 
31 | class RedisPipeline(object):
32 | 
33 |     def __init__(self):
34 |         self.r = redis.StrictRedis(host='localhost', port=6379)
35 | 
36 |     def process_item(self, item, spider):
37 |         if not item['id']:
38 |             print 'no id item!!'
39 | 
40 |         str_recorded_item = self.r.get(item['id'])
41 |         final_item = None
42 |         if str_recorded_item is None:
43 |             final_item = item
44 |         else:
45 |             ritem = eval(self.r.get(item['id']))
46 |             final_item = dict(item.items() + ritem.items())
47 |         self.r.set(item['id'], final_item)
48 | 
49 |     def spider_closed(self, spider):
50 |         return
51 | 


--------------------------------------------------------------------------------
/doubanbook/doubanbook/settings.py:
--------------------------------------------------------------------------------
 1 | # Scrapy settings for doubanbook project
 2 | #
 3 | # For simplicity, this file contains only the most important settings by
 4 | # default. All the other settings are documented here:
 5 | #
 6 | #     http://doc.scrapy.org/en/latest/topics/settings.html
 7 | #
 8 | 
 9 | import sys
10 | import os
11 | from os.path import dirname
12 | path = dirname(dirname(os.path.abspath(os.path.dirname(__file__))))
13 | sys.path.append(path)
14 | 
15 | BOT_NAME = 'doubanbook'
16 | 
17 | SPIDER_MODULES = ['doubanbook.spiders']
18 | NEWSPIDER_MODULE = 'doubanbook.spiders'
19 | 
20 | # Crawl responsibly by identifying yourself (and your website) on the user-agent
21 | #USER_AGENT = 'doubanbook (+http://www.yourdomain.com)'
22 | 
23 | DOWNLOADER_MIDDLEWARES = {
24 |     #'misc.middleware.CustomHttpProxyMiddleware': 400,
25 |     'misc.middleware.CustomUserAgentMiddleware': 401,
26 | }
27 | 
28 | ITEM_PIPELINES = {
29 |     'doubanbook.pipelines.JsonWithEncodingPipeline': 300,
30 |     #'template.pipelines.RedisPipeline': 301,
31 | }
32 | 
33 | LOG_LEVEL = 'INFO'
34 | 
35 | 


--------------------------------------------------------------------------------
/doubanbook/doubanbook/spiders/__init__.py:
--------------------------------------------------------------------------------
1 | # This package will contain the spiders of your Scrapy project
2 | #
3 | # Please refer to the documentation for information on how to create and manage
4 | # your spiders.
5 | 


--------------------------------------------------------------------------------
/doubanbook/doubanbook/spiders/douban_spider.py:
--------------------------------------------------------------------------------
 1 | import re
 2 | import json
 3 | 
 4 | 
 5 | from scrapy.selector import Selector
 6 | try:
 7 |     from scrapy.spiders import Spider
 8 | except:
 9 |     from scrapy.spiders import BaseSpider as Spider
10 | from scrapy.utils.response import get_base_url
11 | from scrapy.spiders import CrawlSpider, Rule
12 | from scrapy.linkextractors import LinkExtractor as sle
13 | 
14 | 
15 | from doubanbook.items import *
16 | from misc.log import *
17 | 
18 | 
19 | class DoubanBookSpider(CrawlSpider):
20 |     name = "doubanbook"
21 |     allowed_domains = ["douban.com"]
22 |     start_urls = [
23 |         "https://book.douban.com/tag/"
24 |     ]
25 |     rules = [
26 |         Rule(sle(allow=("/subject/\d+$")), callback='parse_2'),
27 |         Rule(sle(allow=("/tag/[^/]+$", )), follow=True),
28 |         #Rule(sle(allow=("/tag/$", )), follow=True),
29 |     ]
30 | 
31 |     def parse_2(self, response):
32 |         items = []
33 |         sel = Selector(response)
34 |         sites = sel.css('#wrapper')
35 |         for site in sites:
36 |             item = DoubanSubjectItem()
37 |             item['title'] = site.css('h1 span::text').extract()
38 |             item['link'] = response.url
39 |             item['content_intro'] = site.css('#link-report .intro p::text').extract()
40 |             items.append(item)
41 |             # print repr(item).decode("unicode-escape") + '\n'
42 |             print item
43 |         # info('parsed ' + str(response))
44 |         return items
45 | 
46 |     def parse_1(self, response):
47 |         # url cannot encode to Chinese easily.. XXX
48 |         info('parsed ' + str(response))
49 | 
50 |     def process_request(self, request):
51 |         info('process ' + str(request))
52 |         return request
53 | 
54 |     def closed(self, reason):
55 |         info("DoubanBookSpider Closed:" + reason)
56 | 


--------------------------------------------------------------------------------
/doubanbook/sample.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/geekan/scrapy-examples/edb1cb116bd6def65a6ef01f953b58eb43e54305/doubanbook/sample.jpg


--------------------------------------------------------------------------------
/doubanbook/scrapy.cfg:
--------------------------------------------------------------------------------
 1 | # Automatically created by: scrapy startproject
 2 | #
 3 | # For more information about the [deploy] section see:
 4 | # http://doc.scrapy.org/en/latest/topics/scrapyd.html
 5 | 
 6 | [settings]
 7 | default = doubanbook.settings
 8 | 
 9 | [deploy]
10 | #url = http://localhost:6800/
11 | project = doubanbook
12 | 


--------------------------------------------------------------------------------
/doubanmovie/doubanmovie/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/geekan/scrapy-examples/edb1cb116bd6def65a6ef01f953b58eb43e54305/doubanmovie/doubanmovie/__init__.py


--------------------------------------------------------------------------------
/doubanmovie/doubanmovie/items.py:
--------------------------------------------------------------------------------
 1 | # Define here the models for your scraped items
 2 | #
 3 | # See documentation in:
 4 | # http://doc.scrapy.org/en/latest/topics/items.html
 5 | 
 6 | from scrapy.item import Item, Field
 7 | 
 8 | class doubanmovieItem(Item):
 9 |     # define the fields for your item here like:
10 |     name = Field()
11 | 
12 | 


--------------------------------------------------------------------------------
/doubanmovie/doubanmovie/pipelines.py:
--------------------------------------------------------------------------------
 1 | # Define your item pipelines here
 2 | #
 3 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting
 4 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
 5 | 
 6 | import redis
 7 | 
 8 | 
 9 | from scrapy import signals
10 | 
11 | 
12 | import json
13 | import codecs
14 | from collections import OrderedDict
15 | 
16 | 
17 | class JsonWithEncodingPipeline(object):
18 | 
19 |     def __init__(self):
20 |         self.file = codecs.open('data_utf8.json', 'w', encoding='utf-8')
21 | 
22 |     def process_item(self, item, spider):
23 |         line = json.dumps(OrderedDict(item), ensure_ascii=False, sort_keys=False) + "\n"
24 |         self.file.write(line)
25 |         return item
26 | 
27 |     def close_spider(self, spider):
28 |         print("JsonWithEncodingPipeline closed")
29 |         self.file.close()
30 | 
31 |     def open_spider(self, spider):
32 |         print("JsonWithEncodingPipeline opend")
33 | 
34 | 
35 | class RedisPipeline(object):
36 | 
37 |     def __init__(self):
38 |         self.r = redis.StrictRedis(host='localhost', port=6379)
39 | 
40 |     def process_item(self, item, spider):
41 |         if not item['id']:
42 |             print 'no id item!!'
43 | 
44 |         str_recorded_item = self.r.get(item['id'])
45 |         final_item = None
46 |         if str_recorded_item is None:
47 |             final_item = item
48 |         else:
49 |             ritem = eval(self.r.get(item['id']))
50 |             final_item = dict(item.items() + ritem.items())
51 |         self.r.set(item['id'], final_item)
52 | 
53 |     def close_spider(self, spider):
54 |         return
55 | 


--------------------------------------------------------------------------------
/doubanmovie/doubanmovie/settings.py:
--------------------------------------------------------------------------------
 1 | # Scrapy settings for doubanmovie project
 2 | #
 3 | # For simplicity, this file contains only the most important settings by
 4 | # default. All the other settings are documented here:
 5 | #
 6 | #     http://doc.scrapy.org/en/latest/topics/settings.html
 7 | #
 8 | 
 9 | import sys
10 | import os
11 | from os.path import dirname
12 | path = dirname(dirname(os.path.abspath(os.path.dirname(__file__))))
13 | sys.path.append(path)
14 | from misc.log import *
15 | 
16 | BOT_NAME = 'doubanmovie'
17 | 
18 | SPIDER_MODULES = ['doubanmovie.spiders']
19 | NEWSPIDER_MODULE = 'doubanmovie.spiders'
20 | 
21 | # Crawl responsibly by identifying yourself (and your website) on the user-agent
22 | #USER_AGENT = 'doubanmovie (+http://www.yourdomain.com)'
23 | 
24 | DOWNLOADER_MIDDLEWARES = {
25 |     #'misc.middleware.CustomHttpProxyMiddleware': 400,
26 |     'misc.middleware.CustomUserAgentMiddleware': 401,
27 | }
28 | 
29 | ITEM_PIPELINES = {
30 |     'doubanmovie.pipelines.JsonWithEncodingPipeline': 300,
31 |     #'doubanmovie.pipelines.RedisPipeline': 301,
32 | }
33 | 
34 | LOG_LEVEL = 'INFO'
35 | 
36 | DOWNLOAD_DELAY = 1
37 | 


--------------------------------------------------------------------------------
/doubanmovie/doubanmovie/spiders/__init__.py:
--------------------------------------------------------------------------------
1 | # This package will contain the spiders of your Scrapy project
2 | #
3 | # Please refer to the documentation for information on how to create and manage
4 | # your spiders.
5 | 


--------------------------------------------------------------------------------
/doubanmovie/doubanmovie/spiders/spider.py:
--------------------------------------------------------------------------------
 1 | import re
 2 | import json
 3 | from urlparse import urlparse
 4 | import urllib
 5 | import pdb
 6 | 
 7 | 
 8 | from scrapy.selector import Selector
 9 | try:
10 |     from scrapy.spiders import Spider
11 | except:
12 |     from scrapy.spiders import BaseSpider as Spider
13 | from scrapy.utils.response import get_base_url
14 | from scrapy.spiders import CrawlSpider, Rule
15 | from scrapy.linkextractors import LinkExtractor as sle
16 | 
17 | 
18 | from doubanmovie.items import *
19 | from misc.log import *
20 | from misc.spider import CommonSpider
21 | 
22 | 
23 | class doubanmovieSpider(CommonSpider):
24 |     name = "doubanmovie"
25 |     allowed_domains = ["douban.com"]
26 |     start_urls = [
27 |         #"https://movie.douban.com/tag/",
28 |         "https://movie.douban.com/chart"
29 |     ]
30 |     rules = [
31 |         #Rule(sle(allow=("/tag/[0-9]{4}$")), follow=True),
32 |         #Rule(sle(allow=("/tag/[0-9]{4}/?start=[0-9]{2,4}&type=T$")), follow=True),
33 |         #Rule(sle(allow=("/subject/[0-9]+$")), callback='parse_1'),
34 |         Rule(sle(allow=("/subject/[0-9]+/$")), callback='parse_1', follow=True),
35 |     ]
36 | 
37 |     list_css_rules = { 
38 |         '.linkto': {
39 |             'url': 'a::attr(href)',
40 |             'name': 'a::text',
41 |         }
42 |     }   
43 | 
44 |     list_css_rules_2 = { 
45 |         '#listZone .Q-tpWrap': {
46 |             'url': '.linkto::attr(href)',
47 |             'name': '.linkto::text'
48 |         }   
49 |     }   
50 | 
51 |     content_css_rules = { 
52 |         'rating_per': '.rating_per::text',
53 |         'rating_num': '.rating_num::text',
54 |         'title': 'h1 span:nth-child(1)::text',
55 |         'rating_people': '.rating_people span::text',
56 |     }
57 | 
58 |     def parse_1(self, response):
59 |         info('Parse '+response.url)
60 |         x = self.parse_with_rules(response, self.content_css_rules, dict)
61 |         return x
62 |         #print(repr(x).decode('raw_unicode_escape'))
63 |         # return self.parse_with_rules(response, self.css_rules, doubanmovieItem)
64 | 


--------------------------------------------------------------------------------
/doubanmovie/scrapy.cfg:
--------------------------------------------------------------------------------
 1 | # Automatically created by: scrapy startproject
 2 | #
 3 | # For more information about the [deploy] section see:
 4 | # http://doc.scrapy.org/en/latest/topics/scrapyd.html
 5 | 
 6 | [settings]
 7 | default = doubanmovie.settings
 8 | 
 9 | [deploy]
10 | #url = http://localhost:6800/
11 | project = doubanmovie
12 | 


--------------------------------------------------------------------------------
/douyu/douyu/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/geekan/scrapy-examples/edb1cb116bd6def65a6ef01f953b58eb43e54305/douyu/douyu/__init__.py


--------------------------------------------------------------------------------
/douyu/douyu/items.py:
--------------------------------------------------------------------------------
 1 | # Define here the models for your scraped items
 2 | #
 3 | # See documentation in:
 4 | # http://doc.scrapy.org/en/latest/topics/items.html
 5 | 
 6 | from scrapy.item import Item, Field
 7 | 
 8 | class douyuItem(Item):
 9 |     # define the fields for your item here like:
10 |     url = Field()
11 |     room_name = Field()
12 |     people_count = Field()
13 |     tag = Field()
14 | 


--------------------------------------------------------------------------------
/douyu/douyu/pipelines.py:
--------------------------------------------------------------------------------
 1 | # Define your item pipelines here
 2 | #
 3 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting
 4 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
 5 | 
 6 | import redis
 7 | 
 8 | 
 9 | from scrapy import signals
10 | 
11 | 
12 | import json
13 | import codecs
14 | from collections import OrderedDict
15 | 
16 | 
17 | class JsonWithEncodingPipeline(object):
18 | 
19 |     def __init__(self):
20 |         self.file = codecs.open('data_utf8.json', 'w', encoding='utf-8')
21 | 
22 |     def process_item(self, item, spider):
23 |         line = json.dumps(OrderedDict(item), ensure_ascii=False, sort_keys=False) + "\n"
24 |         self.file.write(line)
25 |         return item
26 | 
27 |     def close_spider(self, spider):
28 |         self.file.close()
29 | 
30 | 
31 | class RedisPipeline(object):
32 | 
33 |     def __init__(self):
34 |         self.r = redis.StrictRedis(host='localhost', port=6379)
35 | 
36 |     def process_item(self, item, spider):
37 |         if not item['id']:
38 |             print 'no id item!!'
39 | 
40 |         str_recorded_item = self.r.get(item['id'])
41 |         final_item = None
42 |         if str_recorded_item is None:
43 |             final_item = item
44 |         else:
45 |             ritem = eval(self.r.get(item['id']))
46 |             final_item = dict(item.items() + ritem.items())
47 |         self.r.set(item['id'], final_item)
48 | 
49 |     def close_spider(self, spider):
50 |         return
51 | 


--------------------------------------------------------------------------------
/douyu/douyu/settings.py:
--------------------------------------------------------------------------------
 1 | # Scrapy settings for douyu project
 2 | #
 3 | # For simplicity, this file contains only the most important settings by
 4 | # default. All the other settings are documented here:
 5 | #
 6 | #     http://doc.scrapy.org/en/latest/topics/settings.html
 7 | #
 8 | 
 9 | import sys
10 | import os
11 | from os.path import dirname
12 | path = dirname(dirname(os.path.abspath(os.path.dirname(__file__))))
13 | sys.path.append(path)
14 | from misc.log import *
15 | 
16 | BOT_NAME = 'douyu'
17 | 
18 | SPIDER_MODULES = ['douyu.spiders']
19 | NEWSPIDER_MODULE = 'douyu.spiders'
20 | 
21 | # Crawl responsibly by identifying yourself (and your website) on the user-agent
22 | #USER_AGENT = 'douyu (+http://www.yourdomain.com)'
23 | 
24 | DOWNLOADER_MIDDLEWARES = {
25 |    # 'misc.middleware.CustomHttpProxyMiddleware': 400,
26 |     'misc.middleware.CustomUserAgentMiddleware': 401,
27 | }
28 | 
29 | ITEM_PIPELINES = {
30 |     'douyu.pipelines.JsonWithEncodingPipeline': 300,
31 |     #'douyu.pipelines.RedisPipeline': 301,
32 | }
33 | 
34 | LOG_LEVEL = 'INFO'
35 | 
36 | DOWNLOAD_DELAY = 1
37 | 


--------------------------------------------------------------------------------
/douyu/douyu/spiders/__init__.py:
--------------------------------------------------------------------------------
1 | # This package will contain the spiders of your Scrapy project
2 | #
3 | # Please refer to the documentation for information on how to create and manage
4 | # your spiders.
5 | 


--------------------------------------------------------------------------------
/douyu/douyu/spiders/spider.py:
--------------------------------------------------------------------------------
 1 | import re
 2 | import json
 3 | from urlparse import urlparse
 4 | import urllib
 5 | import pdb
 6 | 
 7 | 
 8 | from scrapy.selector import Selector
 9 | try:
10 |     from scrapy.spiders import Spider
11 | except:
12 |     from scrapy.spiders import BaseSpider as Spider
13 | from scrapy.utils.response import get_base_url
14 | from scrapy.spiders import CrawlSpider, Rule
15 | from scrapy.linkextractors import LinkExtractor as sle
16 | 
17 | 
18 | from douyu.items import *
19 | from misc.log import *
20 | from misc.spider import CommonSpider
21 | 
22 | 
23 | class douyuSpider(CommonSpider):
24 |     name = "douyu"
25 |     allowed_domains = ["douyu.com"]
26 |     start_urls = [
27 |         "http://www.douyu.com/directory/all"
28 |     ]
29 |     rules = [
30 |         Rule(sle(allow=("http://www.douyu.com/directory/all")), callback='parse_1', follow=True),
31 |     ]
32 | 
33 |     list_css_rules = { 
34 |         '#live-list-contentbox li': {
35 |             'url': 'a::attr(href)',
36 |             'room_name': 'a::attr(title)',
37 |             'tag': 'span.tag.ellipsis::text',
38 |             'people_count': '.dy-num.fr::text'
39 |         }
40 |     }
41 | 
42 |     list_css_rules_for_item = {
43 |         '#live-list-contentbox li': {
44 |             '__use': '1',
45 |             '__list': '1',
46 |             'url': 'a::attr(href)',
47 |             'room_name': 'a::attr(title)',
48 |             'tag': 'span.tag.ellipsis::text',
49 |             'people_count': '.dy-num.fr::text'
50 |         }
51 |     }
52 | 
53 | 
54 |     def parse_1(self, response):
55 |         info('Parse '+response.url)
56 |         #x = self.parse_with_rules(response, self.list_css_rules, dict)
57 |         x = self.parse_with_rules(response, self.list_css_rules_for_item, douyuItem)
58 |         print(len(x))
59 |         # print(json.dumps(x, ensure_ascii=False, indent=2))
60 |         # pp.pprint(x)
61 |         # return self.parse_with_rules(response, self.list_css_rules, douyuItem)
62 |         return x
63 | 


--------------------------------------------------------------------------------
/douyu/scrapy.cfg:
--------------------------------------------------------------------------------
 1 | # Automatically created by: scrapy startproject
 2 | #
 3 | # For more information about the [deploy] section see:
 4 | # http://doc.scrapy.org/en/latest/topics/scrapyd.html
 5 | 
 6 | [settings]
 7 | default = douyu.settings
 8 | 
 9 | [deploy]
10 | #url = http://localhost:6800/
11 | project = douyu
12 | 


--------------------------------------------------------------------------------
/general_spider/general_spider/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/geekan/scrapy-examples/edb1cb116bd6def65a6ef01f953b58eb43e54305/general_spider/general_spider/__init__.py


--------------------------------------------------------------------------------
/general_spider/general_spider/items.py:
--------------------------------------------------------------------------------
 1 | # Define here the models for your scraped items
 2 | #
 3 | # See documentation in:
 4 | # http://doc.scrapy.org/en/latest/topics/items.html
 5 | 
 6 | from scrapy.item import Item, Field
 7 | 
 8 | class general_spiderItem(Item):
 9 |     # define the fields for your item here like:
10 |     name = Field()
11 | 
12 | 


--------------------------------------------------------------------------------
/general_spider/general_spider/pipelines.py:
--------------------------------------------------------------------------------
 1 | # Define your item pipelines here
 2 | #
 3 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting
 4 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
 5 | 
 6 | import redis
 7 | 
 8 | 
 9 | from scrapy import signals
10 | 
11 | 
12 | import json
13 | import codecs
14 | from collections import OrderedDict
15 | 
16 | 
17 | class JsonWithEncodingPipeline(object):
18 | 
19 |     def __init__(self):
20 |         self.file = codecs.open('data_utf8.json', 'w', encoding='utf-8')
21 | 
22 |     def process_item(self, item, spider):
23 |         line = json.dumps(OrderedDict(item), ensure_ascii=False, sort_keys=False) + "\n"
24 |         self.file.write(line)
25 |         return item
26 | 
27 |     def close_spider(self, spider):
28 |         self.file.close()
29 | 
30 | 
31 | class RedisPipeline(object):
32 | 
33 |     def __init__(self):
34 |         self.r = redis.StrictRedis(host='localhost', port=6379)
35 | 
36 |     def process_item(self, item, spider):
37 |         if not item['id']:
38 |             print 'no id item!!'
39 | 
40 |         str_recorded_item = self.r.get(item['id'])
41 |         final_item = None
42 |         if str_recorded_item is None:
43 |             final_item = item
44 |         else:
45 |             ritem = eval(self.r.get(item['id']))
46 |             final_item = dict(item.items() + ritem.items())
47 |         self.r.set(item['id'], final_item)
48 | 
49 |     def close_spider(self, spider):
50 |         return
51 | 


--------------------------------------------------------------------------------
/general_spider/general_spider/settings.py:
--------------------------------------------------------------------------------
 1 | # Scrapy settings for general_spider project
 2 | #
 3 | # For simplicity, this file contains only the most important settings by
 4 | # default. All the other settings are documented here:
 5 | #
 6 | #     http://doc.scrapy.org/en/latest/topics/settings.html
 7 | #
 8 | 
 9 | import sys
10 | import os
11 | from os.path import dirname
12 | path = dirname(dirname(os.path.abspath(os.path.dirname(__file__))))
13 | sys.path.append(path)
14 | from misc.log import *
15 | 
16 | BOT_NAME = 'general_spider'
17 | 
18 | SPIDER_MODULES = ['general_spider.spiders']
19 | NEWSPIDER_MODULE = 'general_spider.spiders'
20 | 
21 | # Crawl responsibly by identifying yourself (and your website) on the user-agent
22 | #USER_AGENT = 'general_spider (+http://www.yourdomain.com)'
23 | 
24 | DOWNLOADER_MIDDLEWARES = {
25 |    # 'misc.middleware.CustomHttpProxyMiddleware': 400,
26 |     'misc.middleware.CustomUserAgentMiddleware': 401,
27 | }
28 | 
29 | ITEM_PIPELINES = {
30 |     'general_spider.pipelines.JsonWithEncodingPipeline': 300,
31 |     #'general_spider.pipelines.RedisPipeline': 301,
32 | }
33 | 
34 | LOG_LEVEL = 'INFO'
35 | 
36 | DOWNLOAD_DELAY = 1
37 | 


--------------------------------------------------------------------------------
/general_spider/general_spider/spiders/BasicSpiderConfig.py:
--------------------------------------------------------------------------------
 1 | 
 2 | 
 3 | class ExRule:
 4 |     allowed_rule_regex = ''
 5 |     # list_css_rules
 6 |     paras = {}
 7 | 
 8 |     def __init__(self, allowed_rule_regex, **kwargs):
 9 |         self.allowed_rule_regex = allowed_rule_regex
10 |         self.paras = kwargs
11 | 
12 | 
13 | class BasicConfig:
14 |     name=''
15 |     allowed_domains=[]
16 |     # allowed_url_regex=[]
17 |     start_urls=[]
18 |     ex_rules = []
19 | 
20 | 
21 | 


--------------------------------------------------------------------------------
/general_spider/general_spider/spiders/__init__.py:
--------------------------------------------------------------------------------
1 | # This package will contain the spiders of your Scrapy project
2 | #
3 | # Please refer to the documentation for information on how to create and manage
4 | # your spiders.
5 | 


--------------------------------------------------------------------------------
/general_spider/general_spider/spiders/run.sh:
--------------------------------------------------------------------------------
1 | conf=${1:-scrapy_examples}
2 | scrapy crawl general_spider -a conf_module=$conf
3 | 


--------------------------------------------------------------------------------
/general_spider/general_spider/spiders/scrapy_examples.py:
--------------------------------------------------------------------------------
 1 | 
 2 | from BasicSpiderConfig import ExRule
 3 | 
 4 | class Config:
 5 | 
 6 |     list_css_rules = { 
 7 |         '.js-navigation-item': {
 8 |             'content': '.content a::text',
 9 |             'message': '.message a::text',
10 |             'age': '.age *::text',
11 |         }   
12 |     }
13 | 
14 |     ex_rule = ExRule('https://github.com/geekan/scrapy-examples$', list_css_rules=list_css_rules)
15 | 
16 |     name='scrapy_examples'
17 |     allowed_domains=['github.com']
18 |     start_urls=['https://github.com/geekan/scrapy-examples']
19 |     ex_rules = [ex_rule]
20 | 
21 | 


--------------------------------------------------------------------------------
/general_spider/general_spider/spiders/spider.py:
--------------------------------------------------------------------------------
 1 | import re
 2 | import json
 3 | from urlparse import urlparse
 4 | import urllib
 5 | import pdb
 6 | 
 7 | 
 8 | from scrapy.selector import Selector
 9 | try:
10 |     from scrapy.spiders import Spider
11 | except:
12 |     from scrapy.spiders import BaseSpider as Spider
13 | from scrapy.utils.response import get_base_url
14 | from scrapy.spiders import CrawlSpider, Rule
15 | from scrapy.linkextractors import LinkExtractor as sle
16 | 
17 | 
18 | from general_spider.items import *
19 | from misc.log import *
20 | from misc.spider import CommonSpider
21 | 
22 | 
23 | import BasicSpiderConfig
24 | 
25 | 
26 | class general_spiderSpider(CommonSpider):
27 |     name = 'general_spider'
28 | 
29 |     def __init__(self, conf_module='TestSpiderConfig', *args, **kwargs):
30 |         cm  = __import__(conf_module, globals=globals())
31 |         conf = cm.Config()
32 |         self.name = conf.name
33 |         self.allowed_domains = conf.allowed_domains
34 |         self.start_urls = conf.start_urls
35 |         self.rules = [Rule(sle(allow=(c.allowed_rule_regex)), callback='parse_1', cb_kwargs=c.paras, follow=True) for c in conf.ex_rules]
36 |         info(self.start_urls)
37 |         info(self.rules)
38 |         super(general_spiderSpider, self).__init__(*args, **kwargs)
39 | 
40 |     def parse_1(self, response, list_css_rules):
41 |         info('---------------------')
42 |         info('Parse '+response.url)
43 |         info('list_css_rules:')
44 |         info(list_css_rules)
45 |         x = self.parse_with_rules(response, list_css_rules, dict)
46 |         # x = self.parse_with_rules(response, self.content_css_rules, dict)
47 |         print(json.dumps(x, ensure_ascii=False, indent=2))
48 |         # pp.pprint(x)
49 |         # return self.parse_with_rules(response, self.css_rules, general_spiderItem)
50 |         return x
51 | 


--------------------------------------------------------------------------------
/general_spider/general_spider/spiders/v2ex.py:
--------------------------------------------------------------------------------
 1 | 
 2 | from BasicSpiderConfig import ExRule
 3 | 
 4 | class Config:
 5 | 
 6 |     list_css_rules = { 
 7 |         '.cell.item': {
 8 |             'title': '.item_title a::text',
 9 |             'node': '.node::text',
10 |             'author': '.node+ strong a::text',
11 |             'reply_count': '.count_livid::text'
12 |         }   
13 |     }
14 | 
15 |     ex_rule = ExRule('http://www.v2ex.com/$', list_css_rules=list_css_rules)
16 | 
17 |     name='v2ex'
18 |     allowed_domains=['www.v2ex.com']
19 |     start_urls=['http://www.v2ex.com/']
20 |     ex_rules = [ex_rule]
21 | 
22 | 


--------------------------------------------------------------------------------
/general_spider/scrapy.cfg:
--------------------------------------------------------------------------------
 1 | # Automatically created by: scrapy startproject
 2 | #
 3 | # For more information about the [deploy] section see:
 4 | # http://doc.scrapy.org/en/latest/topics/scrapyd.html
 5 | 
 6 | [settings]
 7 | default = general_spider.settings
 8 | 
 9 | [deploy]
10 | #url = http://localhost:6800/
11 | project = general_spider
12 | 


--------------------------------------------------------------------------------
/github_trending/github_trending/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/geekan/scrapy-examples/edb1cb116bd6def65a6ef01f953b58eb43e54305/github_trending/github_trending/__init__.py


--------------------------------------------------------------------------------
/github_trending/github_trending/items.py:
--------------------------------------------------------------------------------
 1 | # Define here the models for your scraped items
 2 | #
 3 | # See documentation in:
 4 | # http://doc.scrapy.org/en/latest/topics/items.html
 5 | 
 6 | from scrapy.item import Item, Field
 7 | 
 8 | class github_trendingItem(Item):
 9 |     # define the fields for your item here like:
10 |     name = Field()
11 | 
12 | 


--------------------------------------------------------------------------------
/github_trending/github_trending/pipelines.py:
--------------------------------------------------------------------------------
 1 | # Define your item pipelines here
 2 | #
 3 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting
 4 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
 5 | 
 6 | import redis
 7 | 
 8 | 
 9 | from scrapy import signals
10 | 
11 | 
12 | import json
13 | import codecs
14 | from collections import OrderedDict
15 | 
16 | 
17 | class JsonWithEncodingPipeline(object):
18 | 
19 |     def __init__(self):
20 |         self.file = codecs.open('data_utf8.json', 'w', encoding='utf-8')
21 | 
22 |     def process_item(self, item, spider):
23 |         line = json.dumps(OrderedDict(item), ensure_ascii=False, sort_keys=False) + "\n"
24 |         self.file.write(line)
25 |         return item
26 | 
27 |     def close_spider(self, spider):
28 |         self.file.close()
29 | 
30 | 
31 | class RedisPipeline(object):
32 | 
33 |     def __init__(self):
34 |         self.r = redis.StrictRedis(host='localhost', port=6379)
35 | 
36 |     def process_item(self, item, spider):
37 |         if not item['id']:
38 |             print 'no id item!!'
39 | 
40 |         str_recorded_item = self.r.get(item['id'])
41 |         final_item = None
42 |         if str_recorded_item is None:
43 |             final_item = item
44 |         else:
45 |             ritem = eval(self.r.get(item['id']))
46 |             final_item = dict(item.items() + ritem.items())
47 |         self.r.set(item['id'], final_item)
48 | 
49 |     def close_spider(self, spider):
50 |         return
51 | 


--------------------------------------------------------------------------------
/github_trending/github_trending/settings.py:
--------------------------------------------------------------------------------
 1 | # Scrapy settings for github_trending project
 2 | #
 3 | # For simplicity, this file contains only the most important settings by
 4 | # default. All the other settings are documented here:
 5 | #
 6 | #     http://doc.scrapy.org/en/latest/topics/settings.html
 7 | #
 8 | 
 9 | import sys
10 | import os
11 | from os.path import dirname
12 | path = dirname(dirname(os.path.abspath(os.path.dirname(__file__))))
13 | sys.path.append(path)
14 | from misc.log import *
15 | 
16 | BOT_NAME = 'github_trending'
17 | 
18 | SPIDER_MODULES = ['github_trending.spiders']
19 | NEWSPIDER_MODULE = 'github_trending.spiders'
20 | 
21 | # Crawl responsibly by identifying yourself (and your website) on the user-agent
22 | #USER_AGENT = 'github_trending (+http://www.yourdomain.com)'
23 | 
24 | DOWNLOADER_MIDDLEWARES = {
25 |    # 'misc.middleware.CustomHttpProxyMiddleware': 400,
26 |     'misc.middleware.CustomUserAgentMiddleware': 401,
27 | }
28 | 
29 | ITEM_PIPELINES = {
30 |     'github_trending.pipelines.JsonWithEncodingPipeline': 300,
31 |     #'github_trending.pipelines.RedisPipeline': 301,
32 | }
33 | 
34 | LOG_LEVEL = 'INFO'
35 | 
36 | DOWNLOAD_DELAY = 1
37 | 


--------------------------------------------------------------------------------
/github_trending/github_trending/spiders/__init__.py:
--------------------------------------------------------------------------------
1 | # This package will contain the spiders of your Scrapy project
2 | #
3 | # Please refer to the documentation for information on how to create and manage
4 | # your spiders.
5 | 


--------------------------------------------------------------------------------
/github_trending/github_trending/spiders/spider.py:
--------------------------------------------------------------------------------
 1 | import re
 2 | import json
 3 | from urlparse import urlparse
 4 | import urllib
 5 | import pdb
 6 | 
 7 | 
 8 | from scrapy.selector import Selector
 9 | try:
10 |     from scrapy.spiders import Spider
11 | except:
12 |     from scrapy.spiders import BaseSpider as Spider
13 | from scrapy.utils.response import get_base_url
14 | from scrapy.spiders import CrawlSpider, Rule
15 | from scrapy.linkextractors import LinkExtractor as sle
16 | 
17 | 
18 | from github_trending.items import *
19 | from misc.log import *
20 | from misc.spider import CommonSpider
21 | 
22 | 
23 | class github_trendingSpider(CommonSpider):
24 |     name = "github_trending"
25 |     allowed_domains = ["github.com"]
26 |     start_urls = [
27 |         "http://www.github.com/trending",
28 |     ]
29 |     rules = [
30 |         Rule(sle(allow=("/trending$")), callback='parse_1', follow=True),
31 |     ]
32 | 
33 |     list_css_rules = { 
34 |         '.repo-list-item': {
35 |             'repo_name': '.repo-list-name a::attr(href)',
36 |             'repo_meta': '.repo-list-meta::text',
37 |         }   
38 |     }   
39 | 
40 |     content_css_rules = { 
41 |         'text': '#Cnt-Main-Article-QQ p *::text',
42 |         'images': '#Cnt-Main-Article-QQ img::attr(src)',
43 |         'images-desc': '#Cnt-Main-Article-QQ div p+ p::text',
44 |     }
45 | 
46 |     def parse_1(self, response):
47 |         info('Parse '+response.url)
48 |         x = self.parse_with_rules(response, self.list_css_rules, dict)
49 |         # x = self.parse_with_rules(response, self.content_css_rules, dict)
50 |         print(json.dumps(x, ensure_ascii=False, indent=2))
51 |         # pp.pprint(x)
52 |         # return self.parse_with_rules(response, self.css_rules, github_trendingItem)
53 | 


--------------------------------------------------------------------------------
/github_trending/scrapy.cfg:
--------------------------------------------------------------------------------
 1 | # Automatically created by: scrapy startproject
 2 | #
 3 | # For more information about the [deploy] section see:
 4 | # http://doc.scrapy.org/en/latest/topics/scrapyd.html
 5 | 
 6 | [settings]
 7 | default = github_trending.settings
 8 | 
 9 | [deploy]
10 | #url = http://localhost:6800/
11 | project = github_trending
12 | 


--------------------------------------------------------------------------------
/googlescholar/googlescholar/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/geekan/scrapy-examples/edb1cb116bd6def65a6ef01f953b58eb43e54305/googlescholar/googlescholar/__init__.py


--------------------------------------------------------------------------------
/googlescholar/googlescholar/items.py:
--------------------------------------------------------------------------------
 1 | # Define here the models for your scraped items
 2 | #
 3 | # See documentation in:
 4 | # http://doc.scrapy.org/en/latest/topics/items.html
 5 | 
 6 | from scrapy.item import Item, Field
 7 | 
 8 | class googlescholarItem(Item):
 9 |     # define the fields for your item here like:
10 |     name = Field()
11 | 
12 | 


--------------------------------------------------------------------------------
/googlescholar/googlescholar/pipelines.py:
--------------------------------------------------------------------------------
 1 | # Define your item pipelines here
 2 | #
 3 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting
 4 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
 5 | 
 6 | import redis
 7 | 
 8 | 
 9 | from scrapy import signals
10 | 
11 | 
12 | import json
13 | import codecs
14 | from collections import OrderedDict
15 | 
16 | 
17 | class JsonWithEncodingPipeline(object):
18 | 
19 |     def __init__(self):
20 |         self.file = codecs.open('data_utf8.json', 'w', encoding='utf-8')
21 | 
22 |     def process_item(self, item, spider):
23 |         line = json.dumps(OrderedDict(item), ensure_ascii=False, sort_keys=False) + "\n"
24 |         self.file.write(line)
25 |         return item
26 | 
27 |     def close_spider(self, spider):
28 |         self.file.close()
29 | 
30 | 
31 | class RedisPipeline(object):
32 | 
33 |     def __init__(self):
34 |         self.r = redis.StrictRedis(host='localhost', port=6379)
35 | 
36 |     def process_item(self, item, spider):
37 |         if not item['id']:
38 |             print 'no id item!!'
39 | 
40 |         str_recorded_item = self.r.get(item['id'])
41 |         final_item = None
42 |         if str_recorded_item is None:
43 |             final_item = item
44 |         else:
45 |             ritem = eval(self.r.get(item['id']))
46 |             final_item = dict(item.items() + ritem.items())
47 |         self.r.set(item['id'], final_item)
48 | 
49 |     def close_spider(self, spider):
50 |         return
51 | 


--------------------------------------------------------------------------------
/googlescholar/googlescholar/settings.py:
--------------------------------------------------------------------------------
 1 | # Scrapy settings for googlescholar project
 2 | #
 3 | # For simplicity, this file contains only the most important settings by
 4 | # default. All the other settings are documented here:
 5 | #
 6 | #     http://doc.scrapy.org/en/latest/topics/settings.html
 7 | #
 8 | 
 9 | import sys
10 | import os
11 | from os.path import dirname
12 | path = dirname(dirname(os.path.abspath(os.path.dirname(__file__))))
13 | sys.path.append(path)
14 | from misc.log import *
15 | 
16 | BOT_NAME = 'googlescholar'
17 | 
18 | SPIDER_MODULES = ['googlescholar.spiders']
19 | NEWSPIDER_MODULE = 'googlescholar.spiders'
20 | 
21 | # Crawl responsibly by identifying yourself (and your website) on the user-agent
22 | #USER_AGENT = 'googlescholar (+http://www.yourdomain.com)'
23 | 
24 | DOWNLOADER_MIDDLEWARES = {
25 |     #'misc.middleware.CustomHttpProxyMiddleware': 400,
26 |     'misc.middleware.CustomUserAgentMiddleware': 401,
27 | }
28 | 
29 | ITEM_PIPELINES = {
30 |     'googlescholar.pipelines.JsonWithEncodingPipeline': 300,
31 |     #'googlescholar.pipelines.RedisPipeline': 301,
32 | }
33 | 
34 | LOG_LEVEL = 'INFO'
35 | 
36 | DOWNLOAD_DELAY = 1
37 | 


--------------------------------------------------------------------------------
/googlescholar/googlescholar/spiders/__init__.py:
--------------------------------------------------------------------------------
1 | # This package will contain the spiders of your Scrapy project
2 | #
3 | # Please refer to the documentation for information on how to create and manage
4 | # your spiders.
5 | 


--------------------------------------------------------------------------------
/googlescholar/googlescholar/spiders/spider.py:
--------------------------------------------------------------------------------
 1 | import re
 2 | import json
 3 | from urlparse import urlparse
 4 | import urllib
 5 | import pdb
 6 | 
 7 | 
 8 | from scrapy.selector import Selector
 9 | try:
10 |     from scrapy.spiders import Spider
11 | except:
12 |     from scrapy.spiders import BaseSpider as Spider
13 | from scrapy.utils.response import get_base_url
14 | from scrapy.spiders import CrawlSpider, Rule
15 | from scrapy.linkextractors import LinkExtractor as sle
16 | 
17 | 
18 | from googlescholar.items import *
19 | from misc.log import *
20 | from misc.spider import CommonSpider
21 | 
22 | 
23 | class googlescholarSpider(CommonSpider):
24 |     name = "googlescholar"
25 |     allowed_domains = ["google.com"]
26 |     start_urls = [
27 |         "http://scholar.google.com/scholar?as_ylo=2011&q=machine+learning&hl=en&as_sdt=0,5",
28 |         #"http://scholar.google.com/scholar?q=estimate+ctr&btnG=&hl=en&as_sdt=0%2C5&as_ylo=2011",
29 |         #"http://scholar.google.com",
30 |     ]
31 |     rules = [
32 |         Rule(sle(allow=("scholar\?.*")), callback='parse_1', follow=False),
33 |     ]
34 | 
35 |     def __init__(self, start_url='', *args, **kwargs):
36 |         if start_url:
37 |             self.start_urls = [start_url]
38 |         super(googlescholarSpider, self).__init__(*args, **kwargs)
39 | 
40 |     #.gs_ri: content besides related html/pdf
41 |     list_css_rules = {
42 |         '.gs_r': {
43 |             'title': '.gs_rt a *::text',
44 |             'url': '.gs_rt a::attr(href)',
45 |             'related-text': '.gs_ggsS::text',
46 |             'related-type': '.gs_ggsS .gs_ctg2::text',
47 |             'related-url': '.gs_ggs a::attr(href)',
48 |             'citation-text': '.gs_fl > a:nth-child(1)::text',
49 |             'citation-url': '.gs_fl > a:nth-child(1)::attr(href)',
50 |             'authors': '.gs_a a::text',
51 |             'description': '.gs_rs *::text',
52 |             'journal-year-src': '.gs_a::text',
53 |         }
54 |     }
55 | 
56 |     def parse_1(self, response):
57 |         info('Parse '+response.url)
58 |         #sel = Selector(response)
59 |         #v = sel.css('.gs_ggs a::attr(href)').extract()
60 |         #import pdb; pdb.set_trace()
61 |         x = self.parse_with_rules(response, self.list_css_rules, dict)
62 |         pp.pprint(x[0]['.gs_r'])
63 |         # return self.parse_with_rules(response, self.css_rules, googlescholarItem)
64 | 


--------------------------------------------------------------------------------
/googlescholar/scrapy.cfg:
--------------------------------------------------------------------------------
 1 | # Automatically created by: scrapy startproject
 2 | #
 3 | # For more information about the [deploy] section see:
 4 | # http://doc.scrapy.org/en/latest/topics/scrapyd.html
 5 | 
 6 | [settings]
 7 | default = googlescholar.settings
 8 | 
 9 | [deploy]
10 | #url = http://localhost:6800/
11 | project = googlescholar
12 | 


--------------------------------------------------------------------------------
/hacker_news/hacker_news/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/geekan/scrapy-examples/edb1cb116bd6def65a6ef01f953b58eb43e54305/hacker_news/hacker_news/__init__.py


--------------------------------------------------------------------------------
/hacker_news/hacker_news/items.py:
--------------------------------------------------------------------------------
 1 | # Define here the models for your scraped items
 2 | #
 3 | # See documentation in:
 4 | # http://doc.scrapy.org/en/latest/topics/items.html
 5 | 
 6 | from scrapy.item import Item, Field
 7 | 
 8 | class hacker_newsItem(Item):
 9 |     # define the fields for your item here like:
10 |     name = Field()
11 | 
12 | 


--------------------------------------------------------------------------------
/hacker_news/hacker_news/pipelines.py:
--------------------------------------------------------------------------------
 1 | # Define your item pipelines here
 2 | #
 3 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting
 4 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
 5 | 
 6 | import redis
 7 | 
 8 | 
 9 | from scrapy import signals
10 | 
11 | 
12 | import json
13 | import codecs
14 | from collections import OrderedDict
15 | 
16 | 
17 | class JsonWithEncodingPipeline(object):
18 | 
19 |     def __init__(self):
20 |         self.file = codecs.open('data_utf8.json', 'w', encoding='utf-8')
21 | 
22 |     def process_item(self, item, spider):
23 |         line = json.dumps(OrderedDict(item), ensure_ascii=False, sort_keys=False) + "\n"
24 |         self.file.write(line)
25 |         return item
26 | 
27 |     def close_spider(self, spider):
28 |         self.file.close()
29 | 
30 | 
31 | class RedisPipeline(object):
32 | 
33 |     def __init__(self):
34 |         self.r = redis.StrictRedis(host='localhost', port=6379)
35 | 
36 |     def process_item(self, item, spider):
37 |         if not item['id']:
38 |             print 'no id item!!'
39 | 
40 |         str_recorded_item = self.r.get(item['id'])
41 |         final_item = None
42 |         if str_recorded_item is None:
43 |             final_item = item
44 |         else:
45 |             ritem = eval(self.r.get(item['id']))
46 |             final_item = dict(item.items() + ritem.items())
47 |         self.r.set(item['id'], final_item)
48 | 
49 |     def close_spider(self, spider):
50 |         return
51 | 


--------------------------------------------------------------------------------
/hacker_news/hacker_news/settings.py:
--------------------------------------------------------------------------------
 1 | # Scrapy settings for hacker_news project
 2 | #
 3 | # For simplicity, this file contains only the most important settings by
 4 | # default. All the other settings are documented here:
 5 | #
 6 | #     http://doc.scrapy.org/en/latest/topics/settings.html
 7 | #
 8 | 
 9 | import sys
10 | import os
11 | from os.path import dirname
12 | path = dirname(dirname(os.path.abspath(os.path.dirname(__file__))))
13 | sys.path.append(path)
14 | from misc.log import *
15 | 
16 | BOT_NAME = 'hacker_news'
17 | 
18 | SPIDER_MODULES = ['hacker_news.spiders']
19 | NEWSPIDER_MODULE = 'hacker_news.spiders'
20 | 
21 | # Crawl responsibly by identifying yourself (and your website) on the user-agent
22 | #USER_AGENT = 'hacker_news (+http://www.yourdomain.com)'
23 | 
24 | DOWNLOADER_MIDDLEWARES = {
25 |    # 'misc.middleware.CustomHttpProxyMiddleware': 400,
26 |     'misc.middleware.CustomUserAgentMiddleware': 401,
27 | }
28 | 
29 | ITEM_PIPELINES = {
30 |     'hacker_news.pipelines.JsonWithEncodingPipeline': 300,
31 |     #'hacker_news.pipelines.RedisPipeline': 301,
32 | }
33 | 
34 | LOG_LEVEL = 'INFO'
35 | 
36 | DOWNLOAD_DELAY = 1
37 | 


--------------------------------------------------------------------------------
/hacker_news/hacker_news/spiders/__init__.py:
--------------------------------------------------------------------------------
1 | # This package will contain the spiders of your Scrapy project
2 | #
3 | # Please refer to the documentation for information on how to create and manage
4 | # your spiders.
5 | 


--------------------------------------------------------------------------------
/hacker_news/hacker_news/spiders/spider.py:
--------------------------------------------------------------------------------
 1 | import re
 2 | import json
 3 | from urlparse import urlparse
 4 | import urllib
 5 | import pdb
 6 | 
 7 | 
 8 | from scrapy.selector import Selector
 9 | try:
10 |     from scrapy.spiders import Spider
11 | except:
12 |     from scrapy.spiders import BaseSpider as Spider
13 | from scrapy.utils.response import get_base_url
14 | from scrapy.spiders import CrawlSpider, Rule
15 | from scrapy.linkextractors import LinkExtractor as sle
16 | 
17 | 
18 | from hacker_news.items import *
19 | from misc.log import *
20 | from misc.spider import CommonSpider
21 | 
22 | 
23 | class hacker_newsSpider(CommonSpider):
24 |     name = "hacker_news"
25 |     allowed_domains = ["news.ycombinator.com"]
26 |     start_urls = [
27 |         "https://news.ycombinator.com/",
28 |     ]
29 |     rules = [
30 |         Rule(sle(allow=("https://news.ycombinator.com/$")), callback='parse_1', follow=True),
31 |     ]
32 | 
33 |     list_css_rules = { 
34 |         'title': '.storylink::text',
35 |         'desc': '.subtext .score::text',
36 |     }   
37 | 
38 |     content_css_rules = { 
39 |         'text': '#Cnt-Main-Article-QQ p *::text',
40 |         'images': '#Cnt-Main-Article-QQ img::attr(src)',
41 |         'images-desc': '#Cnt-Main-Article-QQ div p+ p::text',
42 |     }
43 | 
44 |     def parse_1(self, response):
45 |         info('Parse '+response.url)
46 |         x = self.parse_with_rules(response, self.list_css_rules, dict)
47 |         # x = self.parse_with_rules(response, self.content_css_rules, dict)
48 |         print(json.dumps(x, ensure_ascii=False, indent=2))
49 |         # pp.pprint(x)
50 |         # return self.parse_with_rules(response, self.css_rules, hacker_newsItem)
51 | 


--------------------------------------------------------------------------------
/hacker_news/scrapy.cfg:
--------------------------------------------------------------------------------
 1 | # Automatically created by: scrapy startproject
 2 | #
 3 | # For more information about the [deploy] section see:
 4 | # http://doc.scrapy.org/en/latest/topics/scrapyd.html
 5 | 
 6 | [settings]
 7 | default = hacker_news.settings
 8 | 
 9 | [deploy]
10 | #url = http://localhost:6800/
11 | project = hacker_news
12 | 


--------------------------------------------------------------------------------
/hrtencent/hrtencent/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/geekan/scrapy-examples/edb1cb116bd6def65a6ef01f953b58eb43e54305/hrtencent/hrtencent/__init__.py


--------------------------------------------------------------------------------
/hrtencent/hrtencent/data_utf8.json:
--------------------------------------------------------------------------------
 1 | {"bottomline": ["深圳", "设计类", "1人"], "sharetitle": ["SD3-互娱2D原画设计师（深圳）"], "link": "http://hr.tencent.com/position_detail.php?id=16300&keywords=&lid=0&tid=0"}
 2 | {"bottomline": ["深圳", "技术类", "2人"], "sharetitle": ["SD9-移动终端游戏开发工程师（深圳）"], "link": "http://hr.tencent.com/position_detail.php?id=16709&keywords=&lid=0&tid=0"}
 3 | {"bottomline": ["深圳", "产品/项目类", "1人"], "sharetitle": ["MIG12-高级数据分析经理（深圳）"], "link": "http://hr.tencent.com/position_detail.php?id=12799&keywords=&lid=0&tid=0"}
 4 | {"bottomline": ["深圳", "技术类", "5人"], "sharetitle": ["SNG03-QQ iOS开发工程师（深圳）"], "link": "http://hr.tencent.com/position_detail.php?id=16602&keywords=&lid=0&tid=0"}
 5 | {"bottomline": ["深圳", "技术类", "4人"], "sharetitle": ["TEG07-PHP开发工程师（深圳）"], "link": "http://hr.tencent.com/position_detail.php?id=16714&keywords=&lid=0&tid=0"}
 6 | {"bottomline": ["上海", "技术类", "2人"], "sharetitle": ["SNG15-广告业务Java开发工程师（上海）"], "link": "http://hr.tencent.com/position_detail.php?id=16639&keywords=&lid=0&tid=0"}
 7 | {"bottomline": ["深圳", "技术类", "1人"], "sharetitle": ["MIG12-技术运营工程师（深圳）"], "link": "http://hr.tencent.com/position_detail.php?id=16458&keywords=&lid=0&tid=0"}
 8 | {"bottomline": ["深圳", "技术类", "9人"], "sharetitle": ["14413-android终端开发工程师（深圳）"], "link": "http://hr.tencent.com/position_detail.php?id=16243&keywords=&lid=0&tid=0"}
 9 | {"bottomline": ["深圳", "职能类", "4人"], "sharetitle": ["S2-财经系统需求分析师（深圳）"], "link": "http://hr.tencent.com/position_detail.php?id=16582&keywords=&lid=0&tid=0"}
10 | {"bottomline": ["深圳", "技术类", "4人"], "sharetitle": ["14413-Android终端开发工程师（深圳）"], "link": "http://hr.tencent.com/position_detail.php?id=13424&keywords=&lid=0&tid=0"}
11 | {"bottomline": ["北京", "内容编辑类", "2人"], "sharetitle": ["OMG10-腾讯综艺视频编辑（北京）"], "link": "http://hr.tencent.com/position_detail.php?id=15775&keywords=&lid=0&tid=0"}
12 | {"bottomline": ["北京", "内容编辑类", "1人"], "sharetitle": ["OMG10-腾讯视频综艺运营编辑（北京）"], "link": "http://hr.tencent.com/position_detail.php?id=14937&keywords=&lid=0&tid=0"}
13 | {"bottomline": ["深圳", "产品/项目类", "6人"], "sharetitle": ["13552-Global Operation Talents (shenzhen)"], "link": "http://hr.tencent.com/position_detail.php?id=15974&keywords=&lid=0&tid=0"}
14 | {"bottomline": ["上海", "设计类", "2人"], "sharetitle": ["SD8-3D美术设计师（上海）"], "link": "http://hr.tencent.com/position_detail.php?id=16730&keywords=&lid=0&tid=0"}
15 | {"bottomline": ["上海", "产品/项目类", "2人"], "sharetitle": ["SD8-移动终端手游策划（上海）"], "link": "http://hr.tencent.com/position_detail.php?id=13299&keywords=&lid=0&tid=0"}
16 | {"bottomline": ["北京", "设计类", "1人"], "sharetitle": ["SNG10-交互设计师（北京）"], "link": "http://hr.tencent.com/position_detail.php?id=16305&keywords=&lid=0&tid=0"}
17 | {"bottomline": ["成都", "技术类", "1人"], "sharetitle": ["SD2-后台开发工程师（成都）"], "link": "http://hr.tencent.com/position_detail.php?id=12133&keywords=&lid=0&tid=0"}
18 | {"bottomline": ["北京", "市场类", "1人"], "sharetitle": ["SNG15-市场活动策划与执行（北京）"], "link": "http://hr.tencent.com/position_detail.php?id=16266&keywords=&lid=0&tid=0"}
19 | {"bottomline": ["深圳", "技术类", "1人"], "sharetitle": ["SD9-互娱后台开发工程师（深圳）"], "link": "http://hr.tencent.com/position_detail.php?id=16708&keywords=&lid=0&tid=0"}
20 | {"bottomline": ["深圳", "技术类", "1人"], "sharetitle": ["SD5-后台开发工程师（深圳）"], "link": "http://hr.tencent.com/position_detail.php?id=14854&keywords=&lid=0&tid=0"}
21 | {"bottomline": ["深圳", "产品/项目类", "14人"], "sharetitle": ["13551-移动游戏数据分析与商业化运营策划（深圳）"], "link": "http://hr.tencent.com/position_detail.php?id=14982&keywords=&lid=0&tid=0"}
22 | 


--------------------------------------------------------------------------------
/hrtencent/hrtencent/items.py:
--------------------------------------------------------------------------------
 1 | # Define here the models for your scraped items
 2 | #
 3 | # See documentation in:
 4 | # http://doc.scrapy.org/en/latest/topics/items.html
 5 | 
 6 | from scrapy.item import Item, Field
 7 | 
 8 | class PositionDetailItem(Item):
 9 |     title = Field()
10 |     link = Field()
11 |     sharetitle  = Field()
12 |     bottomline = Field()
13 |     duty = Field()
14 |     xxx = Field()
15 | 


--------------------------------------------------------------------------------
/hrtencent/hrtencent/pipelines.py:
--------------------------------------------------------------------------------
 1 | # Define your item pipelines here
 2 | #
 3 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting
 4 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
 5 | 
 6 | from scrapy import signals
 7 | 
 8 | 
 9 | import json
10 | import codecs
11 | 
12 | 
13 | class JsonWithEncodingPipeline(object):
14 | 
15 |     def __init__(self):
16 |         self.file = codecs.open('data_utf8.json', 'w', encoding='utf-8')
17 | 
18 |     def process_item(self, item, spider):
19 |         line = json.dumps(dict(item), ensure_ascii=False) + "\n"
20 |         self.file.write(line)
21 |         return item
22 | 
23 |     def close_spider(self, spider):
24 |         self.file.close()
25 | 


--------------------------------------------------------------------------------
/hrtencent/hrtencent/settings.py:
--------------------------------------------------------------------------------
 1 | # Scrapy settings for hrtencent project
 2 | #
 3 | # For simplicity, this file contains only the most important settings by
 4 | # default. All the other settings are documented here:
 5 | #
 6 | #     http://doc.scrapy.org/en/latest/topics/settings.html
 7 | #
 8 | 
 9 | import sys
10 | import os
11 | from os.path import dirname
12 | path = dirname(dirname(os.path.abspath(os.path.dirname(__file__))))
13 | sys.path.append(path)
14 | 
15 | BOT_NAME = 'hrtencent'
16 | 
17 | SPIDER_MODULES = ['hrtencent.spiders']
18 | NEWSPIDER_MODULE = 'hrtencent.spiders'
19 | 
20 | # Crawl responsibly by identifying yourself (and your website) on the user-agent
21 | #USER_AGENT = 'hrtencent (+http://www.yourdomain.com)'
22 | DOWNLOADER_MIDDLEWARES = {
23 |     #'misc.middleware.CustomHttpProxyMiddleware': 400,
24 |     'misc.middleware.CustomUserAgentMiddleware': 401,
25 | }
26 | 
27 | ITEM_PIPELINES = {
28 |     'hrtencent.pipelines.JsonWithEncodingPipeline': 300,
29 | }
30 | 
31 | LOG_LEVEL = 'INFO'
32 | 
33 | 


--------------------------------------------------------------------------------
/hrtencent/hrtencent/spiders/__init__.py:
--------------------------------------------------------------------------------
1 | # This package will contain the spiders of your Scrapy project
2 | #
3 | # Please refer to the documentation for information on how to create and manage
4 | # your spiders.
5 | 


--------------------------------------------------------------------------------
/hrtencent/hrtencent/spiders/hrtencent_spider.py:
--------------------------------------------------------------------------------
 1 | import re
 2 | import json
 3 | 
 4 | 
 5 | from scrapy.selector import Selector
 6 | try:
 7 |     from scrapy.spiders import Spider
 8 | except:
 9 |     from scrapy.spiders import BaseSpider as Spider
10 | from scrapy.utils.response import get_base_url
11 | from scrapy.spiders import CrawlSpider, Rule
12 | from scrapy.linkextractors import LinkExtractor as sle
13 | 
14 | 
15 | from hrtencent.items import *
16 | from misc.log import *
17 | 
18 | 
19 | class HrtencentSpider(CrawlSpider):
20 |     name = "hrtencent"
21 |     allowed_domains = ["tencent.com"]
22 |     start_urls = [
23 |         "http://hr.tencent.com/position.php?start=%d" % d for d in range(0, 20, 10)
24 |     ]
25 |     rules = [
26 |         Rule(sle(allow=("/position_detail.php\?id=\d*.*", )), callback='parse_2'),
27 |         Rule(sle(allow=("/position.php\?&start=\d{,2}#a")), follow=True, callback='parse_1')
28 |     ]
29 | 
30 |     def parse_2(self, response):
31 |         items = []
32 |         sel = Selector(response)
33 |         sites = sel.css('.tablelist')
34 |         for site in sites:
35 |             item = PositionDetailItem()
36 |             item['sharetitle'] = site.css('.h #sharetitle::text').extract()
37 |             item['bottomline'] = site.css('.bottomline td::text').extract()
38 |             # item['duty'] = site.css('.c .l2::text').extract()
39 |             item['link'] = response.url
40 |             items.append(item)
41 |             print repr(item).decode("unicode-escape") + '\n'
42 |         # info('parsed ' + str(response))
43 |         self.parse_1(response)
44 |         return items
45 | 
46 |     def parse_1(self, response):
47 |         # url cannot encode to Chinese easily.. XXX
48 |         info('parsed ' + str(response))
49 | 
50 |     def _process_request(self, request):
51 |         info('process ' + str(request))
52 |         return request
53 | 


--------------------------------------------------------------------------------
/hrtencent/scrapy.cfg:
--------------------------------------------------------------------------------
 1 | # Automatically created by: scrapy startproject
 2 | #
 3 | # For more information about the [deploy] section see:
 4 | # http://doc.scrapy.org/en/latest/topics/scrapyd.html
 5 | 
 6 | [settings]
 7 | default = hrtencent.settings
 8 | 
 9 | [deploy]
10 | #url = http://localhost:6800/
11 | project = hrtencent
12 | 


--------------------------------------------------------------------------------
/linkedin/README.md:
--------------------------------------------------------------------------------
 1 | scrapy-linkedin
 2 | ===============
 3 | 
 4 | Using Scrapy to get Linkedin's person public profile.
 5 | 
 6 | ### feature
 7 | * Get all **public** profile
 8 | * Using Scrapy
 9 | * Enable auto throttle
10 | * Enable naive proxy providing
11 | * Agent rotating
12 | * Support Unicode
13 | * Using MongoDB as Backend
14 | * ...
15 | 
16 | 
17 | ### Dependency
18 | * Scrapy == 0.20
19 | * pymongo 
20 | * BeautifulSoup4, UnicodeDammit
21 | 
22 | 
23 | ### usage
24 | 	1. start a MongoDB instance, `mongod`
25 | 	2. run the crawler, `scrapy crawl LinkedinSpider`
26 | 
27 | you may found `Rakefile` helpful.
28 | 
29 | 
30 | ### configuration
31 | you can change MongoDB setting ang other things in `settings.py`. 
32 | 
33 | ### note
34 | if you just need whatever public profiles, there are better ways to do it. 
35 | check out these urls: http://www.linkedin.com/directory/people/[a-z].html
36 | 
37 | Our strategy is following `also-view` links in public profile.
38 | 
39 | ### One more thing
40 | This is a toy project a few years ago. Now I won't maintain it anymore, questions about this project will be ignored. You can read the code, there isn't much. 
41 | I hope this project can help you get a basic understanding of Scrapy, then you can make your own Spider. 
42 | 


--------------------------------------------------------------------------------
/linkedin/doc/db-scheme.md:
--------------------------------------------------------------------------------
 1 | Mongodb Scheme(**draft**)
 2 | 
 3 | ---
 4 | 
 5 | **PersonProfile**
 6 | 
 7 | 	PersonProfile
 8 | 	{
 9 | 		linkedin_id:'id',
10 | 		locality:'beijing',
11 | 		industry:'Research',
12 | 		summary:'I am a professor…',
13 | 		
14 | 		skills:
15 | 				[
16 | 					'data mining',
17 | 					'machine learning'
18 | 				],
19 | 				
20 | 		specilities:
21 | 				[
22 | 					'data mining',
23 | 				],
24 | 				
25 | 		interests:
26 | 				[
27 | 					'data mining',
28 | 					'machine learning'
29 | 				],
30 | 				
31 | 		groups:
32 | 				{
33 | 					'member',
34 | 					'affiliation':
35 | 								[
36 | 									'kdd 2012'
37 | 								]
38 | 				}
39 | 				
40 | 		honors:
41 | 				[
42 | 					'first prize',
43 | 				],
44 | 		
45 | 		education:
46 | 				[
47 | 					{
48 | 						school_name: 'a',
49 | 						period: '1991-2012',
50 | 						desc:'topic model'
51 | 					},
52 | 				],
53 | 				
54 | 		experience:
55 | 				[
56 | 					{
57 | 						title:'associate professor',
58 | 						organization:'tsinghua',
59 | 						period:'1999-2000',
60 | 						description:'research about data mining',
61 | 					},
62 | 				],
63 | 				
64 | 		also_view:
65 | 				[
66 | 					{
67 | 						'linkedin_id':'asd',
68 | 						'url':'http',
69 | 					}
70 | 				],
71 | 		
72 | 	}


--------------------------------------------------------------------------------
/linkedin/linkedin/linkedin/Rakefile:
--------------------------------------------------------------------------------
 1 | task :start do
 2 |     system 'scrapy crawl LinkedinSpider'
 3 | end
 4 | 
 5 | 
 6 | task :start_with_resume, :job_name do |t, args|
 7 |     if !File.exists?("crawl_jobs")
 8 |         FileUtil.mkdir("crawl_jobs")
 9 |     end
10 |     system "scrapy crawl LinkedinSpider -s JOBDIR=crawl_jobs/#{args.job_name}"
11 | end
12 | 
13 | task :reload_proxy do
14 |     system 'python reload_proxy.py'
15 | end
16 | 


--------------------------------------------------------------------------------
/linkedin/linkedin/linkedin/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/geekan/scrapy-examples/edb1cb116bd6def65a6ef01f953b58eb43e54305/linkedin/linkedin/linkedin/__init__.py


--------------------------------------------------------------------------------
/linkedin/linkedin/linkedin/db.py:
--------------------------------------------------------------------------------
 1 | from linkedin import settings
 2 | import pymongo
 3 | 
 4 | class MongoDBClient(object):
 5 |     def __init__(self, col, index=None):        
 6 |         connection = pymongo.Connection(settings.MONGODB_SERVER, settings.MONGODB_PORT)
 7 |         self.db = connection[settings.MONGODB_DB]
 8 |         self.collection = self.db[col]
 9 |         if index:
10 |             self.collection.create_index(index, unique=True)
11 |             
12 |     def get_collection(self):
13 |         return self.collection
14 |     
15 |     def _walk(self):
16 |         """
17 |         generator of all the documents in this collection
18 |         """
19 |         skip = 0
20 |         limit = 1000
21 |         hasMore = True
22 |         while hasMore:
23 |             res = self.collection.find(skip=skip, limit=limit)
24 |             hasMore = (res.count(with_limit_and_skip=True) == limit)
25 |             for x in res:
26 |                 yield x
27 |             skip += limit
28 |         
29 |     def walk(self):
30 |         """
31 |         return all the documents in this collection
32 |         """
33 |         docs = []
34 |         for doc in self._walk():
35 |             docs.append(doc)
36 |         return docs
37 |     
38 | 


--------------------------------------------------------------------------------
/linkedin/linkedin/linkedin/items.py:
--------------------------------------------------------------------------------
 1 | # Define here the models for your scraped items
 2 | #
 3 | # See documentation in:
 4 | # http://doc.scrapy.org/topics/items.html
 5 | 
 6 | from scrapy.item import Item, Field
 7 | 
 8 | class LinkedinItem(Item):
 9 |     # define the fields for your item here like:
10 |     # name = Field()
11 |     pass
12 | 
13 | 
14 | class PersonProfileItem(Item):
15 |     _id = Field()
16 |     url = Field()
17 |     name = Field()
18 |     also_view = Field()
19 |     education = Field()
20 |     locality = Field()
21 |     industry = Field()
22 |     summary = Field()
23 |     specilities = Field()
24 |     skills = Field()
25 |     interests = Field()
26 |     group = Field()
27 |     honors = Field()
28 |     education = Field()
29 |     experience = Field()
30 |     overview_html = Field()
31 |     homepage = Field()
32 |     
33 | 


--------------------------------------------------------------------------------
/linkedin/linkedin/linkedin/middleware.py:
--------------------------------------------------------------------------------
 1 | from scrapy import log
 2 | from proxy import PROXIES
 3 | from agents import AGENTS
 4 | 
 5 | import random
 6 | 
 7 | """
 8 | Custom proxy provider. 
 9 | """
10 | class CustomHttpProxyMiddleware(object):
11 |     
12 |     def process_request(self, request, spider):
13 |         # TODO implement complex proxy providing algorithm
14 |         if self.use_proxy(request):
15 |             p = random.choice(PROXIES)
16 |             try:
17 |                 request.meta['proxy'] = "http://%s" % p['ip_port']
18 |             except Exception, e:
19 |                 log.msg("Exception %s" % e, _level=log.CRITICAL)
20 |                 
21 |     
22 |     def use_proxy(self, request):
23 |         """
24 |         using direct download for depth <= 2
25 |         using proxy with probability 0.3
26 |         """
27 |         if "depth" in request.meta and int(request.meta['depth']) <= 2:
28 |             return False
29 |         i = random.randint(1, 10)
30 |         return i <= 2
31 |     
32 |     
33 | """
34 | change request header nealy every time
35 | """
36 | class CustomUserAgentMiddleware(object):
37 |     def process_request(self, request, spider):
38 |         agent = random.choice(AGENTS)
39 |         request.headers['User-Agent'] = agent
40 | 


--------------------------------------------------------------------------------
/linkedin/linkedin/linkedin/parser/LinkedinParser.py:
--------------------------------------------------------------------------------
 1 | from bs4 import BeautifulSoup
 2 | from urllib2 import urlparse
 3 | 
 4 | def parse_homepage(html):
 5 |     soup = BeautifulSoup(html)
 6 |     websites = soup.find_all('dd', 'websites')
 7 |     if websites and len(websites) > 0:
 8 |         websites = websites[0]
 9 |         sites = websites.find_all('li')
10 |         if sites and len(sites) > 0:
11 |             result = {}
12 |             for site in sites:
13 |                 site_name = site.text.strip()
14 |                 original = site.a.get('href')
15 |                 url_parse = urlparse.urlparse(original).query
16 |                 query_parse = urlparse.parse_qs(url_parse)
17 |                 if 'url' in query_parse:
18 |                     result[site_name] = query_parse['url']
19 |             return result
20 |     return None
21 | 
22 | 


--------------------------------------------------------------------------------
/linkedin/linkedin/linkedin/parser/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/geekan/scrapy-examples/edb1cb116bd6def65a6ef01f953b58eb43e54305/linkedin/linkedin/linkedin/parser/__init__.py


--------------------------------------------------------------------------------
/linkedin/linkedin/linkedin/pipelines.py:
--------------------------------------------------------------------------------
 1 | # Define your item pipelines here
 2 | #
 3 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting
 4 | # See: http://doc.scrapy.org/topics/item-pipeline.html
 5 | from scrapy.conf import settings
 6 | from scrapy import log
 7 | 
 8 | class LinkedinPipeline(object):
 9 |     def process_item(self, item, spider):
10 |         return item
11 | 
12 | 
13 | # Copyright 2011 Julien Duponchelle <julien@duponchelle.info>.
14 | #
15 | # Licensed under the Apache License, Version 2.0 (the "License");
16 | # you may not use this file except in compliance with the License.
17 | # You may obtain a copy of the License at
18 | #
19 | # http://www.apache.org/licenses/LICENSE-2.0
20 | #
21 | # Unless required by applicable law or agreed to in writing, software
22 | # distributed under the License is distributed on an "AS IS" BASIS,
23 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
24 | # See the License for the specific language governing permissions and
25 | # limitations under the License.
26 | class MongoDBPipeline(object):
27 |     def __init__(self):
28 |         import pymongo
29 |         connection = pymongo.Connection(settings['MONGODB_SERVER'], settings['MONGODB_PORT'])
30 |         self.db = connection[settings['MONGODB_DB']]
31 |         self.collection = self.db[settings['MONGODB_COLLECTION']]
32 |         if self.__get_uniq_key() is not None:
33 |             self.collection.create_index(self.__get_uniq_key(), unique=True)
34 | 
35 |     def process_item(self, item, spider):
36 |         if self.__get_uniq_key() is None:
37 |             self.collection.insert(dict(item))
38 |         else:
39 |             self.collection.update(
40 |                             {self.__get_uniq_key(): item[self.__get_uniq_key()]},
41 |                             dict(item),
42 |                             upsert=True)  
43 |         log.msg("Item wrote to MongoDB database %s/%s" %
44 |                     (settings['MONGODB_DB'], settings['MONGODB_COLLECTION']),
45 |                     level=log.DEBUG, spider=spider)  
46 |         return item
47 | 
48 |     def __get_uniq_key(self):
49 |         if not settings['MONGODB_UNIQ_KEY'] or settings['MONGODB_UNIQ_KEY'] == "":
50 |             return None
51 |         return settings['MONGODB_UNIQ_KEY']


--------------------------------------------------------------------------------
/linkedin/linkedin/linkedin/settings.py:
--------------------------------------------------------------------------------
 1 | # Scrapy settings for linkedin project
 2 | #
 3 | # For simplicity, this file contains only the most important settings by
 4 | # default. All the other settings are documented here:
 5 | #
 6 | #     http://doc.scrapy.org/topics/settings.html
 7 | #
 8 | import os
 9 | 
10 | BOT_NAME = 'linkedin'
11 | 
12 | SPIDER_MODULES = ['linkedin.spiders']
13 | NEWSPIDER_MODULE = 'linkedin.spiders'
14 | 
15 | DOWNLOADER_MIDDLEWARES = {
16 |     'linkedin.middleware.CustomHttpProxyMiddleware': 543,
17 |     'linkedin.middleware.CustomUserAgentMiddleware': 545,
18 | }
19 | 
20 | ########### Item pipeline
21 | ITEM_PIPELINES = [
22 |                   "linkedin.pipelines.MongoDBPipeline",
23 | ]
24 | 
25 | MONGODB_SERVER = 'localhost'
26 | MONGODB_PORT = 27017
27 | MONGODB_DB = 'scrapy'
28 | MONGODB_COLLECTION = 'person_profiles'
29 | MONGODB_UNIQ_KEY = '_id'
30 | ###########
31 | 
32 | # Crawl responsibly by identifying yourself (and your website) on the user-agent
33 | #USER_AGENT = 'linkedin (+http://www.yourdomain.com)'
34 | 
35 | # Enable auto throttle
36 | AUTOTHROTTLE_ENABLED = True
37 | 
38 | COOKIES_ENABLED = False
39 | 
40 | # Set your own download folder
41 | DOWNLOAD_FILE_FOLDER = os.path.join(os.path.dirname(os.path.realpath(__file__)), "download_file")
42 | 
43 | 
44 | 


--------------------------------------------------------------------------------
/linkedin/linkedin/linkedin/spiders/__init__.py:
--------------------------------------------------------------------------------
1 | # This package will contain the spiders of your Scrapy project
2 | #
3 | # Please refer to the documentation for information on how to create and manage
4 | # your spiders.
5 | 


--------------------------------------------------------------------------------
/linkedin/linkedin/scrapy.cfg:
--------------------------------------------------------------------------------
 1 | # Automatically created by: scrapy startproject
 2 | #
 3 | # For more information about the [deploy] section see:
 4 | # http://doc.scrapy.org/topics/scrapyd.html
 5 | 
 6 | [settings]
 7 | default = linkedin.settings
 8 | 
 9 | [deploy]
10 | #url = http://localhost:6800/
11 | project = linkedin
12 | 


--------------------------------------------------------------------------------
/misc/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/geekan/scrapy-examples/edb1cb116bd6def65a6ef01f953b58eb43e54305/misc/__init__.py


--------------------------------------------------------------------------------
/misc/log.py:
--------------------------------------------------------------------------------
 1 | 
 2 | #from scrapy import log
 3 | import logging as log
 4 | 
 5 | def warn(msg):
 6 |     #log.msg(str(msg), level=log.WARNING)
 7 |     log.warn(str(msg))
 8 | 
 9 | 
10 | def info(msg):
11 |     #log.msg(str(msg), level=log.INFO)
12 |     log.info(str(msg))
13 | 
14 | 
15 | def debug(msg):
16 |     #log.msg(str(msg), level=log.DEBUG)
17 |     log.debug(str(msg))
18 | 
19 | import pprint
20 | class MyPrettyPrinter(pprint.PrettyPrinter):
21 |     def format(self, object, context, maxlevels, level):
22 |         if isinstance(object, unicode):
23 |             return (object.encode('utf8'), True, False)
24 |         return pprint.PrettyPrinter.format(self, object, context, maxlevels, level)
25 | pu = MyPrettyPrinter()
26 | 
27 | pp = pprint.PrettyPrinter()
28 | 


--------------------------------------------------------------------------------
/misc/middleware.py:
--------------------------------------------------------------------------------
 1 | from proxy import PROXIES, FREE_PROXIES
 2 | from agents import AGENTS
 3 | import logging as log
 4 | 
 5 | import random
 6 | 
 7 | 
 8 | class CustomHttpProxyFromMysqlMiddleware(object):
 9 |     proxies = FREE_PROXIES
10 | 
11 |     def process_request(self, request, spider):
12 |         # TODO implement complex proxy providing algorithm
13 |         if self.use_proxy(request):
14 |             p = random.choice(self.proxies)
15 |             try:
16 |                 request.meta['proxy'] = "http://%s" % p['ip_port']
17 |                 print(request.meta['proxy'])
18 |             except Exception, e:
19 |                 #log.msg("Exception %s" % e, _level=log.CRITICAL)
20 |                 log.critical("Exception %s" % e)
21 | 
22 |     def use_proxy(self, request):
23 |         """
24 |         using direct download for depth <= 2
25 |         using proxy with probability 0.3
26 |         """
27 |         #if "depth" in request.meta and int(request.meta['depth']) <= 2:
28 |         #    return False
29 |         #i = random.randint(1, 10)
30 |         #return i <= 2
31 |         return True
32 | 
33 | 
34 | 
35 | class CustomHttpProxyMiddleware(object):
36 | 
37 |     def process_request(self, request, spider):
38 |         # TODO implement complex proxy providing algorithm
39 |         if self.use_proxy(request):
40 |             p = random.choice(PROXIES)
41 |             try:
42 |                 request.meta['proxy'] = "http://%s" % p['ip_port']
43 |             except Exception, e:
44 |                 #log.msg("Exception %s" % e, _level=log.CRITICAL)
45 |                 log.critical("Exception %s" % e)
46 | 
47 |     def use_proxy(self, request):
48 |         """
49 |         using direct download for depth <= 2
50 |         using proxy with probability 0.3
51 |         """
52 |         #if "depth" in request.meta and int(request.meta['depth']) <= 2:
53 |         #    return False
54 |         #i = random.randint(1, 10)
55 |         #return i <= 2
56 |         return True
57 | 
58 | 
59 | class CustomUserAgentMiddleware(object):
60 |     def process_request(self, request, spider):
61 |         agent = random.choice(AGENTS)
62 |         request.headers['User-Agent'] = agent
63 | 


--------------------------------------------------------------------------------
/misc/proxy.py:
--------------------------------------------------------------------------------
 1 | '''
 2 | This proxy is goagent/wallproxy
 3 | If you want to disable it, plz configure settings.py
 4 | '''
 5 | PROXIES = [
 6 |     #{"ip_port": "127.0.0.1:8087"}, #goagent
 7 |     #{"ip_port": "127.0.0.1:8118"}, #tor via privoxy
 8 |     {"ip_port": "127.0.0.1:1080"}, #tor via privoxy
 9 | ]
10 | 
11 | FREE_PROXIES = [
12 | {"ip_port": "181.48.0.173:8081"},
13 | {"ip_port": "82.43.21.165:3128"},
14 | {"ip_port": "185.112.234.4:80"},
15 | {"ip_port": "118.189.13.178:8080"},
16 | {"ip_port": "37.187.117.157:3128"},
17 | {"ip_port": "62.201.200.17:80"},
18 | {"ip_port": "181.143.28.210:3128"},
19 | {"ip_port": "216.190.97.3:3128"},
20 | {"ip_port": "183.111.169.205:3128"},
21 | ]
22 | 


--------------------------------------------------------------------------------
/pandatv/pandatv/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/geekan/scrapy-examples/edb1cb116bd6def65a6ef01f953b58eb43e54305/pandatv/pandatv/__init__.py


--------------------------------------------------------------------------------
/pandatv/pandatv/items.py:
--------------------------------------------------------------------------------
 1 | # Define here the models for your scraped items
 2 | #
 3 | # See documentation in:
 4 | # http://doc.scrapy.org/en/latest/topics/items.html
 5 | 
 6 | from scrapy.item import Item, Field
 7 | 
 8 | class pandatvItem(Item):
 9 |     # define the fields for your item here like:
10 |     name = Field()
11 | 
12 | 


--------------------------------------------------------------------------------
/pandatv/pandatv/pipelines.py:
--------------------------------------------------------------------------------
 1 | # Define your item pipelines here
 2 | #
 3 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting
 4 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
 5 | 
 6 | import redis
 7 | 
 8 | 
 9 | from scrapy import signals
10 | 
11 | 
12 | import json
13 | import codecs
14 | from collections import OrderedDict
15 | 
16 | 
17 | class JsonWithEncodingPipeline(object):
18 | 
19 |     def __init__(self):
20 |         self.file = codecs.open('data_utf8.json', 'w', encoding='utf-8')
21 | 
22 |     def process_item(self, item, spider):
23 |         line = json.dumps(OrderedDict(item), ensure_ascii=False, sort_keys=False) + "\n"
24 |         self.file.write(line)
25 |         return item
26 | 
27 |     def close_spider(self, spider):
28 |         self.file.close()
29 | 
30 | 
31 | class RedisPipeline(object):
32 | 
33 |     def __init__(self):
34 |         self.r = redis.StrictRedis(host='localhost', port=6379)
35 | 
36 |     def process_item(self, item, spider):
37 |         if not item['id']:
38 |             print 'no id item!!'
39 | 
40 |         str_recorded_item = self.r.get(item['id'])
41 |         final_item = None
42 |         if str_recorded_item is None:
43 |             final_item = item
44 |         else:
45 |             ritem = eval(self.r.get(item['id']))
46 |             final_item = dict(item.items() + ritem.items())
47 |         self.r.set(item['id'], final_item)
48 | 
49 |     def close_spider(self, spider):
50 |         return
51 | 


--------------------------------------------------------------------------------
/pandatv/pandatv/settings.py:
--------------------------------------------------------------------------------
 1 | # Scrapy settings for pandatv project
 2 | #
 3 | # For simplicity, this file contains only the most important settings by
 4 | # default. All the other settings are documented here:
 5 | #
 6 | #     http://doc.scrapy.org/en/latest/topics/settings.html
 7 | #
 8 | 
 9 | import sys
10 | import os
11 | from os.path import dirname
12 | path = dirname(dirname(os.path.abspath(os.path.dirname(__file__))))
13 | sys.path.append(path)
14 | from misc.log import *
15 | 
16 | BOT_NAME = 'pandatv'
17 | 
18 | SPIDER_MODULES = ['pandatv.spiders']
19 | NEWSPIDER_MODULE = 'pandatv.spiders'
20 | 
21 | # Crawl responsibly by identifying yourself (and your website) on the user-agent
22 | #USER_AGENT = 'pandatv (+http://www.yourdomain.com)'
23 | 
24 | DOWNLOADER_MIDDLEWARES = {
25 |    # 'misc.middleware.CustomHttpProxyMiddleware': 400,
26 |     'misc.middleware.CustomUserAgentMiddleware': 401,
27 | }
28 | 
29 | ITEM_PIPELINES = {
30 |     'pandatv.pipelines.JsonWithEncodingPipeline': 300,
31 |     #'pandatv.pipelines.RedisPipeline': 301,
32 | }
33 | 
34 | LOG_LEVEL = 'INFO'
35 | 
36 | DOWNLOAD_DELAY = 1
37 | 


--------------------------------------------------------------------------------
/pandatv/pandatv/spiders/__init__.py:
--------------------------------------------------------------------------------
1 | # This package will contain the spiders of your Scrapy project
2 | #
3 | # Please refer to the documentation for information on how to create and manage
4 | # your spiders.
5 | 


--------------------------------------------------------------------------------
/pandatv/pandatv/spiders/spider.py:
--------------------------------------------------------------------------------
 1 | import re
 2 | import json
 3 | from urlparse import urlparse
 4 | import urllib
 5 | import pdb
 6 | 
 7 | 
 8 | from scrapy.selector import Selector
 9 | try:
10 |     from scrapy.spiders import Spider
11 | except:
12 |     from scrapy.spiders import BaseSpider as Spider
13 | from scrapy.utils.response import get_base_url
14 | from scrapy.spiders import CrawlSpider, Rule
15 | from scrapy.linkextractors import LinkExtractor as sle
16 | 
17 | 
18 | from pandatv.items import *
19 | from misc.log import *
20 | from misc.spider import CommonSpider
21 | 
22 | 
23 | class pandatvSpider(CommonSpider):
24 |     name = "pandatv"
25 |     allowed_domains = ["panda.tv"]
26 |     start_urls = [
27 |         "http://www.panda.tv/all",
28 |     ]
29 |     rules = [
30 |         Rule(sle(allow=("http://www.panda.tv/all")), callback='parse_1', follow=True),
31 |     ]
32 | 
33 |     list_css_rules = { 
34 |         '.video-list-item.video-no-tag': {
35 |             'room_name': '.video-title::text',
36 |             'author': '.video-nickname::text',
37 |             'people_count': '.video-number::text',
38 |             'tag': '.video-cate::text',
39 |         }   
40 |     }   
41 | 
42 |     content_css_rules = { 
43 |         'text': '#Cnt-Main-Article-QQ p *::text',
44 |         'images': '#Cnt-Main-Article-QQ img::attr(src)',
45 |         'images-desc': '#Cnt-Main-Article-QQ div p+ p::text',
46 |     }
47 | 
48 |     def parse_1(self, response):
49 |         info('Parse '+response.url)
50 |         x = self.parse_with_rules(response, self.list_css_rules, dict)
51 |         # x = self.parse_with_rules(response, self.content_css_rules, dict)
52 |         print(json.dumps(x, ensure_ascii=False, indent=2))
53 |         # pp.pprint(x)
54 |         # return self.parse_with_rules(response, self.css_rules, pandatvItem)
55 | 


--------------------------------------------------------------------------------
/pandatv/scrapy.cfg:
--------------------------------------------------------------------------------
 1 | # Automatically created by: scrapy startproject
 2 | #
 3 | # For more information about the [deploy] section see:
 4 | # http://doc.scrapy.org/en/latest/topics/scrapyd.html
 5 | 
 6 | [settings]
 7 | default = pandatv.settings
 8 | 
 9 | [deploy]
10 | #url = http://localhost:6800/
11 | project = pandatv
12 | 


--------------------------------------------------------------------------------
/proxylist/proxylist/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/geekan/scrapy-examples/edb1cb116bd6def65a6ef01f953b58eb43e54305/proxylist/proxylist/__init__.py


--------------------------------------------------------------------------------
/proxylist/proxylist/items.py:
--------------------------------------------------------------------------------
 1 | # Define here the models for your scraped items
 2 | #
 3 | # See documentation in:
 4 | # http://doc.scrapy.org/en/latest/topics/items.html
 5 | 
 6 | from scrapy.item import Item, Field
 7 | 
 8 | class freeProxyListItem(Item):
 9 |     # define the fields for your item here like:
10 |     ip = Field()
11 |     port = Field()
12 |     code = Field()
13 |     country = Field()
14 |     anonymity = Field()
15 |     google = Field()
16 |     https = Field()
17 |     last_checked = Field()
18 | 
19 | 


--------------------------------------------------------------------------------
/proxylist/proxylist/pipelines.py:
--------------------------------------------------------------------------------
 1 | # Define your item pipelines here
 2 | #
 3 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting
 4 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
 5 | 
 6 | import redis
 7 | 
 8 | 
 9 | from scrapy import signals
10 | 
11 | 
12 | import json
13 | import codecs
14 | from collections import OrderedDict
15 | 
16 | 
17 | import sys
18 | import MySQLdb
19 | import hashlib
20 | from scrapy.exceptions import DropItem
21 | from scrapy.http import Request
22 | 
23 | 
24 | class JsonWithEncodingPipeline(object):
25 | 
26 |     def __init__(self):
27 |         self.file = codecs.open('data_utf8.json', 'w', encoding='utf-8')
28 | 
29 |     def process_item(self, item, spider):
30 |         line = json.dumps(OrderedDict(item), ensure_ascii=False, sort_keys=False) + "\n"
31 |         self.file.write(line)
32 |         return item
33 | 
34 |     def close_spider(self, spider):
35 |         self.file.close()
36 | 
37 | 
38 | 
39 | class MySQLStorePipeline(object):
40 |     def __init__(self):
41 |         # user, passwd, db
42 |         self.conn = MySQLdb.connect(user='proxylist', passwd='proxylist', db='proxylist', host='localhost', charset="utf8", use_unicode=True)
43 |         self.cursor = self.conn.cursor()
44 |         # self.cursor.execute('create table free_proxy_list (ip varchar(32), port int, code varchar(16), country varchar(64), anoymity varchar(32), google varchar(4), https varchar(4), last_checked varchar(32));''')
45 | 
46 |     def process_item(self, item, spider):
47 |         try:
48 |             l = ['ip', 'port', 'code', 'country', 'anonymity', 'google', 'https', 'last_checked']
49 |             self.cursor.execute("""
50 |                 INSERT INTO free_proxy_list
51 |                 VALUES (%s, %s, %s, %s, %s, %s, %s, %s)""",
52 |                [item[i].encode('utf-8') for i in l]
53 |             )
54 |             self.conn.commit()
55 |         except MySQLdb.Error, e:
56 |             print "Error %d: %s" % (e.args[0], e.args[1])
57 | 
58 |         return item
59 | 
60 | 
61 | class RedisPipeline(object):
62 | 
63 |     def __init__(self):
64 |         self.r = redis.StrictRedis(host='localhost', port=6379)
65 | 
66 |     def process_item(self, item, spider):
67 |         if not item['id']:
68 |             print 'no id item!!'
69 | 
70 |         str_recorded_item = self.r.get(item['id'])
71 |         final_item = None
72 |         if str_recorded_item is None:
73 |             final_item = item
74 |         else:
75 |             ritem = eval(self.r.get(item['id']))
76 |             final_item = dict(item.items() + ritem.items())
77 |         self.r.set(item['id'], final_item)
78 | 
79 |     def close_spider(self, spider):
80 |         return
81 | 


--------------------------------------------------------------------------------
/proxylist/proxylist/settings.py:
--------------------------------------------------------------------------------
 1 | # Scrapy settings for proxylist project
 2 | #
 3 | # For simplicity, this file contains only the most important settings by
 4 | # default. All the other settings are documented here:
 5 | #
 6 | #     http://doc.scrapy.org/en/latest/topics/settings.html
 7 | #
 8 | 
 9 | import sys
10 | import os
11 | from os.path import dirname
12 | path = dirname(dirname(os.path.abspath(os.path.dirname(__file__))))
13 | sys.path.append(path)
14 | from misc.log import *
15 | 
16 | BOT_NAME = 'proxylist'
17 | 
18 | SPIDER_MODULES = ['proxylist.spiders']
19 | NEWSPIDER_MODULE = 'proxylist.spiders'
20 | 
21 | # Crawl responsibly by identifying yourself (and your website) on the user-agent
22 | #USER_AGENT = 'proxylist (+http://www.yourdomain.com)'
23 | 
24 | DOWNLOADER_MIDDLEWARES = {
25 |    # 'misc.middleware.CustomHttpProxyMiddleware': 400,
26 |     'misc.middleware.CustomUserAgentMiddleware': 401,
27 | }
28 | 
29 | ITEM_PIPELINES = {
30 |     'proxylist.pipelines.JsonWithEncodingPipeline': 300,
31 |     #'proxylist.pipelines.RedisPipeline': 301,
32 |     'proxylist.pipelines.MySQLStorePipeline': 302
33 | }
34 | 
35 | LOG_LEVEL = 'INFO'
36 | 
37 | DOWNLOAD_DELAY = 1
38 | 


--------------------------------------------------------------------------------
/proxylist/proxylist/spiders/__init__.py:
--------------------------------------------------------------------------------
1 | # This package will contain the spiders of your Scrapy project
2 | #
3 | # Please refer to the documentation for information on how to create and manage
4 | # your spiders.
5 | 


--------------------------------------------------------------------------------
/proxylist/scrapy.cfg:
--------------------------------------------------------------------------------
 1 | # Automatically created by: scrapy startproject
 2 | #
 3 | # For more information about the [deploy] section see:
 4 | # http://doc.scrapy.org/en/latest/topics/scrapyd.html
 5 | 
 6 | [settings]
 7 | default = proxylist.settings
 8 | 
 9 | [deploy]
10 | #url = http://localhost:6800/
11 | project = proxylist
12 | 


--------------------------------------------------------------------------------
/qqnews/qqnews/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/geekan/scrapy-examples/edb1cb116bd6def65a6ef01f953b58eb43e54305/qqnews/qqnews/__init__.py


--------------------------------------------------------------------------------
/qqnews/qqnews/items.py:
--------------------------------------------------------------------------------
 1 | # Define here the models for your scraped items
 2 | #
 3 | # See documentation in:
 4 | # http://doc.scrapy.org/en/latest/topics/items.html
 5 | 
 6 | from scrapy.item import Item, Field
 7 | 
 8 | class qqnewsItem(Item):
 9 |     # define the fields for your item here like:
10 |     name = Field()
11 |     title = Field()
12 |     url = Field()
13 |     content = Field()
14 | 
15 | class PositionDetailItem(Item):
16 |     title = Field()
17 |     link = Field()
18 |     sharetitle  = Field()
19 |     bottomline = Field()
20 |     duty = Field()
21 |     xxx = Field()
22 | 


--------------------------------------------------------------------------------
/qqnews/qqnews/pipelines.py:
--------------------------------------------------------------------------------
 1 | # Define your item pipelines here
 2 | #
 3 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting
 4 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
 5 | 
 6 | import redis
 7 | 
 8 | 
 9 | from scrapy import signals
10 | 
11 | 
12 | import json
13 | import codecs
14 | from collections import OrderedDict
15 | 
16 | 
17 | class JsonWithEncodingPipeline(object):
18 | 
19 |     def __init__(self):
20 |         self.file = codecs.open('data_utf8.json', 'w', encoding='utf-8')
21 | 
22 |     def process_item(self, item, spider):
23 |         line = json.dumps(OrderedDict(item), ensure_ascii=False, sort_keys=False) + "\n"
24 |         self.file.write(line)
25 |         return item
26 | 
27 |     def close_spider(self, spider):
28 |         self.file.close()
29 | 
30 | 
31 | class RedisPipeline(object):
32 | 
33 |     def __init__(self):
34 |         self.r = redis.StrictRedis(host='localhost', port=6379)
35 | 
36 |     def process_item(self, item, spider):
37 |         if not item['id']:
38 |             print 'no id item!!'
39 | 
40 |         str_recorded_item = self.r.get(item['id'])
41 |         final_item = None
42 |         if str_recorded_item is None:
43 |             final_item = item
44 |         else:
45 |             ritem = eval(self.r.get(item['id']))
46 |             final_item = dict(item.items() + ritem.items())
47 |         self.r.set(item['id'], final_item)
48 | 
49 |     def close_spider(self, spider):
50 |         return
51 | 


--------------------------------------------------------------------------------
/qqnews/qqnews/settings.py:
--------------------------------------------------------------------------------
 1 | # Scrapy settings for qqnews project
 2 | #
 3 | # For simplicity, this file contains only the most important settings by
 4 | # default. All the other settings are documented here:
 5 | #
 6 | #     http://doc.scrapy.org/en/latest/topics/settings.html
 7 | #
 8 | 
 9 | import sys
10 | import os
11 | from os.path import dirname
12 | path = dirname(dirname(os.path.abspath(os.path.dirname(__file__))))
13 | sys.path.append(path)
14 | from misc.log import *
15 | 
16 | BOT_NAME = 'qqnews'
17 | 
18 | SPIDER_MODULES = ['qqnews.spiders']
19 | NEWSPIDER_MODULE = 'qqnews.spiders'
20 | 
21 | # Crawl responsibly by identifying yourself (and your website) on the user-agent
22 | #USER_AGENT = 'qqnews (+http://www.yourdomain.com)'
23 | 
24 | DOWNLOADER_MIDDLEWARES = {
25 |     #'misc.middleware.CustomHttpProxyMiddleware': 400,
26 |     'misc.middleware.CustomUserAgentMiddleware': 401,
27 | }
28 | 
29 | ITEM_PIPELINES = {
30 |     'qqnews.pipelines.JsonWithEncodingPipeline': 300,
31 |     #'qqnews.pipelines.RedisPipeline': 301,
32 | }
33 | 
34 | LOG_LEVEL = 'INFO'
35 | 
36 | #DOWNLOAD_DELAY = 1
37 | 


--------------------------------------------------------------------------------
/qqnews/qqnews/spiders/__init__.py:
--------------------------------------------------------------------------------
1 | # This package will contain the spiders of your Scrapy project
2 | #
3 | # Please refer to the documentation for information on how to create and manage
4 | # your spiders.
5 | 


--------------------------------------------------------------------------------
/qqnews/qqnews/spiders/spider.py:
--------------------------------------------------------------------------------
 1 | import re
 2 | import json
 3 | 
 4 | 
 5 | from scrapy.selector import Selector
 6 | try:
 7 |     from scrapy.spiders import Spider
 8 | except:
 9 |     from scrapy.spiders import BaseSpider as Spider
10 | from scrapy.utils.response import get_base_url
11 | from scrapy.spiders import CrawlSpider, Rule
12 | from scrapy.linkextractors import LinkExtractor as sle
13 | 
14 | 
15 | from qqnews.items import *
16 | from misc.log import *
17 | from misc.spider import CommonSpider
18 | 
19 | 
20 | class qqnewsSpider(CommonSpider):
21 |     name = "qqnews"
22 |     allowed_domains = ["tencent.com", 'qq.com']
23 |     start_urls = [
24 |         'http://news.qq.com/society_index.shtml'
25 |     ]
26 |     rules = [
27 |         Rule(sle(allow=('society_index.shtml')), callback='parse_0', follow=True),
28 |         Rule(sle(allow=(".*[0-9]{8}.*htm$")), callback='parse_1', follow=True),
29 |     ]
30 | 
31 |     list_css_rules = { 
32 |         '.linkto': {
33 |             'url': 'a::attr(href)',
34 |             'name': 'a::text',
35 |         }   
36 |     }
37 | 
38 |     list_css_rules_2 = {
39 |         '#listZone .Q-tpWrap': {
40 |             'url': '.linkto::attr(href)',
41 |             'name': '.linkto::text'
42 |         }
43 |     }
44 | 
45 |     content_css_rules = {
46 |         'text': '#Cnt-Main-Article-QQ p *::text',
47 |         'images': '#Cnt-Main-Article-QQ img::attr(src)',
48 |         'images-desc': '#Cnt-Main-Article-QQ div p+ p::text',
49 |     }
50 | 
51 |     def parse_0(self, response):
52 |         info('Parse0 '+response.url)
53 |         x = self.parse_with_rules(response, self.list_css_rules, dict)
54 |         pp.pprint(x)
55 |         #return self.parse_with_rules(response, self.list_css_rules, qqnewsItem)
56 | 
57 |     def parse_1(self, response):
58 |         info('Parse1 '+response.url)
59 |         x = self.parse_with_rules(response, self.content_css_rules, dict)
60 |         pp.pprint(x)
61 |         #import pdb; pdb.set_trace()
62 | 
63 |     def parse_2(self, response):
64 |         info('Parse2 '+response.url)
65 | 
66 | 


--------------------------------------------------------------------------------
/qqnews/scrapy.cfg:
--------------------------------------------------------------------------------
 1 | # Automatically created by: scrapy startproject
 2 | #
 3 | # For more information about the [deploy] section see:
 4 | # http://doc.scrapy.org/en/latest/topics/scrapyd.html
 5 | 
 6 | [settings]
 7 | default = qqnews.settings
 8 | 
 9 | [deploy]
10 | #url = http://localhost:6800/
11 | project = qqnews
12 | 


--------------------------------------------------------------------------------
/reddit/reddit/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/geekan/scrapy-examples/edb1cb116bd6def65a6ef01f953b58eb43e54305/reddit/reddit/__init__.py


--------------------------------------------------------------------------------
/reddit/reddit/items.py:
--------------------------------------------------------------------------------
 1 | # Define here the models for your scraped items
 2 | #
 3 | # See documentation in:
 4 | # http://doc.scrapy.org/en/latest/topics/items.html
 5 | 
 6 | from scrapy.item import Item, Field
 7 | 
 8 | class redditItem(Item):
 9 |     # define the fields for your item here like:
10 |     name = Field()
11 | 
12 | 


--------------------------------------------------------------------------------
/reddit/reddit/pipelines.py:
--------------------------------------------------------------------------------
 1 | # Define your item pipelines here
 2 | #
 3 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting
 4 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
 5 | 
 6 | import redis
 7 | 
 8 | 
 9 | from scrapy import signals
10 | 
11 | 
12 | import json
13 | import codecs
14 | from collections import OrderedDict
15 | 
16 | 
17 | class JsonWithEncodingPipeline(object):
18 | 
19 |     def __init__(self):
20 |         self.file = codecs.open('data_utf8.json', 'w', encoding='utf-8')
21 | 
22 |     def process_item(self, item, spider):
23 |         line = json.dumps(OrderedDict(item), ensure_ascii=False, sort_keys=False) + "\n"
24 |         self.file.write(line)
25 |         return item
26 | 
27 |     def close_spider(self, spider):
28 |         self.file.close()
29 | 
30 | 
31 | class RedisPipeline(object):
32 | 
33 |     def __init__(self):
34 |         self.r = redis.StrictRedis(host='localhost', port=6379)
35 | 
36 |     def process_item(self, item, spider):
37 |         if not item['id']:
38 |             print 'no id item!!'
39 | 
40 |         str_recorded_item = self.r.get(item['id'])
41 |         final_item = None
42 |         if str_recorded_item is None:
43 |             final_item = item
44 |         else:
45 |             ritem = eval(self.r.get(item['id']))
46 |             final_item = dict(item.items() + ritem.items())
47 |         self.r.set(item['id'], final_item)
48 | 
49 |     def close_spider(self, spider):
50 |         return
51 | 


--------------------------------------------------------------------------------
/reddit/reddit/settings.py:
--------------------------------------------------------------------------------
 1 | # Scrapy settings for reddit project
 2 | #
 3 | # For simplicity, this file contains only the most important settings by
 4 | # default. All the other settings are documented here:
 5 | #
 6 | #     http://doc.scrapy.org/en/latest/topics/settings.html
 7 | #
 8 | 
 9 | import sys
10 | import os
11 | from os.path import dirname
12 | path = dirname(dirname(os.path.abspath(os.path.dirname(__file__))))
13 | sys.path.append(path)
14 | from misc.log import *
15 | 
16 | BOT_NAME = 'reddit'
17 | 
18 | SPIDER_MODULES = ['reddit.spiders']
19 | NEWSPIDER_MODULE = 'reddit.spiders'
20 | 
21 | # Crawl responsibly by identifying yourself (and your website) on the user-agent
22 | #USER_AGENT = 'reddit (+http://www.yourdomain.com)'
23 | 
24 | DOWNLOADER_MIDDLEWARES = {
25 |    # 'misc.middleware.CustomHttpProxyMiddleware': 400,
26 |     'misc.middleware.CustomUserAgentMiddleware': 401,
27 | }
28 | 
29 | ITEM_PIPELINES = {
30 |     'reddit.pipelines.JsonWithEncodingPipeline': 300,
31 |     #'reddit.pipelines.RedisPipeline': 301,
32 | }
33 | 
34 | LOG_LEVEL = 'INFO'
35 | 
36 | DOWNLOAD_DELAY = 1
37 | 


--------------------------------------------------------------------------------
/reddit/reddit/spiders/__init__.py:
--------------------------------------------------------------------------------
1 | # This package will contain the spiders of your Scrapy project
2 | #
3 | # Please refer to the documentation for information on how to create and manage
4 | # your spiders.
5 | 


--------------------------------------------------------------------------------
/reddit/reddit/spiders/spider.py:
--------------------------------------------------------------------------------
 1 | import re
 2 | import json
 3 | from urlparse import urlparse
 4 | import urllib
 5 | import pdb
 6 | 
 7 | 
 8 | from scrapy.selector import Selector
 9 | try:
10 |     from scrapy.spiders import Spider
11 | except:
12 |     from scrapy.spiders import BaseSpider as Spider
13 | from scrapy.utils.response import get_base_url
14 | from scrapy.spiders import CrawlSpider, Rule
15 | from scrapy.linkextractors import LinkExtractor as sle
16 | 
17 | 
18 | from reddit.items import *
19 | from misc.log import *
20 | from misc.spider import CommonSpider
21 | 
22 | 
23 | class redditSpider(CommonSpider):
24 |     name = "reddit"
25 |     allowed_domains = ["reddit.com"]
26 |     start_urls = [
27 |         "https://www.reddit.com/",
28 |     ]
29 |     rules = [
30 |         Rule(sle(allow=("https://www.reddit.com/")), callback='parse_1', follow=True),
31 |     ]
32 | 
33 |     list_css_rules = { 
34 |         '.link': {
35 |             'title': '.title a::text',
36 |             'domain': '.domain a::text',
37 |             'author': '.author::text',
38 |             'comment_count': '.comments::text',
39 |             'score': '.score::text',
40 |         }   
41 |     }   
42 | 
43 |     content_css_rules = { 
44 |         'text': '#Cnt-Main-Article-QQ p *::text',
45 |         'images': '#Cnt-Main-Article-QQ img::attr(src)',
46 |         'images-desc': '#Cnt-Main-Article-QQ div p+ p::text',
47 |     }
48 | 
49 |     def parse_1(self, response):
50 |         info('Parse '+response.url)
51 |         x = self.parse_with_rules(response, self.list_css_rules, dict)
52 |         # x = self.parse_with_rules(response, self.content_css_rules, dict)
53 |         print(json.dumps(x, ensure_ascii=False, indent=2))
54 |         # pp.pprint(x)
55 |         # return self.parse_with_rules(response, self.css_rules, redditItem)
56 | 


--------------------------------------------------------------------------------
/reddit/scrapy.cfg:
--------------------------------------------------------------------------------
 1 | # Automatically created by: scrapy startproject
 2 | #
 3 | # For more information about the [deploy] section see:
 4 | # http://doc.scrapy.org/en/latest/topics/scrapyd.html
 5 | 
 6 | [settings]
 7 | default = reddit.settings
 8 | 
 9 | [deploy]
10 | #url = http://localhost:6800/
11 | project = reddit
12 | 


--------------------------------------------------------------------------------
/sinanews/scrapy.cfg:
--------------------------------------------------------------------------------
 1 | # Automatically created by: scrapy startproject
 2 | #
 3 | # For more information about the [deploy] section see:
 4 | # http://doc.scrapy.org/en/latest/topics/scrapyd.html
 5 | 
 6 | [settings]
 7 | default = sinanews.settings
 8 | 
 9 | [deploy]
10 | #url = http://localhost:6800/
11 | project = sinanews
12 | 


--------------------------------------------------------------------------------
/sinanews/sinanews/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/geekan/scrapy-examples/edb1cb116bd6def65a6ef01f953b58eb43e54305/sinanews/sinanews/__init__.py


--------------------------------------------------------------------------------
/sinanews/sinanews/items.py:
--------------------------------------------------------------------------------
 1 | # Define here the models for your scraped items
 2 | #
 3 | # See documentation in:
 4 | # http://doc.scrapy.org/en/latest/topics/items.html
 5 | 
 6 | from scrapy.item import Item, Field
 7 | 
 8 | class sinanewsItem(Item):
 9 |     # define the fields for your item here like:
10 |     name = Field()
11 |     content = Field()
12 |     url = Field()
13 | 
14 | 
15 | 


--------------------------------------------------------------------------------
/sinanews/sinanews/pipelines.py:
--------------------------------------------------------------------------------
 1 | # Define your item pipelines here
 2 | #
 3 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting
 4 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
 5 | 
 6 | import redis
 7 | 
 8 | 
 9 | from scrapy import signals
10 | 
11 | 
12 | import json
13 | import codecs
14 | from collections import OrderedDict
15 | 
16 | 
17 | class JsonWithEncodingPipeline(object):
18 | 
19 |     def __init__(self):
20 |         self.file = codecs.open('data_utf8.json', 'w', encoding='utf-8')
21 | 
22 |     def process_item(self, item, spider):
23 |         line = json.dumps(OrderedDict(item), ensure_ascii=False, sort_keys=False) + "\n"
24 |         self.file.write(line)
25 |         return item
26 | 
27 |     def close_spider(self, spider):
28 |         self.file.close()
29 | 
30 | 
31 | class RedisPipeline(object):
32 | 
33 |     def __init__(self):
34 |         self.r = redis.StrictRedis(host='localhost', port=6379)
35 | 
36 |     def process_item(self, item, spider):
37 |         if not item['id']:
38 |             print 'no id item!!'
39 | 
40 |         str_recorded_item = self.r.get(item['id'])
41 |         final_item = None
42 |         if str_recorded_item is None:
43 |             final_item = item
44 |         else:
45 |             ritem = eval(self.r.get(item['id']))
46 |             final_item = dict(item.items() + ritem.items())
47 |         self.r.set(item['id'], final_item)
48 | 
49 |     def close_spider(self, spider):
50 |         return
51 | 


--------------------------------------------------------------------------------
/sinanews/sinanews/settings.py:
--------------------------------------------------------------------------------
 1 | # Scrapy settings for sinanews project
 2 | #
 3 | # For simplicity, this file contains only the most important settings by
 4 | # default. All the other settings are documented here:
 5 | #
 6 | #     http://doc.scrapy.org/en/latest/topics/settings.html
 7 | #
 8 | 
 9 | import sys
10 | import os
11 | from os.path import dirname
12 | path = dirname(dirname(os.path.abspath(os.path.dirname(__file__))))
13 | sys.path.append(path)
14 | from misc.log import *
15 | 
16 | BOT_NAME = 'sinanews'
17 | 
18 | SPIDER_MODULES = ['sinanews.spiders']
19 | NEWSPIDER_MODULE = 'sinanews.spiders'
20 | 
21 | # Crawl responsibly by identifying yourself (and your website) on the user-agent
22 | #USER_AGENT = 'sinanews (+http://www.yourdomain.com)'
23 | 
24 | DOWNLOADER_MIDDLEWARES = {
25 |    # 'misc.middleware.CustomHttpProxyMiddleware': 400,
26 |     'misc.middleware.CustomHttpProxyFromMysqlMiddleware': 400,
27 |     'misc.middleware.CustomUserAgentMiddleware': 401,
28 | }
29 | 
30 | ITEM_PIPELINES = {
31 |     'sinanews.pipelines.JsonWithEncodingPipeline': 300,
32 |     #'sinanews.pipelines.RedisPipeline': 301,
33 | }
34 | 
35 | LOG_LEVEL = 'INFO'
36 | 
37 | DOWNLOAD_DELAY = 1
38 | 


--------------------------------------------------------------------------------
/sinanews/sinanews/spiders/__init__.py:
--------------------------------------------------------------------------------
1 | # This package will contain the spiders of your Scrapy project
2 | #
3 | # Please refer to the documentation for information on how to create and manage
4 | # your spiders.
5 | 


--------------------------------------------------------------------------------
/sinanews/sinanews/spiders/spider.py:
--------------------------------------------------------------------------------
 1 | import re
 2 | import json
 3 | from urlparse import urlparse
 4 | import urllib
 5 | import pdb
 6 | 
 7 | 
 8 | from scrapy.selector import Selector
 9 | try:
10 |     from scrapy.spiders import Spider
11 | except:
12 |     from scrapy.spiders import BaseSpider as Spider
13 | from scrapy.utils.response import get_base_url
14 | from scrapy.spiders import CrawlSpider, Rule
15 | from scrapy.linkextractors import LinkExtractor as sle
16 | 
17 | 
18 | from sinanews.items import *
19 | from misc.log import *
20 | from misc.spider import CommonSpider
21 | 
22 | 
23 | import pprint
24 | class MyPrettyPrinter(pprint.PrettyPrinter):
25 |     def format(self, object, context, maxlevels, level):
26 |         if isinstance(object, unicode):
27 |             return (object.encode('utf8'), True, False)
28 |         return pprint.PrettyPrinter.format(self, object, context, maxlevels, level)
29 | pp = MyPrettyPrinter()
30 | 
31 | 
32 | class sinanewsSpider(CommonSpider):
33 |     name = "sinanews"
34 |     allowed_domains = ["news.sina.com.cn"]
35 |     start_urls = [
36 |         "http://news.sina.com.cn/",
37 |     ]
38 |     rules = [
39 |         Rule(sle(allow=("http://news.sina.com.cn/$")), callback='parse_0'),
40 |         Rule(sle(allow=(".*doc[^/]*shtml$")), callback='parse_1'), #, follow=True),
41 |         #Rule(sle(allow=('/c/2015-11-19/doc-ifxkszhk0386278.shtml')), callback='parse_1', follow=True, process_request='process_request'),
42 |     ]
43 | 
44 |     list_css_rules = {
45 |         '#blk_yw_01 a': {
46 |             'url': 'a::attr(href)',
47 |             'name': 'a::text',
48 |         }
49 |     }
50 | 
51 |     content_css_rules = {
52 |         'text': 'p::text',
53 |         'images': 'img::attr(src)',
54 |         'images-desc': '.img_descr::text',
55 |         # need url analysis for video
56 |         #'video': '#J_Article_Player',
57 |     }
58 | 
59 |     def process_request(self, r):
60 |         info('process '+str(r))
61 |         return r
62 |     
63 |     def parse_0(self, response):
64 |         info('Parse 0 '+response.url)
65 |         x = self.parse_with_rules(response, self.list_css_rules, dict)
66 |         pp.pprint(x)
67 |         #pdb.set_trace()
68 |         #return self.parse_with_rules(response, self.list_css_rules, sinanewsItem)
69 | 
70 |     def parse_1(self, response):
71 |         info('Parse 1 '+response.url)
72 |         x = self.parse_with_rules(response, self.content_css_rules, dict)
73 |         pp.pprint(x)
74 |         #self.parse_with_rules(response, self.css_rules, sinanewsItem)
75 | 


--------------------------------------------------------------------------------
/sis/README.md:
--------------------------------------------------------------------------------
1 | usage:
2 | 
3 | ```sh
4 | scrapy crawl sis -a forum_id=230 -a digit=2
5 | sort -t"\"" -nk12 data_utf8.json
6 | ```
7 | 


--------------------------------------------------------------------------------
/sis/scrapy.cfg:
--------------------------------------------------------------------------------
 1 | # Automatically created by: scrapy startproject
 2 | #
 3 | # For more information about the [deploy] section see:
 4 | # http://doc.scrapy.org/en/latest/topics/scrapyd.html
 5 | 
 6 | [settings]
 7 | default = sis.settings
 8 | 
 9 | [deploy]
10 | #url = http://localhost:6800/
11 | project = sis
12 | 


--------------------------------------------------------------------------------
/sis/sis/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/geekan/scrapy-examples/edb1cb116bd6def65a6ef01f953b58eb43e54305/sis/sis/__init__.py


--------------------------------------------------------------------------------
/sis/sis/items.py:
--------------------------------------------------------------------------------
 1 | # Define here the models for your scraped items
 2 | #
 3 | # See documentation in:
 4 | # http://doc.scrapy.org/en/latest/topics/items.html
 5 | 
 6 | from scrapy.item import Item, Field
 7 | 
 8 | class SisItem(Item):
 9 |     title = Field()
10 |     link = Field()
11 |     imgs = Field()
12 |     torrents = Field()
13 |     sharetitle  = Field()
14 |     bottomline = Field()
15 |     duty = Field()
16 |     xxx = Field()
17 | 
18 | class SisForumListItem(Item):
19 |     content = Field() # raw content with all html
20 |     title = Field()
21 |     thread_type = Field()
22 |     author = Field()
23 |     post_time = Field()
24 |     link = Field()
25 |     star = Field()
26 |     comment = Field()
27 |     view = Field()
28 |     size = Field()
29 |     video_type = Field()
30 |     last_post_time = Field()
31 | 


--------------------------------------------------------------------------------
/sis/sis/pipelines.py:
--------------------------------------------------------------------------------
 1 | # Define your item pipelines here
 2 | #
 3 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting
 4 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
 5 | 
 6 | from scrapy import signals
 7 | 
 8 | 
 9 | import json
10 | import codecs
11 | from collections import OrderedDict
12 | 
13 | 
14 | class JsonWithEncodingPipeline(object):
15 | 
16 |     def __init__(self):
17 |         self.file = codecs.open('data_utf8.json', 'w', encoding='utf-8')
18 | 
19 |     def process_item(self, item, spider):
20 |         line = json.dumps(OrderedDict(item), ensure_ascii=False, sort_keys=False) + "\n"
21 |         self.file.write(line)
22 |         return item
23 | 
24 |     def close_spider(self, spider):
25 |         self.file.close()
26 | 


--------------------------------------------------------------------------------
/sis/sis/settings.py:
--------------------------------------------------------------------------------
 1 | # Scrapy settings for sis project
 2 | #
 3 | # For simplicity, this file contains only the most important settings by
 4 | # default. All the other settings are documented here:
 5 | #
 6 | #     http://doc.scrapy.org/en/latest/topics/settings.html
 7 | #
 8 | 
 9 | import sys
10 | import os
11 | from os.path import dirname
12 | path = dirname(dirname(os.path.abspath(os.path.dirname(__file__))))
13 | sys.path.append(path)
14 | from misc.log import *
15 | 
16 | BOT_NAME = 'sis'
17 | 
18 | SPIDER_MODULES = ['sis.spiders']
19 | NEWSPIDER_MODULE = 'sis.spiders'
20 | 
21 | # Crawl responsibly by identifying yourself (and your website) on the user-agent
22 | #USER_AGENT = 'sis (+http://www.yourdomain.com)'
23 | DOWNLOADER_MIDDLEWARES = {
24 |     # 'misc.middleware.CustomHttpProxyMiddleware': 400,
25 |     'misc.middleware.CustomUserAgentMiddleware': 401,
26 | }
27 | 
28 | ITEM_PIPELINES = {
29 |     'sis.pipelines.JsonWithEncodingPipeline': 300,
30 | }
31 | 
32 | LOG_LEVEL = 'INFO'
33 | 
34 | DOWNLOAD_DELAY = 1
35 | 


--------------------------------------------------------------------------------
/sis/sis/spiders/__init__.py:
--------------------------------------------------------------------------------
1 | # This package will contain the spiders of your Scrapy project
2 | #
3 | # Please refer to the documentation for information on how to create and manage
4 | # your spiders.
5 | 


--------------------------------------------------------------------------------
/sis/sis/spiders/sis_spider.py:
--------------------------------------------------------------------------------
 1 | # coding: utf-8
 2 | 
 3 | import re
 4 | import json
 5 | import sys
 6 | from urlparse import urljoin
 7 | 
 8 | 
 9 | from scrapy.selector import Selector
10 | try:
11 |     from scrapy.spiders import Spider
12 | except:
13 |     from scrapy.spiders import BaseSpider as Spider
14 | from scrapy.utils.response import get_base_url
15 | from scrapy.spiders import CrawlSpider, Rule
16 | from scrapy.linkextractors import LinkExtractor as sle
17 | 
18 | from sis.items import *
19 | from misc.log import *
20 | 
21 | 
22 | class sisSpider(CrawlSpider):
23 |     name = "sis"
24 |     ip = "38.103.161.187"
25 |     allowed_domains = [ip]
26 |     ip_format = 'http://' + ip + '/forum/forum-%d-1.html'
27 |     '''
28 |     start_urls = [
29 |         # ip_format % d for d in [230] #[143, 230, 58]
30 |     ]
31 |     rules = [
32 |         # Rule(sle(allow=("/forum/thread-\d*-1-1\.html")), callback='parse_2'),
33 |         # Rule(sle(allow=("/forum/forum-(143|230|58)-[0-9]{,2}\.html")), follow=True, callback='parse_1'),
34 |         Rule(sle(allow=("/forum/forum-230-[0-9]{,4}\.html")), follow=True, callback='parse_1'),
35 |     ]
36 |     '''
37 | 
38 |     def __init__(self, forum_id=58, digit=1, *args, **kwargs):
39 |         self.start_urls = [self.ip_format % d for d in [int(forum_id)]]
40 |         self.rules = [Rule(sle(allow=("/forum/forum-" + str(forum_id) + "-[0-9]{," + str(digit) + "}\.html")), follow=True, callback='parse_1'),]
41 |         super(sisSpider, self).__init__(*args, **kwargs)
42 | 
43 |     def parse_2(self, response):
44 |         items = []
45 |         sel = Selector(response)
46 |         sites = sel.css('.postcontent')[0:1]
47 |         for site in sites:
48 |             item = SisItem()
49 |             item['title'] = site.css('.postmessage h2::text').extract()
50 |             imgs = site.css('.postmessage img::attr(src)').extract()
51 |             item['imgs'] = filter(lambda x: not x.endswith('.gif'), imgs)
52 |             item['torrents'] = [urljoin(response.url, x) for x in site.css('.t_attachlist a[href*=attachment]::attr(href)').extract()]
53 |             # item['duty'] = site.css('.c .l2::text').extract()
54 |             item['link'] = response.url
55 |             items.append(item)
56 |             # print repr(item).decode("unicode-escape") + '\n'
57 |         # info('parsed ' + str(response))
58 |         self.parse_1(response)
59 |         return items
60 | 
61 |     def parse_1(self, response):
62 |         items = []
63 |         # url cannot encode to Chinese easily.. XXX
64 |         info('parsed ' + str(response))
65 |         sel = Selector(response)
66 |         threads = sel.css('tbody[id*=normalthread_]')
67 |         for thread in threads:
68 |             item = SisForumListItem()
69 |             # filter some thread
70 |             inner_thread = thread.css('span[id*=thread_]')
71 |             url = urljoin(response.url, inner_thread.css('a[href]::attr(href)').extract()[0])
72 |             thread_content = re.sub(r"\s\s+", " ", thread.extract())
73 |             # if re.search(u"(奸|姦)", thread_content):
74 |             item['title'] = inner_thread.css('a::text').extract()[0]
75 |             item['link'] = url
76 |             item['star'] = re.sub(r'\s+', '', thread.css('td[class=author] cite::text').extract()[1])
77 |             item['comment'] = thread.css('td[class=nums] strong::text').extract()[0]
78 |             item['view'] = thread.css('td[class=nums] em::text').extract()[0]
79 |             item['post_time'] = thread.css('td[class=author] em::text').extract()[0]
80 |             print ' ', item['post_time'], item['star'], '|', item['title'], item['link'], item['comment'], item['view']
81 | 
82 |             # NOTE: content is only for debug purpose
83 |             # item['content'] = thread_content
84 | 
85 |             items.append(item)
86 |             # yield Request(url, callback=parse_2)
87 |         return items
88 | 
89 |     def _process_request(self, request):
90 |         info('process ' + str(request))
91 |         return request
92 | 


--------------------------------------------------------------------------------
/startproject.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | set -e
 3 | 
 4 | usage() {
 5 |     echo "\n  usage:\n      ./startproject.sh <project name>\n"
 6 | }
 7 | 
 8 | if [ -z "$1" ]; then
 9 |     usage
10 |     exit
11 | fi
12 | 
13 | echo "Starting project $1."
14 | 
15 | cp -r template $1
16 | if [ "$(uname)" == "Darwin" ]; then
17 |     #alias sed='sed -i'
18 |     find $1 -type f | xargs sed -i '' "s/template/$1/"
19 | elif [ "$(expr substr $(uname -s) 1 5)" == "Linux" ]; then
20 |     find $1 -type f | xargs sed -i "s/template/$1/"
21 | elif [ "$(expr substr $(uname -s) 1 10)" == "MINGW32_NT" ]; then
22 |     find $1 -type f | xargs sed -i "s/template/$1/"
23 | fi
24 | mv $1/template $1/$1
25 | 
26 | echo "Create $1 succeed!"
27 | 


--------------------------------------------------------------------------------
/template/scrapy.cfg:
--------------------------------------------------------------------------------
 1 | # Automatically created by: scrapy startproject
 2 | #
 3 | # For more information about the [deploy] section see:
 4 | # http://doc.scrapy.org/en/latest/topics/scrapyd.html
 5 | 
 6 | [settings]
 7 | default = template.settings
 8 | 
 9 | [deploy]
10 | #url = http://localhost:6800/
11 | project = template
12 | 


--------------------------------------------------------------------------------
/template/template/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/geekan/scrapy-examples/edb1cb116bd6def65a6ef01f953b58eb43e54305/template/template/__init__.py


--------------------------------------------------------------------------------
/template/template/items.py:
--------------------------------------------------------------------------------
 1 | # Define here the models for your scraped items
 2 | #
 3 | # See documentation in:
 4 | # http://doc.scrapy.org/en/latest/topics/items.html
 5 | 
 6 | from scrapy.item import Item, Field
 7 | 
 8 | class templateItem(Item):
 9 |     # define the fields for your item here like:
10 |     name = Field()
11 | 
12 | 


--------------------------------------------------------------------------------
/template/template/pipelines.py:
--------------------------------------------------------------------------------
 1 | # Define your item pipelines here
 2 | #
 3 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting
 4 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
 5 | 
 6 | import redis
 7 | 
 8 | 
 9 | from scrapy import signals
10 | 
11 | 
12 | import json
13 | import codecs
14 | from collections import OrderedDict
15 | 
16 | 
17 | class JsonWithEncodingPipeline(object):
18 | 
19 |     def __init__(self):
20 |         self.file = codecs.open('data_utf8.json', 'w', encoding='utf-8')
21 | 
22 |     def process_item(self, item, spider):
23 |         line = json.dumps(OrderedDict(item), ensure_ascii=False, sort_keys=False) + "\n"
24 |         self.file.write(line)
25 |         return item
26 | 
27 |     def close_spider(self, spider):
28 |         self.file.close()
29 | 
30 | 
31 | class RedisPipeline(object):
32 | 
33 |     def __init__(self):
34 |         self.r = redis.StrictRedis(host='localhost', port=6379)
35 | 
36 |     def process_item(self, item, spider):
37 |         if not item['id']:
38 |             print 'no id item!!'
39 | 
40 |         str_recorded_item = self.r.get(item['id'])
41 |         final_item = None
42 |         if str_recorded_item is None:
43 |             final_item = item
44 |         else:
45 |             ritem = eval(self.r.get(item['id']))
46 |             final_item = dict(item.items() + ritem.items())
47 |         self.r.set(item['id'], final_item)
48 | 
49 |     def close_spider(self, spider):
50 |         return
51 | 


--------------------------------------------------------------------------------
/template/template/settings.py:
--------------------------------------------------------------------------------
 1 | # Scrapy settings for template project
 2 | #
 3 | # For simplicity, this file contains only the most important settings by
 4 | # default. All the other settings are documented here:
 5 | #
 6 | #     http://doc.scrapy.org/en/latest/topics/settings.html
 7 | #
 8 | 
 9 | import sys
10 | import os
11 | from os.path import dirname
12 | path = dirname(dirname(os.path.abspath(os.path.dirname(__file__))))
13 | sys.path.append(path)
14 | from misc.log import *
15 | 
16 | BOT_NAME = 'template'
17 | 
18 | SPIDER_MODULES = ['template.spiders']
19 | NEWSPIDER_MODULE = 'template.spiders'
20 | 
21 | # Crawl responsibly by identifying yourself (and your website) on the user-agent
22 | #USER_AGENT = 'template (+http://www.yourdomain.com)'
23 | 
24 | DOWNLOADER_MIDDLEWARES = {
25 |    # 'misc.middleware.CustomHttpProxyMiddleware': 400,
26 |     'misc.middleware.CustomUserAgentMiddleware': 401,
27 | }
28 | 
29 | ITEM_PIPELINES = {
30 |     'template.pipelines.JsonWithEncodingPipeline': 300,
31 |     #'template.pipelines.RedisPipeline': 301,
32 | }
33 | 
34 | LOG_LEVEL = 'INFO'
35 | 
36 | DOWNLOAD_DELAY = 1
37 | 


--------------------------------------------------------------------------------
/template/template/spiders/__init__.py:
--------------------------------------------------------------------------------
1 | # This package will contain the spiders of your Scrapy project
2 | #
3 | # Please refer to the documentation for information on how to create and manage
4 | # your spiders.
5 | 


--------------------------------------------------------------------------------
/template/template/spiders/spider.py:
--------------------------------------------------------------------------------
 1 | import re
 2 | import json
 3 | from urlparse import urlparse
 4 | import urllib
 5 | import pdb
 6 | 
 7 | 
 8 | from scrapy.selector import Selector
 9 | try:
10 |     from scrapy.spiders import Spider
11 | except:
12 |     from scrapy.spiders import BaseSpider as Spider
13 | from scrapy.utils.response import get_base_url
14 | from scrapy.spiders import CrawlSpider, Rule
15 | from scrapy.linkextractors import LinkExtractor as sle
16 | 
17 | 
18 | from template.items import *
19 | from misc.log import *
20 | from misc.spider import CommonSpider
21 | 
22 | 
23 | class templateSpider(CommonSpider):
24 |     name = "template"
25 |     allowed_domains = ["template.com"]
26 |     start_urls = [
27 |         "http://www.template.com/",
28 |     ]
29 |     rules = [
30 |         Rule(sle(allow=("/topsites/category;?[0-9]*/Top/World/Chinese_Simplified_CN/.*$")), callback='parse_1', follow=True),
31 |     ]
32 | 
33 |     list_css_rules = { 
34 |         '.linkto': {
35 |             'url': 'a::attr(href)',
36 |             'name': 'a::text',
37 |         }   
38 |     }   
39 | 
40 |     content_css_rules = { 
41 |         'text': '#Cnt-Main-Article-QQ p *::text',
42 |         'images': '#Cnt-Main-Article-QQ img::attr(src)',
43 |         'images-desc': '#Cnt-Main-Article-QQ div p+ p::text',
44 |     }
45 | 
46 |     def parse_1(self, response):
47 |         info('Parse '+response.url)
48 |         # x = self.parse_with_rules(response, self.list_css_rules, dict)
49 |         # x = self.parse_with_rules(response, self.content_css_rules, dict)
50 |         # print(json.dumps(x, ensure_ascii=False, indent=2))
51 |         # pp.pprint(x)
52 |         # return self.parse_with_rules(response, self.css_rules, templateItem)
53 | 


--------------------------------------------------------------------------------
/tutorial/scrapy.cfg:
--------------------------------------------------------------------------------
 1 | # Automatically created by: scrapy startproject
 2 | #
 3 | # For more information about the [deploy] section see:
 4 | # http://doc.scrapy.org/en/latest/topics/scrapyd.html
 5 | 
 6 | [settings]
 7 | default = tutorial.settings
 8 | 
 9 | [deploy]
10 | #url = http://localhost:6800/
11 | project = tutorial
12 | 


--------------------------------------------------------------------------------
/tutorial/tutorial/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/geekan/scrapy-examples/edb1cb116bd6def65a6ef01f953b58eb43e54305/tutorial/tutorial/__init__.py


--------------------------------------------------------------------------------
/tutorial/tutorial/items.py:
--------------------------------------------------------------------------------
 1 | # Define here the models for your scraped items
 2 | #
 3 | # See documentation in:
 4 | # http://doc.scrapy.org/en/latest/topics/items.html
 5 | 
 6 | from scrapy.item import Item, Field
 7 | 
 8 | class TutorialItem(Item):
 9 |     # define the fields for your item here like:
10 |     # name = Field()
11 |     title = Field()
12 |     link = Field()
13 |     desc = Field()
14 |     num = Field()
15 | 


--------------------------------------------------------------------------------
/tutorial/tutorial/misc/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/geekan/scrapy-examples/edb1cb116bd6def65a6ef01f953b58eb43e54305/tutorial/tutorial/misc/__init__.py


--------------------------------------------------------------------------------
/tutorial/tutorial/misc/log.py:
--------------------------------------------------------------------------------
 1 | 
 2 | from scrapy import log
 3 | 
 4 | def warn(msg):
 5 |     log.msg(str(msg), level=log.WARNING)
 6 | 
 7 | 
 8 | def info(msg):
 9 |     log.msg(str(msg), level=log.INFO)
10 | 
11 | 
12 | def debug(msg):
13 |     log.msg(str(msg), level=log.DEBUG)
14 | 
15 | 


--------------------------------------------------------------------------------
/tutorial/tutorial/misc/middleware.py:
--------------------------------------------------------------------------------
 1 | from scrapy import log
 2 | from proxy import PROXIES
 3 | from agents import AGENTS
 4 | 
 5 | import random
 6 | 
 7 | 
 8 | class CustomHttpProxyMiddleware(object):
 9 | 
10 |     def process_request(self, request, spider):
11 |         # TODO implement complex proxy providing algorithm
12 |         if self.use_proxy(request):
13 |             p = random.choice(PROXIES)
14 |             try:
15 |                 request.meta['proxy'] = "http://%s" % p['ip_port']
16 |             except Exception, e:
17 |                 log.msg("Exception %s" % e, _level=log.CRITICAL)
18 | 
19 |     def use_proxy(self, request):
20 |         """
21 |         using direct download for depth <= 2
22 |         using proxy with probability 0.3
23 |         """
24 |         if "depth" in request.meta and int(request.meta['depth']) <= 2:
25 |             return False
26 |         i = random.randint(1, 10)
27 |         return i <= 2
28 | 
29 | 
30 | class CustomUserAgentMiddleware(object):
31 |     def process_request(self, request, spider):
32 |         agent = random.choice(AGENTS)
33 |         request.headers['User-Agent'] = agent
34 | 


--------------------------------------------------------------------------------
/tutorial/tutorial/pipelines.py:
--------------------------------------------------------------------------------
 1 | # Define your item pipelines here
 2 | #
 3 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting
 4 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
 5 | 
 6 | from scrapy import signals
 7 | from scrapy.contrib.exporter import XmlItemExporter
 8 | 
 9 | 
10 | class TutorialPipeline(object):
11 |     def process_item(self, item, spider):
12 |         for field in item:
13 |             print field + ': ' + item[field][0]
14 |         return item
15 | 
16 | 
17 | class XmlExportPipeline(object):
18 | 
19 |     def __init__(self):
20 |         self.files = {}
21 | 
22 |     @classmethod
23 |     def from_crawler(cls, crawler):
24 |         pipeline = cls()
25 |         crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
26 |         crawler.signals.connect(pipeline.close_spider, signals.close_spider)
27 |         return pipeline
28 | 
29 |     def spider_opened(self, spider):
30 |         file = open('%s_products.xml' % spider.name, 'w+b')
31 |         self.files[spider] = file
32 |         self.exporter = XmlItemExporter(file)
33 |         self.exporter.start_exporting()
34 | 
35 |     def close_spider(self, spider):
36 |         self.exporter.finish_exporting()
37 |         file = self.files.pop(spider)
38 |         file.close()
39 | 
40 |     def process_item(self, item, spider):
41 |         self.exporter.export_item(item)
42 |         return item
43 | 
44 | 
45 | import json
46 | import codecs
47 | 
48 | class JsonWithEncodingPipeline(object):
49 | 
50 |     def __init__(self):
51 |         self.file = codecs.open('data_utf8.json', 'w', encoding='utf-8')
52 | 
53 |     def process_item(self, item, spider):
54 |         line = json.dumps(dict(item), ensure_ascii=False) + "\n"
55 |         self.file.write(line)
56 |         return item
57 | 
58 |     def close_spider(self, spider):
59 |         self.file.close()
60 | 


--------------------------------------------------------------------------------
/tutorial/tutorial/settings.py:
--------------------------------------------------------------------------------
 1 | # Scrapy settings for tutorial project
 2 | #
 3 | # For simplicity, this file contains only the most important settings by
 4 | # default. All the other settings are documented here:
 5 | #
 6 | #     http://doc.scrapy.org/en/latest/topics/settings.html
 7 | #
 8 | 
 9 | BOT_NAME = 'tutorial'
10 | 
11 | SPIDER_MODULES = ['tutorial.spiders']
12 | NEWSPIDER_MODULE = 'tutorial.spiders'
13 | ITEM_PIPELINES = {
14 |     #'tutorial.pipelines.JsonWithEncodingPipeline': 300,
15 | }
16 | #Crawl responsibly by identifying yourself (and your website) on the user-agent
17 | #USER_AGENT = 'tutorial (+http://www.yourdomain.com)'
18 | 
19 | DOWNLOADER_MIDDLEWARES = {
20 |     #'tutorial.misc.middleware.CustomHttpProxyMiddleware': 400,
21 |     'tutorial.misc.middleware.CustomUserAgentMiddleware': 401,
22 | }
23 | 
24 | LOG_LEVEL = 'INFO'
25 | 


--------------------------------------------------------------------------------
/tutorial/tutorial/spiders/__init__.py:
--------------------------------------------------------------------------------
1 | # This package will contain the spiders of your Scrapy project
2 | #
3 | # Please refer to the documentation for information on how to create and manage
4 | # your spiders.
5 | 


--------------------------------------------------------------------------------
/underdev/README:
--------------------------------------------------------------------------------
1 | Some crawlers are under dev because related pages should be rendered first.
2 | Consider to use selenium or similar tool.
3 | 


--------------------------------------------------------------------------------
/underdev/meijutt/meijutt/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/geekan/scrapy-examples/edb1cb116bd6def65a6ef01f953b58eb43e54305/underdev/meijutt/meijutt/__init__.py


--------------------------------------------------------------------------------
/underdev/meijutt/meijutt/items.py:
--------------------------------------------------------------------------------
 1 | # Define here the models for your scraped items
 2 | #
 3 | # See documentation in:
 4 | # http://doc.scrapy.org/en/latest/topics/items.html
 5 | 
 6 | from scrapy.item import Item, Field
 7 | 
 8 | class meijuttItem(Item):
 9 |     # define the fields for your item here like:
10 |     name = Field()
11 | 
12 | 


--------------------------------------------------------------------------------
/underdev/meijutt/meijutt/pipelines.py:
--------------------------------------------------------------------------------
 1 | # Define your item pipelines here
 2 | #
 3 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting
 4 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
 5 | 
 6 | import redis
 7 | 
 8 | 
 9 | from scrapy import signals
10 | 
11 | 
12 | import json
13 | import codecs
14 | from collections import OrderedDict
15 | 
16 | 
17 | class JsonWithEncodingPipeline(object):
18 | 
19 |     def __init__(self):
20 |         self.file = codecs.open('data_utf8.json', 'w', encoding='utf-8')
21 | 
22 |     def process_item(self, item, spider):
23 |         line = json.dumps(OrderedDict(item), ensure_ascii=False, sort_keys=False) + "\n"
24 |         self.file.write(line)
25 |         return item
26 | 
27 |     def close_spider(self, spider):
28 |         self.file.close()
29 | 
30 | 
31 | class RedisPipeline(object):
32 | 
33 |     def __init__(self):
34 |         self.r = redis.StrictRedis(host='localhost', port=6379)
35 | 
36 |     def process_item(self, item, spider):
37 |         if not item['id']:
38 |             print 'no id item!!'
39 | 
40 |         str_recorded_item = self.r.get(item['id'])
41 |         final_item = None
42 |         if str_recorded_item is None:
43 |             final_item = item
44 |         else:
45 |             ritem = eval(self.r.get(item['id']))
46 |             final_item = dict(item.items() + ritem.items())
47 |         self.r.set(item['id'], final_item)
48 | 
49 |     def close_spider(self, spider):
50 |         return
51 | 


--------------------------------------------------------------------------------
/underdev/meijutt/meijutt/settings.py:
--------------------------------------------------------------------------------
 1 | # Scrapy settings for meijutt project
 2 | #
 3 | # For simplicity, this file contains only the most important settings by
 4 | # default. All the other settings are documented here:
 5 | #
 6 | #     http://doc.scrapy.org/en/latest/topics/settings.html
 7 | #
 8 | 
 9 | import sys
10 | import os
11 | from os.path import dirname
12 | path = dirname(dirname(os.path.abspath(os.path.dirname(__file__))))
13 | sys.path.append(path)
14 | from misc.log import *
15 | 
16 | BOT_NAME = 'meijutt'
17 | 
18 | SPIDER_MODULES = ['meijutt.spiders']
19 | NEWSPIDER_MODULE = 'meijutt.spiders'
20 | 
21 | # Crawl responsibly by identifying yourself (and your website) on the user-agent
22 | #USER_AGENT = 'meijutt (+http://www.yourdomain.com)'
23 | 
24 | DOWNLOADER_MIDDLEWARES = {
25 |    # 'misc.middleware.CustomHttpProxyMiddleware': 400,
26 |     'misc.middleware.CustomUserAgentMiddleware': 401,
27 | }
28 | 
29 | ITEM_PIPELINES = {
30 |     'meijutt.pipelines.JsonWithEncodingPipeline': 300,
31 |     #'meijutt.pipelines.RedisPipeline': 301,
32 | }
33 | 
34 | LOG_LEVEL = 'INFO'
35 | 
36 | DOWNLOAD_DELAY = 1
37 | 


--------------------------------------------------------------------------------
/underdev/meijutt/meijutt/spiders/__init__.py:
--------------------------------------------------------------------------------
1 | # This package will contain the spiders of your Scrapy project
2 | #
3 | # Please refer to the documentation for information on how to create and manage
4 | # your spiders.
5 | 


--------------------------------------------------------------------------------
/underdev/meijutt/meijutt/spiders/spider.py:
--------------------------------------------------------------------------------
 1 | import re
 2 | import json
 3 | from urlparse import urlparse
 4 | import urllib
 5 | import pdb
 6 | 
 7 | 
 8 | from scrapy.selector import Selector
 9 | try:
10 |     from scrapy.spiders import Spider
11 | except:
12 |     from scrapy.spiders import BaseSpider as Spider
13 | from scrapy.utils.response import get_base_url
14 | from scrapy.spiders import CrawlSpider, Rule
15 | from scrapy.linkextractors import LinkExtractor as sle
16 | 
17 | 
18 | from meijutt.items import *
19 | from misc.log import *
20 | from misc.spider import CommonSpider
21 | 
22 | 
23 | class meijuttSpider(CommonSpider):
24 |     name = "meijutt"
25 |     allowed_domains = ["meijutt.com"]
26 |     start_urls = [
27 |         "http://www.meijutt.com/content/meiju117.html", # 3
28 |         "http://www.meijutt.com/content/meiju116.html", # 4
29 |     ]
30 |     rules = [
31 |         Rule(sle(allow=(".*meiju11[67]\.html$")), callback='parse_1', follow=False),
32 |     ]
33 | 
34 |     content_css_rules = {
35 |         '.downurl .adds': {
36 |             'links': 'input::attr(value)'
37 |         }
38 |     }
39 | 
40 |     def parse_1(self, response):
41 |         info('Parse '+response.url)
42 |         x = self.parse_with_rules(response, self.content_css_rules, dict)
43 |         pp.pprint(x)
44 |         # return self.parse_with_rules(response, self.css_rules, meijuttItem)
45 | 


--------------------------------------------------------------------------------
/underdev/meijutt/scrapy.cfg:
--------------------------------------------------------------------------------
 1 | # Automatically created by: scrapy startproject
 2 | #
 3 | # For more information about the [deploy] section see:
 4 | # http://doc.scrapy.org/en/latest/topics/scrapyd.html
 5 | 
 6 | [settings]
 7 | default = meijutt.settings
 8 | 
 9 | [deploy]
10 | #url = http://localhost:6800/
11 | project = meijutt
12 | 


--------------------------------------------------------------------------------
/underdev/twitch/README:
--------------------------------------------------------------------------------
1 | AJAX part. Need render engine.
2 | 
3 | Data format may be JSON:
4 |  - https://api.twitch.tv/kraken/videos/top?limit=20&offset=0&period=week&broadcast_type=all&on_site=1
5 | 


--------------------------------------------------------------------------------
/underdev/twitch/scrapy.cfg:
--------------------------------------------------------------------------------
 1 | # Automatically created by: scrapy startproject
 2 | #
 3 | # For more information about the [deploy] section see:
 4 | # http://doc.scrapy.org/en/latest/topics/scrapyd.html
 5 | 
 6 | [settings]
 7 | default = twitch.settings
 8 | 
 9 | [deploy]
10 | #url = http://localhost:6800/
11 | project = twitch
12 | 


--------------------------------------------------------------------------------
/underdev/twitch/twitch/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/geekan/scrapy-examples/edb1cb116bd6def65a6ef01f953b58eb43e54305/underdev/twitch/twitch/__init__.py


--------------------------------------------------------------------------------
/underdev/twitch/twitch/items.py:
--------------------------------------------------------------------------------
 1 | # Define here the models for your scraped items
 2 | #
 3 | # See documentation in:
 4 | # http://doc.scrapy.org/en/latest/topics/items.html
 5 | 
 6 | from scrapy.item import Item, Field
 7 | 
 8 | class twitchItem(Item):
 9 |     # define the fields for your item here like:
10 |     name = Field()
11 | 
12 | 


--------------------------------------------------------------------------------
/underdev/twitch/twitch/pipelines.py:
--------------------------------------------------------------------------------
 1 | # Define your item pipelines here
 2 | #
 3 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting
 4 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
 5 | 
 6 | import redis
 7 | 
 8 | 
 9 | from scrapy import signals
10 | 
11 | 
12 | import json
13 | import codecs
14 | from collections import OrderedDict
15 | 
16 | 
17 | class JsonWithEncodingPipeline(object):
18 | 
19 |     def __init__(self):
20 |         self.file = codecs.open('data_utf8.json', 'w', encoding='utf-8')
21 | 
22 |     def process_item(self, item, spider):
23 |         line = json.dumps(OrderedDict(item), ensure_ascii=False, sort_keys=False) + "\n"
24 |         self.file.write(line)
25 |         return item
26 | 
27 |     def close_spider(self, spider):
28 |         self.file.close()
29 | 
30 | 
31 | class RedisPipeline(object):
32 | 
33 |     def __init__(self):
34 |         self.r = redis.StrictRedis(host='localhost', port=6379)
35 | 
36 |     def process_item(self, item, spider):
37 |         if not item['id']:
38 |             print 'no id item!!'
39 | 
40 |         str_recorded_item = self.r.get(item['id'])
41 |         final_item = None
42 |         if str_recorded_item is None:
43 |             final_item = item
44 |         else:
45 |             ritem = eval(self.r.get(item['id']))
46 |             final_item = dict(item.items() + ritem.items())
47 |         self.r.set(item['id'], final_item)
48 | 
49 |     def close_spider(self, spider):
50 |         return
51 | 


--------------------------------------------------------------------------------
/underdev/twitch/twitch/settings.py:
--------------------------------------------------------------------------------
 1 | # Scrapy settings for twitch project
 2 | #
 3 | # For simplicity, this file contains only the most important settings by
 4 | # default. All the other settings are documented here:
 5 | #
 6 | #     http://doc.scrapy.org/en/latest/topics/settings.html
 7 | #
 8 | 
 9 | import sys
10 | import os
11 | from os.path import dirname
12 | path = dirname(dirname(os.path.abspath(os.path.dirname(__file__))))
13 | sys.path.append(path)
14 | from misc.log import *
15 | 
16 | BOT_NAME = 'twitch'
17 | 
18 | SPIDER_MODULES = ['twitch.spiders']
19 | NEWSPIDER_MODULE = 'twitch.spiders'
20 | 
21 | # Crawl responsibly by identifying yourself (and your website) on the user-agent
22 | #USER_AGENT = 'twitch (+http://www.yourdomain.com)'
23 | 
24 | DOWNLOADER_MIDDLEWARES = {
25 |    # 'misc.middleware.CustomHttpProxyMiddleware': 400,
26 |     'misc.middleware.CustomUserAgentMiddleware': 401,
27 | }
28 | 
29 | ITEM_PIPELINES = {
30 |     'twitch.pipelines.JsonWithEncodingPipeline': 300,
31 |     #'twitch.pipelines.RedisPipeline': 301,
32 | }
33 | 
34 | LOG_LEVEL = 'INFO'
35 | 
36 | DOWNLOAD_DELAY = 1
37 | 


--------------------------------------------------------------------------------
/underdev/twitch/twitch/spiders/__init__.py:
--------------------------------------------------------------------------------
1 | # This package will contain the spiders of your Scrapy project
2 | #
3 | # Please refer to the documentation for information on how to create and manage
4 | # your spiders.
5 | 


--------------------------------------------------------------------------------
/underdev/twitch/twitch/spiders/spider.py:
--------------------------------------------------------------------------------
 1 | import re
 2 | import json
 3 | from urlparse import urlparse
 4 | import urllib
 5 | import pdb
 6 | 
 7 | 
 8 | from scrapy.selector import Selector
 9 | try:
10 |     from scrapy.spiders import Spider
11 | except:
12 |     from scrapy.spiders import BaseSpider as Spider
13 | from scrapy.utils.response import get_base_url
14 | from scrapy.spiders import CrawlSpider, Rule
15 | from scrapy.linkextractors import LinkExtractor as sle
16 | 
17 | 
18 | from twitch.items import *
19 | from misc.log import *
20 | from misc.spider import CommonSpider
21 | 
22 | 
23 | class twitchSpider(CommonSpider):
24 |     name = "twitch"
25 |     allowed_domains = ["twitch.tv"]
26 |     start_urls = [
27 |         "https://www.twitch.tv/directory/game/Hearthstone%3A%20Heroes%20of%20Warcraft"
28 |     ]
29 |     rules = [
30 |         Rule(sle(allow=("https://www.twitch.tv/directory/game/Hearthstone%3A%20Heroes%20of%20Warcraft")), callback='parse_1', follow=True),
31 |     ]
32 | 
33 |     list_css_rules = { 
34 |         '.content': {
35 |             'room_name': '.meta .title a::text',
36 |             'author': '.meta .info a::text',
37 |             'people_count': '.meta .info a::attr(data-ember-action)'
38 |         }   
39 |     }   
40 | 
41 |     content_css_rules = { 
42 |         'text': '#Cnt-Main-Article-QQ p *::text',
43 |         'images': '#Cnt-Main-Article-QQ img::attr(src)',
44 |         'images-desc': '#Cnt-Main-Article-QQ div p+ p::text',
45 |     }
46 | 
47 |     def parse_1(self, response):
48 |         info('Parse '+response.url)
49 |         x = self.parse_with_rules(response, self.list_css_rules, dict)
50 |         # x = self.parse_with_rules(response, self.content_css_rules, dict)
51 |         import pdb; pdb.set_trace()
52 |         print(json.dumps(x, ensure_ascii=False, indent=2))
53 |         # pp.pprint(x)
54 |         # return self.parse_with_rules(response, self.css_rules, twitchItem)
55 | 


--------------------------------------------------------------------------------
/v2ex/scrapy.cfg:
--------------------------------------------------------------------------------
 1 | # Automatically created by: scrapy startproject
 2 | #
 3 | # For more information about the [deploy] section see:
 4 | # http://doc.scrapy.org/en/latest/topics/scrapyd.html
 5 | 
 6 | [settings]
 7 | default = v2ex.settings
 8 | 
 9 | [deploy]
10 | #url = http://localhost:6800/
11 | project = v2ex
12 | 


--------------------------------------------------------------------------------
/v2ex/v2ex/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/geekan/scrapy-examples/edb1cb116bd6def65a6ef01f953b58eb43e54305/v2ex/v2ex/__init__.py


--------------------------------------------------------------------------------
/v2ex/v2ex/items.py:
--------------------------------------------------------------------------------
 1 | # Define here the models for your scraped items
 2 | #
 3 | # See documentation in:
 4 | # http://doc.scrapy.org/en/latest/topics/items.html
 5 | 
 6 | from scrapy.item import Item, Field
 7 | 
 8 | class v2exItem(Item):
 9 |     # define the fields for your item here like:
10 |     name = Field()
11 | 
12 | 


--------------------------------------------------------------------------------
/v2ex/v2ex/pipelines.py:
--------------------------------------------------------------------------------
 1 | # Define your item pipelines here
 2 | #
 3 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting
 4 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
 5 | 
 6 | import redis
 7 | 
 8 | 
 9 | from scrapy import signals
10 | 
11 | 
12 | import json
13 | import codecs
14 | from collections import OrderedDict
15 | 
16 | 
17 | class JsonWithEncodingPipeline(object):
18 | 
19 |     def __init__(self):
20 |         self.file = codecs.open('data_utf8.json', 'w', encoding='utf-8')
21 | 
22 |     def process_item(self, item, spider):
23 |         line = json.dumps(OrderedDict(item), ensure_ascii=False, sort_keys=False) + "\n"
24 |         self.file.write(line)
25 |         return item
26 | 
27 |     def close_spider(self, spider):
28 |         self.file.close()
29 | 
30 | 
31 | class RedisPipeline(object):
32 | 
33 |     def __init__(self):
34 |         self.r = redis.StrictRedis(host='localhost', port=6379)
35 | 
36 |     def process_item(self, item, spider):
37 |         if not item['id']:
38 |             print 'no id item!!'
39 | 
40 |         str_recorded_item = self.r.get(item['id'])
41 |         final_item = None
42 |         if str_recorded_item is None:
43 |             final_item = item
44 |         else:
45 |             ritem = eval(self.r.get(item['id']))
46 |             final_item = dict(item.items() + ritem.items())
47 |         self.r.set(item['id'], final_item)
48 | 
49 |     def close_spider(self, spider):
50 |         return
51 | 


--------------------------------------------------------------------------------
/v2ex/v2ex/settings.py:
--------------------------------------------------------------------------------
 1 | # Scrapy settings for v2ex project
 2 | #
 3 | # For simplicity, this file contains only the most important settings by
 4 | # default. All the other settings are documented here:
 5 | #
 6 | #     http://doc.scrapy.org/en/latest/topics/settings.html
 7 | #
 8 | 
 9 | import sys
10 | import os
11 | from os.path import dirname
12 | path = dirname(dirname(os.path.abspath(os.path.dirname(__file__))))
13 | sys.path.append(path)
14 | from misc.log import *
15 | 
16 | BOT_NAME = 'v2ex'
17 | 
18 | SPIDER_MODULES = ['v2ex.spiders']
19 | NEWSPIDER_MODULE = 'v2ex.spiders'
20 | 
21 | # Crawl responsibly by identifying yourself (and your website) on the user-agent
22 | #USER_AGENT = 'v2ex (+http://www.yourdomain.com)'
23 | 
24 | DOWNLOADER_MIDDLEWARES = {
25 |    # 'misc.middleware.CustomHttpProxyMiddleware': 400,
26 |     'misc.middleware.CustomUserAgentMiddleware': 401,
27 | }
28 | 
29 | ITEM_PIPELINES = {
30 |     'v2ex.pipelines.JsonWithEncodingPipeline': 300,
31 |     #'v2ex.pipelines.RedisPipeline': 301,
32 | }
33 | 
34 | LOG_LEVEL = 'INFO'
35 | 
36 | DOWNLOAD_DELAY = 1
37 | 


--------------------------------------------------------------------------------
/v2ex/v2ex/spiders/__init__.py:
--------------------------------------------------------------------------------
1 | # This package will contain the spiders of your Scrapy project
2 | #
3 | # Please refer to the documentation for information on how to create and manage
4 | # your spiders.
5 | 


--------------------------------------------------------------------------------
/v2ex/v2ex/spiders/spider.py:
--------------------------------------------------------------------------------
 1 | import re
 2 | import json
 3 | from urlparse import urlparse
 4 | import urllib
 5 | import pdb
 6 | 
 7 | 
 8 | from scrapy.selector import Selector
 9 | try:
10 |     from scrapy.spiders import Spider
11 | except:
12 |     from scrapy.spiders import BaseSpider as Spider
13 | from scrapy.utils.response import get_base_url
14 | from scrapy.spiders import CrawlSpider, Rule
15 | from scrapy.linkextractors import LinkExtractor as sle
16 | 
17 | 
18 | from v2ex.items import *
19 | from misc.log import *
20 | from misc.spider import CommonSpider
21 | 
22 | 
23 | class v2exSpider(CommonSpider):
24 |     name = "v2ex"
25 |     allowed_domains = ["v2ex.com"]
26 |     start_urls = [
27 |         "http://www.v2ex.com/",
28 |     ]
29 |     rules = [
30 |         Rule(sle(allow=("http://www.v2ex.com/$")), callback='parse_1', follow=True),
31 |     ]
32 | 
33 |     list_css_rules = { 
34 |         '.cell.item': {
35 |             'title': '.item_title a::text',
36 |             'node': '.node::text',
37 |             'author': '.node+ strong a::text',
38 |             'reply_count': '.count_livid::text'
39 |         }   
40 |     }   
41 | 
42 |     def parse_1(self, response):
43 |         info('Parse '+response.url)
44 |         # import pdb; pdb.set_trace()
45 |         x = self.parse_with_rules(response, self.list_css_rules, dict)
46 |         print(json.dumps(x, ensure_ascii=False, indent=2))
47 |         #pp.pprint(x)
48 |         # return self.parse_with_rules(response, self.css_rules, v2exItem)
49 | 


--------------------------------------------------------------------------------
/youtube_trending/scrapy.cfg:
--------------------------------------------------------------------------------
 1 | # Automatically created by: scrapy startproject
 2 | #
 3 | # For more information about the [deploy] section see:
 4 | # http://doc.scrapy.org/en/latest/topics/scrapyd.html
 5 | 
 6 | [settings]
 7 | default = youtube_trending.settings
 8 | 
 9 | [deploy]
10 | #url = http://localhost:6800/
11 | project = youtube_trending
12 | 


--------------------------------------------------------------------------------
/youtube_trending/youtube_trending/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/geekan/scrapy-examples/edb1cb116bd6def65a6ef01f953b58eb43e54305/youtube_trending/youtube_trending/__init__.py


--------------------------------------------------------------------------------
/youtube_trending/youtube_trending/items.py:
--------------------------------------------------------------------------------
 1 | # Define here the models for your scraped items
 2 | #
 3 | # See documentation in:
 4 | # http://doc.scrapy.org/en/latest/topics/items.html
 5 | 
 6 | from scrapy.item import Item, Field
 7 | 
 8 | class youtube_trendingItem(Item):
 9 |     # define the fields for your item here like:
10 |     name = Field()
11 | 
12 | 


--------------------------------------------------------------------------------
/youtube_trending/youtube_trending/pipelines.py:
--------------------------------------------------------------------------------
 1 | # Define your item pipelines here
 2 | #
 3 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting
 4 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
 5 | 
 6 | import redis
 7 | 
 8 | 
 9 | from scrapy import signals
10 | 
11 | 
12 | import json
13 | import codecs
14 | from collections import OrderedDict
15 | 
16 | 
17 | class JsonWithEncodingPipeline(object):
18 | 
19 |     def __init__(self):
20 |         self.file = codecs.open('data_utf8.json', 'w', encoding='utf-8')
21 | 
22 |     def process_item(self, item, spider):
23 |         line = json.dumps(OrderedDict(item), ensure_ascii=False, sort_keys=False) + "\n"
24 |         self.file.write(line)
25 |         return item
26 | 
27 |     def close_spider(self, spider):
28 |         self.file.close()
29 | 
30 | 
31 | class RedisPipeline(object):
32 | 
33 |     def __init__(self):
34 |         self.r = redis.StrictRedis(host='localhost', port=6379)
35 | 
36 |     def process_item(self, item, spider):
37 |         if not item['id']:
38 |             print 'no id item!!'
39 | 
40 |         str_recorded_item = self.r.get(item['id'])
41 |         final_item = None
42 |         if str_recorded_item is None:
43 |             final_item = item
44 |         else:
45 |             ritem = eval(self.r.get(item['id']))
46 |             final_item = dict(item.items() + ritem.items())
47 |         self.r.set(item['id'], final_item)
48 | 
49 |     def close_spider(self, spider):
50 |         return
51 | 


--------------------------------------------------------------------------------
/youtube_trending/youtube_trending/settings.py:
--------------------------------------------------------------------------------
 1 | # Scrapy settings for youtube_trending project
 2 | #
 3 | # For simplicity, this file contains only the most important settings by
 4 | # default. All the other settings are documented here:
 5 | #
 6 | #     http://doc.scrapy.org/en/latest/topics/settings.html
 7 | #
 8 | 
 9 | import sys
10 | import os
11 | from os.path import dirname
12 | path = dirname(dirname(os.path.abspath(os.path.dirname(__file__))))
13 | sys.path.append(path)
14 | from misc.log import *
15 | 
16 | BOT_NAME = 'youtube_trending'
17 | 
18 | SPIDER_MODULES = ['youtube_trending.spiders']
19 | NEWSPIDER_MODULE = 'youtube_trending.spiders'
20 | 
21 | # Crawl responsibly by identifying yourself (and your website) on the user-agent
22 | #USER_AGENT = 'youtube_trending (+http://www.yourdomain.com)'
23 | 
24 | DOWNLOADER_MIDDLEWARES = {
25 |    # 'misc.middleware.CustomHttpProxyMiddleware': 400,
26 |     'misc.middleware.CustomUserAgentMiddleware': 401,
27 | }
28 | 
29 | ITEM_PIPELINES = {
30 |     'youtube_trending.pipelines.JsonWithEncodingPipeline': 300,
31 |     #'youtube_trending.pipelines.RedisPipeline': 301,
32 | }
33 | 
34 | LOG_LEVEL = 'INFO'
35 | 
36 | DOWNLOAD_DELAY = 1
37 | 


--------------------------------------------------------------------------------
/youtube_trending/youtube_trending/spiders/__init__.py:
--------------------------------------------------------------------------------
1 | # This package will contain the spiders of your Scrapy project
2 | #
3 | # Please refer to the documentation for information on how to create and manage
4 | # your spiders.
5 | 


--------------------------------------------------------------------------------
/youtube_trending/youtube_trending/spiders/spider.py:
--------------------------------------------------------------------------------
 1 | import re
 2 | import json
 3 | from urlparse import urlparse
 4 | import urllib
 5 | import pdb
 6 | 
 7 | 
 8 | from scrapy.selector import Selector
 9 | try:
10 |     from scrapy.spiders import Spider
11 | except:
12 |     from scrapy.spiders import BaseSpider as Spider
13 | from scrapy.utils.response import get_base_url
14 | from scrapy.spiders import CrawlSpider, Rule
15 | from scrapy.linkextractors import LinkExtractor as sle
16 | 
17 | 
18 | from youtube_trending.items import *
19 | from misc.log import *
20 | from misc.spider import CommonSpider
21 | 
22 | 
23 | class youtube_trendingSpider(CommonSpider):
24 |     name = "youtube_trending"
25 |     allowed_domains = ["youtube.com"]
26 |     start_urls = [
27 |         "https://www.youtube.com/feed/trending",
28 |     ]
29 |     rules = [
30 |         Rule(sle(allow=("feed/trending$")), callback='parse_1', follow=True),
31 |     ]
32 | 
33 |     list_css_rules = { 
34 |         '.yt-lockup-content': {
35 |             'video_title': '.yt-lockup-title a::text',
36 |             'author': '.yt-lockup-byline a::text',
37 |         }   
38 |     }   
39 | 
40 |     content_css_rules = { 
41 |         'text': '#Cnt-Main-Article-QQ p *::text',
42 |         'images': '#Cnt-Main-Article-QQ img::attr(src)',
43 |         'images-desc': '#Cnt-Main-Article-QQ div p+ p::text',
44 |     }
45 | 
46 |     def parse_1(self, response):
47 |         info('Parse '+response.url)
48 |         x = self.parse_with_rules(response, self.list_css_rules, dict)
49 |         # x = self.parse_with_rules(response, self.content_css_rules, dict)
50 |         print(json.dumps(x, ensure_ascii=False, indent=2))
51 |         # pp.pprint(x)
52 |         # return self.parse_with_rules(response, self.css_rules, youtube_trendingItem)
53 | 


--------------------------------------------------------------------------------
/zhibo8/run.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | set -x
4 | 
5 | scrapy crawl zhibo8_schedule 
6 | 


--------------------------------------------------------------------------------
/zhibo8/scrapy.cfg:
--------------------------------------------------------------------------------
 1 | # Automatically created by: scrapy startproject
 2 | #
 3 | # For more information about the [deploy] section see:
 4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html
 5 | 
 6 | [settings]
 7 | default = zhibo8.settings
 8 | 
 9 | [deploy]
10 | #url = http://localhost:6800/
11 | project = zhibo8
12 | 


--------------------------------------------------------------------------------
/zhibo8/zhibo8/README.md:
--------------------------------------------------------------------------------
1 | 抓取虎扑新闻和直播吧赛程
2 | 基于scrapy[http://scrapy.org/]
3 | 


--------------------------------------------------------------------------------
/zhibo8/zhibo8/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/geekan/scrapy-examples/edb1cb116bd6def65a6ef01f953b58eb43e54305/zhibo8/zhibo8/__init__.py


--------------------------------------------------------------------------------
/zhibo8/zhibo8/items.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | # Define here the models for your scraped items
 4 | #
 5 | # See documentation in:
 6 | # http://doc.scrapy.org/en/latest/topics/items.html
 7 | 
 8 | import scrapy
 9 | 
10 | 
11 | class SportItem(scrapy.Item):
12 |     # define the fields for your item here like:
13 |     #title = scrapy.Field()
14 |     #url = scrapy.Field() 
15 |     start_time = scrapy.Field() 
16 |     home_team = scrapy.Field() 
17 |     guest_team = scrapy.Field() 
18 |     guest_team = scrapy.Field() 
19 |     match_date = scrapy.Filed()
20 |     game_type = scrapy.Filed()
21 |     home_logo = scrapy.Filed()
22 |     guest_logo = scrapy.Filed()
23 | 


--------------------------------------------------------------------------------
/zhibo8/zhibo8/pipelines.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | # Define your item pipelines here
 4 | #
 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting
 6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
 7 | 
 8 | import MySQLdb
 9 | from utils.mysqldriver import MySQL
10 | 
11 | class SportPipeline(object):
12 |     _db = None
13 | 
14 |     def __init__(self):
15 |         dbconfig = {
16 |           'host':'localhost', 
17 |           'port': 3306, 
18 |           'user':'root', 
19 |           'passwd':'111111', 
20 |           'db':'sport', 
21 |           'charset':'utf8'
22 |         }
23 |     
24 |         self._db = MySQL(dbconfig)
25 | 
26 |     def process_item(self, item, spider):    
27 |         insert_sql = "INSERT INTO sport_schedule(home_team,guest_team,home_logo,guest_logo,match_date,game_type) values \
28 |                       ('%s','%s','%s','%s','%s','%s')" % (item['home_team'], item['guest_team'], item['home_logo'],\
29 |                       item['guest_logo'], item['match_date'], item['game_type'])
30 | 
31 |         self._db.insert(insert_sql)
32 |         return item
33 | 


--------------------------------------------------------------------------------
/zhibo8/zhibo8/settings.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-G
 2 | 
 3 | # Scrapy settings for zhibo8 project
 4 | #
 5 | # For simplicity, this file contains only settings considered important or
 6 | # commonly used. You can find more settings consulting the documentation:
 7 | #
 8 | #     http://doc.scrapy.org/en/latest/topics/settings.html
 9 | #     http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
10 | #     http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
11 | 
12 | BOT_NAME = 'zhibo8'
13 | 
14 | SPIDER_MODULES = ['zhibo8.spiders']
15 | NEWSPIDER_MODULE = 'zhibo8.spiders'
16 | 
17 | 
18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent
19 | #USER_AGENT = 'zhibo8 (+http://www.yourdomain.com)'
20 | 
21 | # Configure maximum concurrent requests performed by Scrapy (default: 16)
22 | #CONCURRENT_REQUESTS=32
23 | 
24 | # Configure a delay for requests for the same website (default: 0)
25 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
26 | # See also autothrottle settings and docs
27 | #DOWNLOAD_DELAY=3
28 | # The download delay setting will honor only one of:
29 | #CONCURRENT_REQUESTS_PER_DOMAIN=16
30 | #CONCURRENT_REQUESTS_PER_IP=16
31 | 
32 | # Disable cookies (enabled by default)
33 | #COOKIES_ENABLED=False
34 | 
35 | # Disable Telnet Console (enabled by default)
36 | #TELNETCONSOLE_ENABLED=False
37 | 
38 | # Override the default request headers:
39 | #DEFAULT_REQUEST_HEADERS = {
40 | #   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
41 | #   'Accept-Language': 'en',
42 | #}
43 | 
44 | # Enable or disable spider middlewares
45 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
46 | #SPIDER_MIDDLEWARES = {
47 | #    'zhibo8.middlewares.MyCustomSpiderMiddleware': 543,
48 | #}
49 | 
50 | # Enable or disable downloader middlewares
51 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
52 | #DOWNLOADER_MIDDLEWARES = {
53 | #    'zhibo8.middlewares.MyCustomDownloaderMiddleware': 543,
54 | #}
55 | 
56 | # Enable or disable extensions
57 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
58 | #EXTENSIONS = {
59 | #    'scrapy.telnet.TelnetConsole': None,
60 | #}
61 | 
62 | # Configure item pipelines
63 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
64 | ITEM_PIPELINES = {
65 |     'zhibo8.pipelines.SportPipeline': 300,
66 | }
67 | 
68 | # Enable and configure the AutoThrottle extension (disabled by default)
69 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html
70 | # NOTE: AutoThrottle will honour the standard settings for concurrency and delay
71 | #AUTOTHROTTLE_ENABLED=True
72 | # The initial download delay
73 | #AUTOTHROTTLE_START_DELAY=5
74 | # The maximum download delay to be set in case of high latencies
75 | #AUTOTHROTTLE_MAX_DELAY=60
76 | # Enable showing throttling stats for every response received:
77 | #AUTOTHROTTLE_DEBUG=False
78 | 
79 | # Enable and configure HTTP caching (disabled by default)
80 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
81 | #HTTPCACHE_ENABLED=True
82 | #HTTPCACHE_EXPIRATION_SECS=0
83 | #HTTPCACHE_DIR='httpcache'
84 | #HTTPCACHE_IGNORE_HTTP_CODES=[]
85 | #HTTPCACHE_STORAGE='scrapy.extensions.httpcache.FilesystemCacheStorage'
86 | 
87 | # start MySQL database configure setting
88 | MYSQL_HOST = 'localhost'
89 | MYSQL_DBNAME = 'zhibo8'
90 | MYSQL_USER = 'root'
91 | MYSQL_PASSWD = '111111'
92 | # end of MySQL database configure setting
93 | 


--------------------------------------------------------------------------------
/zhibo8/zhibo8/spiders/__init__.py:
--------------------------------------------------------------------------------
1 | # This package will contain the spiders of your Scrapy project
2 | #
3 | # Please refer to the documentation for information on how to create and manage
4 | # your spiders.
5 | 


--------------------------------------------------------------------------------
/zhibo8/zhibo8/spiders/example.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | import scrapy
 3 | 
 4 | 
 5 | class ExampleSpider(scrapy.Spider):
 6 |     name = "example"
 7 |     allowed_domains = ["example.com"]
 8 |     start_urls = (
 9 |         'http://www.example.com/',
10 |     )
11 | 
12 |     def parse(self, response):
13 |         pass
14 | 


--------------------------------------------------------------------------------
/zhibo8/zhibo8/spiders/hupu_news_spider.py:
--------------------------------------------------------------------------------
 1 | # encoding:utf8
 2 | """
 3 | 虎扑新闻抓取
 4 | """
 5 | import scrapy
 6 | 
 7 | class HupuNewsSpider(scrapy.Spider):
 8 |     name = "hupu_news"
 9 |     allowed_domains = ["hupu.com"]
10 |     start_urls = ["http://voice.hupu.com/nba/newslist"]
11 | 
12 |     def parse(self, response):
13 |         # refer : http://scrapy-chs.readthedocs.org/zh_CN/0.24/topics/selectors.html#topics-selectors-relative-xpaths
14 |         li_list = response.xpath('//div[@class="news-list"]/ul/li')
15 |         for li in li_list:
16 |             #print li.extract()
17 |             a = li.xpath('div/h4/a[1]')
18 |             #print a.extract()
19 |             title = li.xpath('div/h4/a[1]/text()').extract()
20 |             link = li.xpath('div/h4/a[1]/@href').extract()
21 |             print title[0],link[0]
22 | 


--------------------------------------------------------------------------------
/zhibo8/zhibo8/spiders/zhibo8_decrypt.py:
--------------------------------------------------------------------------------
 1 | # encoding:utf8
 2 | """
 3 | zhibo8直播源解密
 4 | """
 5 | import scrapy
 6 | import sys
 7 | import re 
 8 | import urllib2
 9 | reload(sys)
10 | sys.setdefaultencoding( "utf-8" )
11 | 
12 | test_url = 'http://zhibo8.cc/zhibo/zuqiu/2016/0203laisitechengvsliwupu.htm'
13 | 
14 | class Zhibo8Decrypt():
15 | 
16 |    def get_content(self, url, send_headers=''):
17 |        send_headers = {'User-Agent' : 'Mozilla/5.0 (Windows NT 6.2; rv:16.0) Gecko/20100101 Firefox/16.0'}
18 |        req = urllib2.Request(url, headers=send_headers)
19 |        ret = urllib2.urlopen(req)
20 |        html = ret.read()
21 |        return html
22 | 
23 |    def decrypt(self, content):
24 |         if '' == content:
25 |             return '' 
26 |         pattern = re.compile(r'C0ha0ne0l(.*?)')
27 |         ch = pattern.findall(content)
28 |         return ch
29 | 
30 | 
31 | zd = Zhibo8Decrypt()
32 | content = zd.get_content(test_url)
33 | zd.decrypt(content)
34 | 


--------------------------------------------------------------------------------
/zhibo8/zhibo8/spiders/zhibo8_schedule_spider.py:
--------------------------------------------------------------------------------
 1 | # encoding:utf8
 2 | """
 3 | zhibo8比分
 4 | """
 5 | import scrapy
 6 | import sys
 7 | reload(sys)
 8 | sys.setdefaultencoding( "utf-8" )
 9 | 
10 | game_type_list = [
11 |         u'CBA', 
12 |         u'NBA', 
13 |         u'法甲', 
14 |         u'英超', 
15 |         u'西甲', 
16 |         u'意甲', 
17 |         u'德甲', 
18 |         u'足总杯', 
19 |         u'国王杯', 
20 |         u'德国杯', 
21 |         u'解放者杯',
22 |         u'意大利杯',
23 |  ] 
24 | 
25 | 
26 | class Zhibo8ScheduleSpider(scrapy.Spider):
27 |     name = "zhibo8_schedule"
28 |     allowed_domains = ["zhibo8.cc"]
29 |     start_urls = ["http://zhibo8.cc/"]
30 | 
31 |     def parse(self, response):
32 |         arr_matches = []
33 |         # refer : http://scrapy-chs.readthedocs.org/zh_CN/0.24/topics/selectors.html#topics-selectors-relative-xpaths
34 |         div_list = response.xpath('//div[@class="schedule_container left"]/div[@class="box"]')
35 |         for div in div_list:
36 |             match_date = div.xpath('div[@class="titlebar"]/h2[1]/@title').extract()[0]
37 |             li_list = div.xpath('div[@class="content"]/ul/li')
38 |             ymd = match_date.replace('-', '')
39 |             for li in li_list:
40 |                 match = {}
41 |                 match['tags'] = li.xpath('./@label').extract()[0]
42 |                 text_content = li.xpath('string(.)').extract()[0]
43 |                 text_content = text_content.replace('  ', ' ')
44 |                 arr_content = text_content.split(' ')
45 |                 #print 0,arr_content[0],1,arr_content[1],2,arr_content[2],3,arr_content[3],4,arr_content[4],\
46 |                 #        5,arr_content[5],6,arr_content[6],7,arr_content[7]
47 |                 if len(arr_content) < 5 or '-' != arr_content[3]:
48 |                     continue
49 |                 match['start_time'] = match_date + ' ' + arr_content[0] + ':00'
50 |                 match['home_team']  = arr_content[2]
51 |                 match['guest_team'] = arr_content[4]
52 |                 match['match_date'] = ymd
53 |                 match['game_type']  = self.get_gametype(arr_content[1])
54 |                 match['home_logo']  = self.get_home_logo(li)
55 |                 match['guest_logo'] = self.get_guest_logo(li)
56 |                 #print match['start_time'],match['home_team'],match['guest_team'],match['home_logo'],match['guest_logo']
57 |                 arr_matches.append(match)
58 | 
59 |         return arr_matches
60 | 
61 | 
62 |     def get_gametype(self, s):
63 |         for game_type in game_type_list:
64 |             if game_type in s:
65 |                 return game_type
66 |         return s
67 | 
68 | 
69 |     def get_home_logo(self, li):
70 |         if li.xpath('./img[1]'):
71 |             return li.xpath('./img[1]/@src').extract()[0]
72 |         elif li.xpath('./b/img[1]'):
73 |             return li.xpath('./b/img[1]/@src').extract()[0]
74 |         else:
75 |             return ''
76 | 
77 |     def get_guest_logo(self, li):
78 |         if li.xpath('./img[2]'):
79 |             return li.xpath('./img[2]/@src').extract()[0]
80 |         elif li.xpath('./b/img[2]'):
81 |             return li.xpath('./b/img[2]/@src').extract()[0]
82 |         else:
83 |             return ''
84 | 
85 | 


--------------------------------------------------------------------------------
/zhibo8/zhibo8/utils/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/geekan/scrapy-examples/edb1cb116bd6def65a6ef01f953b58eb43e54305/zhibo8/zhibo8/utils/__init__.py


--------------------------------------------------------------------------------
/zhibo8/zhibo8/utils/mysqldriver.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | # -*- coding: utf-8 -*- 
  3 | u'''对MySQLdb常用函数进行封装的类
  4 |  
  5 |  整理者：兔大侠和他的朋友们（http://www.tudaxia.com）
  6 |  日期：2014-04-22
  7 |  出处：源自互联网，共享于互联网:-)
  8 |  
  9 |  注意：使用这个类的前提是正确安装 MySQL-Python模块。
 10 |  官方网站：http://mysql-python.sourceforge.net/
 11 | '''
 12 | 
 13 | import MySQLdb
 14 | import time
 15 | 
 16 | class MySQL:
 17 |     u'''对MySQLdb常用函数进行封装的类'''
 18 |     
 19 |     error_code = '' #MySQL错误号码
 20 | 
 21 |     _instance = None #本类的实例
 22 |     _conn = None # 数据库conn
 23 |     _cur = None #游标
 24 | 
 25 |     _TIMEOUT = 30 #默认超时30秒
 26 |     _timecount = 0
 27 |       
 28 |     def __init__(self, dbconfig):
 29 |       u'构造器：根据数据库连接参数，创建MySQL连接'
 30 |       try:
 31 |         self._conn = MySQLdb.connect(host=dbconfig['host'],
 32 |                        port=dbconfig['port'], 
 33 |                        user=dbconfig['user'],
 34 |                        passwd=dbconfig['passwd'],
 35 |                        db=dbconfig['db'],
 36 |                        charset=dbconfig['charset'])
 37 |       except MySQLdb.Error, e:
 38 |         self.error_code = e.args[0]
 39 |         error_msg = 'MySQL error! ', e.args[0], e.args[1]
 40 |         print error_msg
 41 |         
 42 |         # 如果没有超过预设超时时间，则再次尝试连接，
 43 |         if self._timecount < self._TIMEOUT:
 44 |           interval = 5
 45 |           self._timecount += interval
 46 |           time.sleep(interval)
 47 |           return self.__init__(dbconfig)
 48 |         else:
 49 |           raise Exception(error_msg)
 50 |       
 51 |       self._cur = self._conn.cursor()
 52 |       self._instance = MySQLdb
 53 | 
 54 |     def query(self,sql):
 55 |       u'执行 SELECT 语句'  
 56 |       try:
 57 |         self._cur.execute("SET NAMES utf8") 
 58 |         result = self._cur.execute(sql)
 59 |       except MySQLdb.Error, e:
 60 |         self.error_code = e.args[0]
 61 |         print "数据库错误代码:",e.args[0],e.args[1]
 62 |         result = False
 63 |       return result
 64 | 
 65 |     def update(self,sql):
 66 |       u'执行 UPDATE 及 DELETE 语句'
 67 |       try:
 68 |         self._cur.execute("SET NAMES utf8") 
 69 |         result = self._cur.execute(sql)
 70 |         self._conn.commit()
 71 |       except MySQLdb.Error, e:
 72 |         self.error_code = e.args[0]
 73 |         print "数据库错误代码:",e.args[0],e.args[1]
 74 |         result = False
 75 |       return result
 76 |       
 77 |     def insert(self,sql):
 78 |       u'执行 INSERT 语句。如主键为自增长int，则返回新生成的ID'
 79 |       try:
 80 |         self._cur.execute("SET NAMES utf8")
 81 |         self._cur.execute(sql)
 82 |         self._conn.commit()
 83 |         return self._conn.insert_id()
 84 |       except MySQLdb.Error, e:
 85 |         self.error_code = e.args[0]
 86 |         return False
 87 |     
 88 |     def fetchAllRows(self):
 89 |       u'返回结果列表'
 90 |       return self._cur.fetchall()
 91 | 
 92 |     def fetchOneRow(self):
 93 |       u'返回一行结果，然后游标指向下一行。到达最后一行以后，返回None'
 94 |       return self._cur.fetchone()
 95 |    
 96 |     def getRowCount(self):
 97 |       u'获取结果行数'
 98 |       return self._cur.rowcount
 99 |                 
100 |     def commit(self):
101 |       u'数据库commit操作'
102 |       self._conn.commit()
103 |               
104 |     def rollback(self):
105 |       u'数据库回滚操作'
106 |       self._conn.rollback()
107 |          
108 |     def __del__(self): 
109 |       u'释放资源（系统GC自动调用）'
110 |       try:
111 |         self._cur.close() 
112 |         self._conn.close() 
113 |       except:
114 |         pass
115 |       
116 |     def  close(self):
117 |       u'关闭数据库连接'
118 |       self.__del__()
119 |    
120 | """
121 | if __name__ == '__main__':
122 |     '''使用样例'''
123 |     
124 |     #数据库连接参数  
125 |     dbconfig = {'host':'localhost', 
126 |           'port': 3306, 
127 |           'user':'dbuser', 
128 |           'passwd':'dbpassword', 
129 |           'db':'testdb', 
130 |           'charset':'utf8'}
131 |     
132 |     #连接数据库，创建这个类的实例
133 |     db = MySQL(dbconfig)
134 |     
135 |     #操作数据库
136 |     sql = "SELECT * FROM `sample_table`"
137 |     db.query(sql);
138 |     
139 |     #获取结果列表
140 |     result = db.fetchAllRows();
141 |     
142 |     #相当于php里面的var_dump
143 |     print result
144 |     
145 |     #对行进行循环
146 |     for row in result:
147 |       #使用下标进行取值
148 |       #print row[0]
149 |       
150 |       #对列进行循环
151 |       for colum in row:
152 |         print colum
153 |    
154 |     #关闭数据库
155 |     db.close()
156 | """
157 | 


--------------------------------------------------------------------------------
/zhihu/scrapy.cfg:
--------------------------------------------------------------------------------
 1 | # Automatically created by: scrapy startproject
 2 | #
 3 | # For more information about the [deploy] section see:
 4 | # http://doc.scrapy.org/en/latest/topics/scrapyd.html
 5 | 
 6 | [settings]
 7 | default = zhihu.settings
 8 | 
 9 | [deploy]
10 | #url = http://localhost:6800/
11 | project = zhihu
12 | 


--------------------------------------------------------------------------------
/zhihu/zhihu/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/geekan/scrapy-examples/edb1cb116bd6def65a6ef01f953b58eb43e54305/zhihu/zhihu/__init__.py


--------------------------------------------------------------------------------
/zhihu/zhihu/items.py:
--------------------------------------------------------------------------------
 1 | # Define here the models for your scraped items
 2 | #
 3 | # See documentation in:
 4 | # http://doc.scrapy.org/en/latest/topics/items.html
 5 | 
 6 | from scrapy.item import Item, Field
 7 | 
 8 | class ZhihuPeopleItem(Item):
 9 |     # define the fields for your item here like:
10 |     id = Field()
11 |     name = Field()
12 |     sign = Field()
13 |     location = Field()
14 |     business = Field()
15 |     employment = Field()
16 |     position = Field()
17 |     education = Field()
18 |     education_extra = Field()
19 |     description = Field()
20 |     agree = Field()
21 |     thanks = Field()
22 |     asks = Field()
23 |     answers = Field()
24 |     posts = Field()
25 |     collections = Field()
26 |     logs = Field()
27 |     followees = Field()
28 |     followers = Field()
29 |     follow_topics = Field()
30 | 
31 | 


--------------------------------------------------------------------------------
/zhihu/zhihu/pipelines.py:
--------------------------------------------------------------------------------
 1 | # Define your item pipelines here
 2 | #
 3 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting
 4 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
 5 | 
 6 | import redis
 7 | 
 8 | 
 9 | from scrapy import signals
10 | 
11 | 
12 | import json
13 | import codecs
14 | from collections import OrderedDict
15 | 
16 | 
17 | from misc.log import *
18 | 
19 | 
20 | class JsonWithEncodingPipeline(object):
21 | 
22 |     def __init__(self):
23 |         self.file = codecs.open('data_utf8.json', 'w', encoding='utf-8')
24 | 
25 |     def process_item(self, item, spider):
26 |         line = json.dumps(OrderedDict(item), ensure_ascii=False, sort_keys=False) + "\n"
27 |         self.file.write(line)
28 |         return item
29 | 
30 |     def close_spider(self, spider):
31 |         self.file.close()
32 | 
33 | 
34 | class RedisPipeline(object):
35 | 
36 |     def __init__(self):
37 |         self.r = redis.StrictRedis(host='localhost', port=6379)
38 | 
39 |     def process_item(self, item, spider):
40 |         if not item['id']:
41 |             print 'no id item!!'
42 | 
43 |         str_recorded_item = self.r.get(item['id'])
44 |         final_item = None
45 |         if str_recorded_item is None:
46 |             final_item = item
47 |         else:
48 |             ritem = eval(self.r.get(item['id']))
49 |             if ritem == item:
50 |                 debug('item '+item['id']+' equal')
51 |             else:
52 |                 # info('item '+item['id']+' merge\n'+str(item)+'\n'+str(ritem))
53 |                 info('item '+item['id']+' use new item')
54 |             # final_item = dict(item.items() + ritem.items())
55 |             final_item = item
56 |         self.r.set(item['id'], final_item)
57 | 
58 |     def close_spider(self, spider):
59 |         return
60 | 


--------------------------------------------------------------------------------
/zhihu/zhihu/redis-test.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | 
 3 | import redis
 4 | import json
 5 | 
 6 | 
 7 | r = redis.StrictRedis(host='localhost', port=6379)
 8 | 
 9 | 
10 | def dump_all(redis=r):
11 |     keys = redis.keys('*')
12 |     pairs = {}
13 |     for key in keys:
14 |         type = redis.type(key)
15 |         val = redis.get(key)
16 |         try:
17 |             pairs[key] = eval(val)
18 |         except Exception as e:
19 |             print pairs, key, val, e
20 |     return pairs
21 | 
22 | def del_all(redis=r):
23 |     keys = redis.keys('*')
24 |     for k in keys:
25 |         print 'Deleting:', k, 'result is', redis.delete(k)
26 | 
27 | def main():
28 |     # del_all()
29 |     # print json.dumps(dump_all(), indent=4)
30 |     keys = r.keys('*')
31 |     print keys
32 |     print len(keys)
33 | 
34 | if __name__ == '__main__':
35 |     main()
36 | 


--------------------------------------------------------------------------------
/zhihu/zhihu/settings.py:
--------------------------------------------------------------------------------
 1 | # Scrapy settings for zhihu project
 2 | #
 3 | # For simplicity, this file contains only the most important settings by
 4 | # default. All the other settings are documented here:
 5 | #
 6 | #     http://doc.scrapy.org/en/latest/topics/settings.html
 7 | #
 8 | 
 9 | import sys
10 | import os
11 | from os.path import dirname
12 | path = dirname(dirname(os.path.abspath(os.path.dirname(__file__))))
13 | sys.path.append(path)
14 | from misc.log import *
15 | 
16 | BOT_NAME = 'zhihu'
17 | 
18 | SPIDER_MODULES = ['zhihu.spiders']
19 | NEWSPIDER_MODULE = 'zhihu.spiders'
20 | 
21 | # Crawl responsibly by identifying yourself (and your website) on the user-agent
22 | #USER_AGENT = 'zhihu (+http://www.yourdomain.com)'
23 | 
24 | DOWNLOADER_MIDDLEWARES = {
25 |    # 'misc.middleware.CustomHttpProxyMiddleware': 400,
26 |     'misc.middleware.CustomUserAgentMiddleware': 401,
27 | }
28 | 
29 | ITEM_PIPELINES = {
30 |     'zhihu.pipelines.JsonWithEncodingPipeline': 300,
31 |     'zhihu.pipelines.RedisPipeline': 301,
32 | }
33 | 
34 | LOG_LEVEL = 'INFO'
35 | 
36 | DOWNLOAD_DELAY = 1
37 | 


--------------------------------------------------------------------------------
/zhihu/zhihu/spiders/__init__.py:
--------------------------------------------------------------------------------
1 | # This package will contain the spiders of your Scrapy project
2 | #
3 | # Please refer to the documentation for information on how to create and manage
4 | # your spiders.
5 | 


--------------------------------------------------------------------------------
/zhihu/zhihu/spiders/zhihu_spider.py:
--------------------------------------------------------------------------------
  1 | #coding: utf-8
  2 | 
  3 | import re
  4 | import json
  5 | from urlparse import urlparse
  6 | 
  7 | 
  8 | from scrapy.selector import Selector
  9 | try:
 10 |     from scrapy.spiders import Spider
 11 | except:
 12 |     from scrapy.spiders import BaseSpider as Spider
 13 | from scrapy.utils.response import get_base_url
 14 | from scrapy.spiders import CrawlSpider, Rule
 15 | from scrapy.linkextractors import LinkExtractor as sle
 16 | 
 17 | 
 18 | from zhihu.items import *
 19 | from misc.log import *
 20 | 
 21 | '''
 22 | 1. 默认取sel.css()[0]，如否则需要'__unique':false
 23 | 2. 默认字典均为css解析，如否则需要'__use':'dump'表明是用于dump数据
 24 | '''
 25 | 
 26 | class ZhihuSpider(CrawlSpider):
 27 |     name = "zhihu"
 28 |     allowed_domains = ["zhihu.com"]
 29 |     start_urls = [
 30 |         "http://www.zhihu.com/",
 31 |         "http://www.zhihu.com/people/jia-yang-qing-74",
 32 |     ]
 33 |     rules = [
 34 |         Rule(sle(allow=("/people/[^/]+/followees$")), callback='parse_followees'),
 35 |         Rule(sle(allow=("/people/[^/]+/followers$", )), callback='parse_followers'),
 36 |         Rule(sle(allow=("/people/[^/]+$", )), callback='parse_people_with_rules', follow=True),
 37 |     ]
 38 | 
 39 |     # need dfs/bfs
 40 |     all_css_rules = {
 41 |         '.zm-profile-header': {
 42 |             '.zm-profile-header-main': {
 43 |                 '__use':'dump',
 44 |                 'name':'.title-section .name::text',
 45 |                 'sign':'.title-section .bio::text',
 46 |                 'location':'.location.item::text',
 47 |                 'business':'.business.item::text',
 48 |                 'employment':'.employment.item::text',
 49 |                 'position':'.position.item::text',
 50 |                 'education':'.education.item::text',
 51 |                 'education_extra':'.education-extra.item::text',
 52 |             }, '.zm-profile-header-operation': {
 53 |                 '__use':'dump',
 54 |                 'agree':'.zm-profile-header-user-agree strong::text',
 55 |                 'thanks':'.zm-profile-header-user-thanks strong::text',
 56 |             }, '.profile-navbar': {
 57 |                 '__use':'dump',
 58 |                 'asks':'a[href*=asks] .num::text',
 59 |                 'answers':'a[href*=answers] .num::text',
 60 |                 'posts':'a[href*=posts] .num::text',
 61 |                 'collections':'a[href*=collections] .num::text',
 62 |                 'logs':'a[href*=logs] .num::text',
 63 |             },
 64 |         }, '.zm-profile-side-following': {
 65 |             '__use':'dump',
 66 |             'followees':'a.item[href*=followees] strong::text',
 67 |             'followers':'a.item[href*=followers] strong::text',
 68 |         }
 69 |     }
 70 | 
 71 |     def traversal(self, sel, rules, item):
 72 |         # print 'traversal:', sel, rules.keys()
 73 |         if '__use' in rules:
 74 |             for nk, nv in rules.items():
 75 |                 if nk == '__use':
 76 |                     continue
 77 |                 if nk not in item:
 78 |                     item[nk] = []
 79 |                 if sel.css(nv):
 80 |                     item[nk] += [i.extract() for i in sel.css(nv)]
 81 |                 else:
 82 |                     item[nk] = []
 83 |         else:
 84 |             for nk, nv in rules.items():
 85 |                 for i in sel.css(nk):
 86 |                     self.traversal(i, nv, item)
 87 | 
 88 |     def dfs(self, sel, rules, item_class):
 89 |         if sel is None:
 90 |             return []
 91 |         item = item_class()
 92 |         self.traversal(sel, rules, item)
 93 |         return item
 94 | 
 95 |     def parse_with_rules(self, response, rules, item_class):
 96 |         return self.dfs(Selector(response), rules, item_class)
 97 | 
 98 |     def parse_people_with_rules(self, response):
 99 |         item = self.parse_with_rules(response, self.all_css_rules, ZhihuPeopleItem)
100 |         item['id'] = urlparse(response.url).path.split('/')[-1]
101 |         info('Parsed '+response.url) # +' to '+str(item))
102 |         return item
103 | 
104 |     def parse_followers(self, response):
105 |         return self.parse_people_with_rules(response)
106 | 
107 |     def parse_followees(self, response):
108 |         return self.parse_people_with_rules(response)
109 | 


--------------------------------------------------------------------------------
/ziroom/scrapy.cfg:
--------------------------------------------------------------------------------
 1 | # Automatically created by: scrapy startproject
 2 | #
 3 | # For more information about the [deploy] section see:
 4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html
 5 | 
 6 | [settings]
 7 | default = ziroom.settings
 8 | 
 9 | [deploy]
10 | #url = http://localhost:6800/
11 | project = ziroom
12 | 


--------------------------------------------------------------------------------
/ziroom/ziroom/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/geekan/scrapy-examples/edb1cb116bd6def65a6ef01f953b58eb43e54305/ziroom/ziroom/__init__.py


--------------------------------------------------------------------------------
/ziroom/ziroom/items.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | # Define here the models for your scraped items
 4 | #
 5 | # See documentation in:
 6 | # http://doc.scrapy.org/en/latest/topics/items.html
 7 | 
 8 | import scrapy
 9 | 
10 | 
11 | class ZiroomItem(scrapy.Item):
12 |     # define the fields for your item here like:
13 |     # name = scrapy.Field()
14 |     room_id = scrapy.Field()
15 |     room_price = scrapy.Field()
16 |     room_name = scrapy.Field()
17 |     modifyDate = scrapy.Field()
18 | 


--------------------------------------------------------------------------------
/ziroom/ziroom/pipelines.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | # Define your item pipelines here
 3 | #
 4 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting
 5 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
 6 | 
 7 | import redis
 8 | 
 9 | 
10 | from scrapy import signals
11 | 
12 | 
13 | import json
14 | import codecs
15 | from collections import OrderedDict
16 | 
17 | 
18 | class JsonWithEncodingPipeline(object):
19 | 
20 |     def __init__(self):
21 |         self.file = codecs.open('data_utf8.json', 'w', encoding='utf-8')
22 | 
23 |     def process_item(self, item, spider):
24 |         line = json.dumps(OrderedDict(item), ensure_ascii=False, sort_keys=False) + "\n"
25 |         self.file.write(line)
26 |         return item
27 | 
28 |     def close_spider(self, spider):
29 |         self.file.close()
30 | 
31 | 
32 | class RedisPipeline(object):
33 | 
34 |     def __init__(self):
35 |         self.r = redis.StrictRedis(host='localhost', port=6379)
36 | 
37 |     def process_item(self, item, spider):
38 |         if not item['id']:
39 |             print 'no id item!!'
40 | 
41 |         str_recorded_item = self.r.get(item['id'])
42 |         final_item = None
43 |         if str_recorded_item is None:
44 |             final_item = item
45 |         else:
46 |             ritem = eval(self.r.get(item['id']))
47 |             final_item = dict(item.items() + ritem.items())
48 |         self.r.set(item['id'], final_item)
49 | 
50 |     def spider_closed(self, spider):
51 |         return
52 | 


--------------------------------------------------------------------------------
/ziroom/ziroom/settings.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | # Scrapy settings for ziroom project
 3 | #
 4 | # For simplicity, this file contains only the most important settings by
 5 | # default. All the other settings are documented here:
 6 | #
 7 | #     http://doc.scrapy.org/en/latest/topics/settings.html
 8 | #
 9 | 
10 | import sys
11 | import os
12 | from os.path import dirname
13 | path = dirname(dirname(os.path.abspath(os.path.dirname(__file__))))
14 | sys.path.append(path)
15 | 
16 | BOT_NAME = 'ziroom'
17 | 
18 | SPIDER_MODULES = ['ziroom.spiders']
19 | NEWSPIDER_MODULE = 'ziroom.spiders'
20 | 
21 | 
22 | 
23 | DOWNLOADER_MIDDLEWARES = {
24 |     #'misc.middleware.CustomHttpProxyMiddleware': 400,
25 |     'misc.middleware.CustomUserAgentMiddleware': 401,
26 | }
27 | 
28 | ITEM_PIPELINES = {
29 |     'ziroom.pipelines.JsonWithEncodingPipeline': 300,
30 |     #'template.pipelines.RedisPipeline': 301,
31 | }
32 | 
33 | LOG_LEVEL = 'INFO'
34 | 


--------------------------------------------------------------------------------
/ziroom/ziroom/spiders/__init__.py:
--------------------------------------------------------------------------------
1 | # This package will contain the spiders of your Scrapy project
2 | #
3 | # Please refer to the documentation for information on how to create and manage
4 | # your spiders.
5 | 


--------------------------------------------------------------------------------
/ziroom/ziroom/spiders/spider.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | from scrapy.spiders import Spider
 3 | from scrapy import Request
 4 | import re
 5 | import time
 6 | 
 7 | from ziroom.items import ZiroomItem
 8 | 
 9 | class Parse():
10 |     def __init__(self, response):
11 |         self.response = response
12 |         self.room_detail = response.xpath('//div[@class="room_detail_right"]')[0]
13 |         self.room_info = ' '.join(self.room_detail.xpath('.//ul[@class="detail_room"]/li/text()').extract())
14 |         self.metro_info = ''.join(self.room_detail.xpath('.//span[@id="lineList"]/text()').extract()).replace(' ', '').replace('\n',
15 |                                                                                                                   '')
16 |     def getID(self):
17 |         return int(re.findall('\d+', self.response.url)[0])
18 |     def getName(self):
19 |         return self.room_detail.xpath('.//h2/text()').extract()[0].replace(' ', '').replace('\n', '')
20 |     def getPrice(self):
21 |         room_price = int(self.room_detail.xpath('.//span[@class="room_price"]/text()').extract()[0][1:])
22 |         if room_price < 500:
23 |             room_price *= 30
24 |         return room_price
25 | 
26 | 
27 | 
28 | class PagesSpider(Spider):
29 |     name = "ziroom"
30 |     start_urls = ['http://www.ziroom.com/z/nl/z3.html?p=1']
31 | 
32 |     def parse(self, response):
33 |         print response.url
34 |         page = re.findall('p=(\d+)', response.url)[0]
35 | 
36 |         houseList = response.xpath('//ul[@id="houseList"]/li')
37 |         for each in houseList:
38 |             url = each.xpath('div/h3/a/@href').extract()[0][2:].encode('utf-8')
39 |             yield Request('http://' + url, self.parseItem)
40 | 
41 |         url = response.url
42 |         url_new = url.replace(page, str(int(page) + 1))
43 |         # yield Request(url_new, self.parse)
44 | 
45 |     def parseItem(self, response):
46 |         p = Parse(response)
47 |         item = ZiroomItem()
48 |         item['modifyDate'] = int(time.time())
49 |         item['room_id'] = p.getID()
50 |         item['room_price'] = p.getPrice()
51 |         item['room_name'] = p.getName()
52 |         yield item


--------------------------------------------------------------------------------