├── README.md
├── blog_crawl
│   ├── README
│   ├── blog_crawl
│   │   ├── __init__.py
│   │   ├── items.py
│   │   ├── pipelines.py
│   │   ├── settings.py
│   │   └── spiders
│   │       ├── __init__.py
│   │       └── mindhacks_spider.py
│   └── scrapy.cfg
├── docs
│   ├── README
│   ├── scrapyd
│   └── webkit2png
├── newsspider
│   ├── .scrapy
│   │   └── scrapyd
│   │       ├── dbs
│   │       │   └── default.db
│   │       ├── items
│   │       │   └── default
│   │       │       ├── hackernews
│   │       │       │   ├── 325d2fa68a0e11e2adba7e97b6ad9650.jl
│   │       │       │   └── c58abb2c87d411e2adba7e97b6ad9650.jl
│   │       │       └── mindhacks
│   │       │           └── ff9b53c687d411e2adba7e97b6ad9650.jl
│   │       └── logs
│   │           └── default
│   │               ├── all
│   │               │   └── b29488fe87d411e2adba7e97b6ad9650.log
│   │               ├── hackernews
│   │               │   ├── 325d2fa68a0e11e2adba7e97b6ad9650.log
│   │               │   └── c58abb2c87d411e2adba7e97b6ad9650.log
│   │               ├── mindhacks
│   │               │   └── ff9b53c687d411e2adba7e97b6ad9650.log
│   │               └── somespider
│   │                   └── 28e7b04a8a0e11e2adba7e97b6ad9650.log
│   ├── README.md
│   ├── dbs
│   │   └── default.db
│   ├── newsspider
│   │   ├── __init__.py
│   │   ├── commands
│   │   │   ├── __init__.py
│   │   │   └── allcrawl.py
│   │   ├── items.py
│   │   ├── middlewares.py
│   │   ├── pipelines.py
│   │   ├── settings.py
│   │   └── spiders
│   │       ├── __init__.py
│   │       ├── cnblogs.py
│   │       ├── dbanotes.py
│   │       ├── hackernews.py
│   │       ├── jobbole.py
│   │       ├── mindhacks.py
│   │       └── reddit.py
│   ├── proxies.txt
│   ├── query_db.py
│   ├── scrapy.cfg
│   ├── webkit2png
│   ├── webkit2png.log
│   └── webkit2png.py
├── proxycrawler
│   ├── proxies.txt
│   ├── proxycrawler
│   │   ├── __init__.py
│   │   ├── items.py
│   │   ├── pipelines.py
│   │   ├── settings.py
│   │   └── spiders
│   │       ├── __init__.py
│   │       └── proxy.py
│   └── scrapy.cfg
├── scrapy-ws.py
├── server
│   ├── code.py
│   ├── config.py
│   ├── db.py
│   ├── sql
│   │   └── tables.sql
│   ├── static
│   │   └── favicon.ico
│   ├── templates
│   │   ├── base.html
│   │   ├── item.html
│   │   └── listing.html
│   ├── view.py
│   ├── webkit2png
│   └── webkit2png.py
├── tutorial
│   ├── README
│   ├── items.json
│   ├── scrapy.cfg
│   └── tutorial
│       ├── __init__.py
│       ├── items.py
│       ├── pipelines.py
│       ├── settings.py
│       └── spiders
│           ├── __init__.py
│           └── dmoz_spider.py
└── web.py
    ├── README
    ├── simple-blog
    │   ├── blog.py
    │   ├── model.py
    │   ├── schema.sql
    │   └── templates
    │       ├── base.html
    │       ├── edit.html
    │       ├── index.html
    │       ├── new.html
    │       └── view.html
    ├── simple-todo
    │   ├── model.py
    │   ├── schema.sql
    │   ├── templates
    │   │   ├── base.html
    │   │   └── index.html
    │   └── todo.py
    └── simple-wiki
        ├── model.py
        ├── schema.sql
        ├── templates
        │   ├── base.html
        │   ├── edit.html
        │   ├── index.html
        │   ├── new.html
        │   └── view.html
        └── wiki.py

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
Scrapy-Examples
===============
Some examples for Scrapy.

Install:

    pip install twisted scrapy
    pip install BeautifulSoup

    cp newsspider/webkit2png* /usr/bin

--------------------------------------------------------------------------------
/blog_crawl/README:
--------------------------------------------------------------------------------

Crawl posts from mindhacks.cn

This example demonstrates:

    Crawling post URLs from mindhacks.cn
    Saving post contents to sqlite3
    Following the next-page link

Reference:
http://blog.pluskid.org/?p=366

--------------------------------------------------------------------------------
/blog_crawl/blog_crawl/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/feiskyer/scrapy-examples/3f72fe87d9272859f7bce31d0394ca078a6ab4a3/blog_crawl/blog_crawl/__init__.py

--------------------------------------------------------------------------------
/blog_crawl/blog_crawl/items.py:
--------------------------------------------------------------------------------
# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/topics/items.html

from scrapy.item import Item, Field

class BlogCrawlItem(Item):
    # define the fields for your item here
    url = Field()
    raw = Field()

    def __str__(self):
        return self['url']

--------------------------------------------------------------------------------
/blog_crawl/blog_crawl/pipelines.py:
--------------------------------------------------------------------------------
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/topics/item-pipeline.html

import sqlite3
from os import path

from scrapy import signals
from scrapy.xlib.pydispatch import dispatcher

class BlogCrawlPipeline(object):
    def process_item(self, item, spider):
        return item


class SQLiteStorePipeline(object):
    filename = 'data.sqlite'

    def __init__(self):
        self.conn = None
        # open the database when the engine starts, close it when it stops
        dispatcher.connect(self.initialize, signals.engine_started)
        dispatcher.connect(self.finalize, signals.engine_stopped)

    def process_item(self, item, domain):
        try:
            self.conn.execute('insert into blog values(?,?,?)',
                              (item['url'], item['raw'], unicode(domain)))
        except sqlite3.Error:
            print 'Failed to insert item: ' + item['url']
        return item

    def initialize(self):
        if path.exists(self.filename):
            self.conn = sqlite3.connect(self.filename)
        else:
            self.conn = self.create_table(self.filename)

    def finalize(self):
        if self.conn is not None:
            self.conn.commit()
            self.conn.close()
            self.conn = None

    def create_table(self, filename):
        conn = sqlite3.connect(filename)
        conn.execute("""create table blog
                     (url text primary key, raw text, domain text)""")
        conn.commit()
        return conn
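Once a crawl has run, the stored rows can be inspected straight from the
data.sqlite file the pipeline creates. A minimal sketch (this helper script is
not part of the project):

    import sqlite3

    # The blog table created above has the schema (url, raw, domain).
    conn = sqlite3.connect('data.sqlite')
    for url, domain in conn.execute('select url, domain from blog'):
        print url, domain
    print conn.execute('select count(*) from blog').fetchone()[0], 'posts stored'
    conn.close()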
--------------------------------------------------------------------------------
/blog_crawl/blog_crawl/settings.py:
--------------------------------------------------------------------------------
# Scrapy settings for blog_crawl project
#
# For simplicity, this file contains only the most important settings by
# default. All the other settings are documented here:
#
#     http://doc.scrapy.org/topics/settings.html
#

BOT_NAME = 'blog_crawl'

SPIDER_MODULES = ['blog_crawl.spiders']
NEWSPIDER_MODULE = 'blog_crawl.spiders'

ITEM_PIPELINES = ['blog_crawl.pipelines.SQLiteStorePipeline']

# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'blog_crawl (+http://www.yourdomain.com)'

--------------------------------------------------------------------------------
/blog_crawl/blog_crawl/spiders/__init__.py:
--------------------------------------------------------------------------------
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.

--------------------------------------------------------------------------------
/blog_crawl/blog_crawl/spiders/mindhacks_spider.py:
--------------------------------------------------------------------------------
from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector
from blog_crawl.items import BlogCrawlItem

class MindhacksSpider(BaseSpider):
    name = "mindhacks.cn"
    allowed_domains = ["mindhacks.cn"]
    start_urls = ["http://mindhacks.cn/"]

    def parse(self, response):
        hxs = HtmlXPathSelector(response)
        sites = hxs.select('//h3/a/@href')
        items = []
        # save all urls
        #for site in sites:
        #    item = BlogCrawlItem()
        #    item['url'] = site.extract()
        #    items.append(item)

        # request each post, to be parsed by parse_post
        items.extend([self.make_requests_from_url(url.extract())
                          .replace(callback=self.parse_post)
                      for url in sites])

        # follow the next-page link (the raquo arrow in wp-pagenavi)
        page_links = hxs.select('//div[@class="wp-pagenavi"]/a[not(@title)]')
        for link in page_links:
            if link.select('text()').extract()[0] == u'\xbb':
                url = link.select('@href').extract()[0]
                items.append(self.make_requests_from_url(url))
        return items

    def parse_post(self, response):
        item = BlogCrawlItem()
        item['url'] = unicode(response.url)
        item['raw'] = response.body_as_unicode()
        return [item]

--------------------------------------------------------------------------------
/blog_crawl/scrapy.cfg:
--------------------------------------------------------------------------------
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# http://doc.scrapy.org/topics/scrapyd.html

[settings]
default = blog_crawl.settings

[deploy]
#url = http://localhost:6800/
project = blog_crawl

--------------------------------------------------------------------------------
/docs/README:
--------------------------------------------------------------------------------
Create a project:

    scrapy startproject tutorial

Generate a spider:

    scrapy genspider mydomain mydomain.com
    scrapy genspider -t crawl|xmlfeed|csvfeed mydomain mydomain.com

Start crawling:

    scrapy crawl dmoz

Scrapy shell:

    scrapy shell http://www.dmoz.org/Computers/Programming/Languages/Python/Books/

Export json:

    scrapy crawl dmoz -o items.json -t json
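For reference, the stock template behind `scrapy genspider mydomain
mydomain.com` produces a skeleton along these lines (a rough sketch of the
Scrapy 0.17-era output, not verbatim):

    from scrapy.spider import BaseSpider

    class MydomainSpider(BaseSpider):
        name = "mydomain"
        allowed_domains = ["mydomain.com"]
        start_urls = ["http://www.mydomain.com/"]

        def parse(self, response):
            # fill in the extraction logic here
            pass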
--------------------------------------------------------------------------------
/docs/scrapyd:
--------------------------------------------------------------------------------
Scrapy Service (scrapyd)

Run as a server:

    # scrapy server
    2013-03-11 13:36:27+0800 [-] Log opened.
    2013-03-11 13:36:27+0800 [-] Scrapyd web console available at http://0.0.0.0:6800/
    2013-03-11 13:36:27+0800 [Launcher] Scrapyd started: max_proc=8, runner='scrapyd.runner'
    2013-03-11 13:36:27+0800 [-] Site starting on 6800
    2013-03-11 13:36:27+0800 [-] Starting factory

Start a spider and query the service:

    # curl http://localhost:6800/listprojects.json
    {"status": "ok", "projects": ["default"]}

    # curl http://localhost:6800/schedule.json -d project=default -d spider=hackernews
    {"status": "ok", "jobid": "325d2fa68a0e11e2adba7e97b6ad9650"}

    # curl http://localhost:6800/cancel.json -d project=default -d job=325d2fa68a0e11e2adba7e97b6ad9650
    {"status": "ok", "prevstate": null}

    # curl http://localhost:6800/listversions.json?project=default
    {"status": "ok", "versions": []}

    # curl http://localhost:6800/listspiders.json?project=default
    {"status": "ok", "spiders": ["mindhacks", "hackernews"]}

    # curl http://localhost:6800/listjobs.json?project=default
    {"status": "ok", "running": [], "finished": [{"start_time": "2013-03-11 13:40:22.537393", "end_time": "2013-03-11 13:40:23.159254", "id": "28e7b04a8a0e11e2adba7e97b6ad9650", "spider": "somespider"}, {"start_time": "2013-03-11 13:40:37.538718", "end_time": "2013-03-11 13:40:58.144857", "id": "325d2fa68a0e11e2adba7e97b6ad9650", "spider": "hackernews"}], "pending": []}

--------------------------------------------------------------------------------
/docs/webkit2png:
--------------------------------------------------------------------------------
# Reference
https://github.com/AdamN/python-webkit2png/

# Install xvfb to provide a virtual X environment
apt-get install xvfb

# Install Chinese fonts
apt-get install xfonts-wqy

# Configure the fonts
fontconfig-voodoo -f -s zh_CN

# Get webkit2png
wget https://raw.github.com/adamn/python-webkit2png/master/scripts/webkit2png
wget https://raw.github.com/adamn/python-webkit2png/master/webkit2png/webkit2png.py
chmod +x webkit2png webkit2png.py

# Usage
./webkit2png -x 1366 768 http://www.sina.com.cn -o test2.png
./webkit2png -x 1366 768 -F javascript http://www.sina.com.cn -o test2.png

--------------------------------------------------------------------------------
/newsspider/.scrapy/scrapyd/dbs/default.db:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/feiskyer/scrapy-examples/3f72fe87d9272859f7bce31d0394ca078a6ab4a3/newsspider/.scrapy/scrapyd/dbs/default.db

--------------------------------------------------------------------------------
/newsspider/.scrapy/scrapyd/items/default/mindhacks/ff9b53c687d411e2adba7e97b6ad9650.jl:
--------------------------------------------------------------------------------
1 | {"url": "http://mindhacks.cn/2011/11/04/how-to-interview-a-person-for-two-years/", "created": "2011-11-04T01:18:00+00:00", "site": "mindhacks", "title": "\u600e\u6837\u82b1\u4e24\u5e74\u65f6\u95f4\u53bb\u9762\u8bd5\u4e00\u4e2a\u4eba"}
2 | {"url": "http://mindhacks.cn/2012/06/04/escape-from-your-shawshank-part5-the-invisible-cage/", "created": "2012-06-04T23:10:49+00:00", "site": "mindhacks", "title": "\u9003\u51fa\u4f60\u7684\u8096\u7533\u514b\uff08\u4e94\uff09\uff1a\u770b\u4e0d\u89c1\u7684\u7262\u7b3c\uff08\u4e0a\uff09"}
3 | {"url": "http://mindhacks.cn/2012/08/27/modern-cpp-practices/", "created": 
"2012-08-27T14:09:47+00:00", "site": "mindhacks", "title": "C++11\uff08\u53ca\u73b0\u4ee3C++\u98ce\u683c\uff09\u548c\u5feb\u901f\u8fed\u4ee3\u5f0f\u5f00\u53d1"} 4 | {"url": "http://mindhacks.cn/2009/07/06/why-you-should-do-it-yourself/", "created": "2009-07-06T21:48:07+00:00", "site": "mindhacks", "title": "[BetterExplained]\u9047\u5230\u95ee\u9898\u4e3a\u4ec0\u4e48\u5e94\u8be5\u81ea\u5df1\u52a8\u624b"} 5 | {"url": "http://mindhacks.cn/2009/10/05/im-a-tiny-bird-book-review/", "created": "2009-10-05T01:18:30+00:00", "site": "mindhacks", "title": "\u4e0d\u662f\u4e66\u8bc4 \uff1a\u300a\u6211\u662f\u4e00\u53eaIT\u5c0f\u5c0f\u9e1f\u300b"} 6 | {"url": "http://mindhacks.cn/2009/12/20/dark-time/", "created": "2009-12-20T13:39:00+00:00", "site": "mindhacks", "title": "\u6697\u65f6\u95f4"} 7 | {"url": "http://mindhacks.cn/2010/03/18/escape-from-your-shawshank-part3/", "created": "2010-03-18T00:28:14+00:00", "site": "mindhacks", "title": "\u9003\u51fa\u4f60\u7684\u8096\u7533\u514b\uff08\u4e09\uff09\uff1a\u9047\u89c120\u4e07\u5e74\u524d\u7684\u81ea\u5df1"} 8 | {"url": "http://mindhacks.cn/2010/11/14/the-importance-of-knowing-why-part2/", "created": "2010-11-14T17:41:00+00:00", "site": "mindhacks", "title": "\u77e5\u5176\u6240\u4ee5\u7136\uff08\u7eed\uff09"} 9 | {"url": "http://mindhacks.cn/2011/01/23/escape-from-your-shawshank-4/", "created": "2011-01-23T20:28:34+00:00", "site": "mindhacks", "title": "\u9003\u51fa\u4f60\u7684\u8096\u7533\u514b\uff08\u56db\uff09\uff1a\u7406\u667a\u4e0e\u60c5\u611f"} 10 | {"url": "http://mindhacks.cn/2011/07/10/the-importance-of-knowing-why-part3/", "created": "2011-07-10T00:24:32+00:00", "site": "mindhacks", "title": "\u77e5\u5176\u6240\u4ee5\u7136\uff08\u4e09\uff09\uff1a\u4e3a\u4ec0\u4e48\u7b97\u6cd5\u8fd9\u4e48\u96be\uff1f"} 11 | {"url": "http://mindhacks.cn/2011/11/04/how-to-interview-a-person-for-two-years/", "created": "2011-11-04T01:18:00+00:00", "site": "mindhacks", "title": "\u600e\u6837\u82b1\u4e24\u5e74\u65f6\u95f4\u53bb\u9762\u8bd5\u4e00\u4e2a\u4eba"} 12 | {"url": "http://mindhacks.cn/2012/06/04/escape-from-your-shawshank-part5-the-invisible-cage/", "created": "2012-06-04T23:10:49+00:00", "site": "mindhacks", "title": "\u9003\u51fa\u4f60\u7684\u8096\u7533\u514b\uff08\u4e94\uff09\uff1a\u770b\u4e0d\u89c1\u7684\u7262\u7b3c\uff08\u4e0a\uff09"} 13 | {"url": "http://mindhacks.cn/2012/08/27/modern-cpp-practices/", "created": "2012-08-27T14:09:47+00:00", "site": "mindhacks", "title": "C++11\uff08\u53ca\u73b0\u4ee3C++\u98ce\u683c\uff09\u548c\u5feb\u901f\u8fed\u4ee3\u5f0f\u5f00\u53d1"} 14 | {"url": "http://mindhacks.cn/2009/02/15/why-you-should-start-blogging-now/", "created": "2009-02-15T19:57:26+00:00", "site": "mindhacks", "title": "[BetterExplained]\u4e3a\u4ec0\u4e48\u4f60\u5e94\u8be5\uff08\u4ece\u73b0\u5728\u5f00\u59cb\u5c31\uff09\u5199\u535a\u5ba2"} 15 | {"url": "http://mindhacks.cn/2009/03/09/first-principles-of-programming/", "created": "2009-03-09T15:12:00+00:00", "site": "mindhacks", "title": "\u7f16\u7a0b\u7684\u9996\u8981\u539f\u5219(s)\u662f\u4ec0\u4e48\uff1f"} 16 | {"url": "http://mindhacks.cn/2009/03/15/preconception-explained/", "created": "2009-03-15T18:49:27+00:00", "site": "mindhacks", "title": "\u9003\u51fa\u4f60\u7684\u8096\u7533\u514b\uff08\u4e8c\uff09\uff1a\u4ec1\u8005\u89c1\u4ec1\u667a\u8005\u89c1\u667a\uff1f\u4ece\u89c6\u89c9\u9519\u89c9\u5230\u504f\u89c1"} 17 | {"url": "http://mindhacks.cn/2009/03/28/effective-learning-and-memorization/", "created": "2009-03-28T19:23:02+00:00", "site": "mindhacks", "title": 
"[BetterExplained]\u5982\u4f55\u6709\u6548\u5730\u8bb0\u5fc6\u4e0e\u5b66\u4e60"} 18 | {"url": "http://mindhacks.cn/2009/05/17/seven-years-in-nju/", "created": "2009-05-17T23:57:30+00:00", "site": "mindhacks", "title": "\u6211\u5728\u5357\u5927\u7684\u4e03\u5e74"} 19 | {"url": "http://mindhacks.cn/2009/07/06/why-you-should-do-it-yourself/", "created": "2009-07-06T21:48:07+00:00", "site": "mindhacks", "title": "[BetterExplained]\u9047\u5230\u95ee\u9898\u4e3a\u4ec0\u4e48\u5e94\u8be5\u81ea\u5df1\u52a8\u624b"} 20 | {"url": "http://mindhacks.cn/2009/10/05/im-a-tiny-bird-book-review/", "created": "2009-10-05T01:18:30+00:00", "site": "mindhacks", "title": "\u4e0d\u662f\u4e66\u8bc4 \uff1a\u300a\u6211\u662f\u4e00\u53eaIT\u5c0f\u5c0f\u9e1f\u300b"} 21 | {"url": "http://mindhacks.cn/2009/12/20/dark-time/", "created": "2009-12-20T13:39:00+00:00", "site": "mindhacks", "title": "\u6697\u65f6\u95f4"} 22 | {"url": "http://mindhacks.cn/2010/03/18/escape-from-your-shawshank-part3/", "created": "2010-03-18T00:28:14+00:00", "site": "mindhacks", "title": "\u9003\u51fa\u4f60\u7684\u8096\u7533\u514b\uff08\u4e09\uff09\uff1a\u9047\u89c120\u4e07\u5e74\u524d\u7684\u81ea\u5df1"} 23 | {"url": "http://mindhacks.cn/2010/11/14/the-importance-of-knowing-why-part2/", "created": "2010-11-14T17:41:00+00:00", "site": "mindhacks", "title": "\u77e5\u5176\u6240\u4ee5\u7136\uff08\u7eed\uff09"} 24 | {"url": "http://mindhacks.cn/2011/01/23/escape-from-your-shawshank-4/", "created": "2011-01-23T20:28:34+00:00", "site": "mindhacks", "title": "\u9003\u51fa\u4f60\u7684\u8096\u7533\u514b\uff08\u56db\uff09\uff1a\u7406\u667a\u4e0e\u60c5\u611f"} 25 | {"url": "http://mindhacks.cn/2011/07/10/the-importance-of-knowing-why-part3/", "created": "2011-07-10T00:24:32+00:00", "site": "mindhacks", "title": "\u77e5\u5176\u6240\u4ee5\u7136\uff08\u4e09\uff09\uff1a\u4e3a\u4ec0\u4e48\u7b97\u6cd5\u8fd9\u4e48\u96be\uff1f"} 26 | {"url": "http://mindhacks.cn/2009/01/16/hammers-and-nails/", "created": "2009-01-16T21:25:00+00:00", "site": "mindhacks", "title": "\u9524\u5b50\u548c\u9489\u5b50"} 27 | {"url": "http://mindhacks.cn/2009/01/18/escape-from-your-shawshank-part1/", "created": "2009-01-18T21:32:00+00:00", "site": "mindhacks", "title": "\u9003\u51fa\u4f60\u7684\u8096\u7533\u514b\uff08\u4e00\uff09\uff1a\u4e3a\u4ec0\u4e48\u4e00\u5b9a\u8981\u4eb2\u8eab\u7ecf\u5386\u4e86\u4e4b\u540e\u624d\u80fd\u660e\u767d\uff1f"} 28 | {"url": "http://mindhacks.cn/2009/02/07/independence-day/", "created": "2009-02-07T12:57:34+00:00", "site": "mindhacks", "title": "\u72ec\u7acb\u65e5"} 29 | {"url": "http://mindhacks.cn/2009/02/07/better-explained-conflicts-in-intimate-relationship/", "created": "2009-02-07T20:35:17+00:00", "site": "mindhacks", "title": "[BetterExplained]\u4eb2\u5bc6\u5173\u7cfb\u4e2d\u7684\u51b2\u7a81\u89e3\u51b3"} 30 | {"url": "http://mindhacks.cn/2009/02/09/writing-is-better-thinking/", "created": "2009-02-09T22:24:00+00:00", "site": "mindhacks", "title": "[BetterExplained]\u4e66\u5199\u662f\u4e3a\u4e86\u66f4\u597d\u7684\u601d\u8003"} 31 | {"url": "http://mindhacks.cn/2009/02/15/why-you-should-start-blogging-now/", "created": "2009-02-15T19:57:26+00:00", "site": "mindhacks", "title": "[BetterExplained]\u4e3a\u4ec0\u4e48\u4f60\u5e94\u8be5\uff08\u4ece\u73b0\u5728\u5f00\u59cb\u5c31\uff09\u5199\u535a\u5ba2"} 32 | {"url": "http://mindhacks.cn/2009/03/09/first-principles-of-programming/", "created": "2009-03-09T15:12:00+00:00", "site": "mindhacks", "title": "\u7f16\u7a0b\u7684\u9996\u8981\u539f\u5219(s)\u662f\u4ec0\u4e48\uff1f"} 33 | {"url": 
"http://mindhacks.cn/2009/03/15/preconception-explained/", "created": "2009-03-15T18:49:27+00:00", "site": "mindhacks", "title": "\u9003\u51fa\u4f60\u7684\u8096\u7533\u514b\uff08\u4e8c\uff09\uff1a\u4ec1\u8005\u89c1\u4ec1\u667a\u8005\u89c1\u667a\uff1f\u4ece\u89c6\u89c9\u9519\u89c9\u5230\u504f\u89c1"} 34 | {"url": "http://mindhacks.cn/2009/03/28/effective-learning-and-memorization/", "created": "2009-03-28T19:23:02+00:00", "site": "mindhacks", "title": "[BetterExplained]\u5982\u4f55\u6709\u6548\u5730\u8bb0\u5fc6\u4e0e\u5b66\u4e60"} 35 | {"url": "http://mindhacks.cn/2009/05/17/seven-years-in-nju/", "created": "2009-05-17T23:57:30+00:00", "site": "mindhacks", "title": "\u6211\u5728\u5357\u5927\u7684\u4e03\u5e74"} 36 | {"url": "http://mindhacks.cn/2008/07/07/the-importance-of-knowing-why/", "created": "2008-07-07T21:05:00+00:00", "site": "mindhacks", "title": "\u77e5\u5176\u6240\u4ee5\u7136\uff08\u4ee5\u7b97\u6cd5\u5b66\u4e60\u4e3a\u4f8b\uff09"} 37 | {"url": "http://mindhacks.cn/2008/07/08/learning-habits-part1/", "created": "2008-07-08T21:13:00+00:00", "site": "mindhacks", "title": "\u4e00\u76f4\u4ee5\u6765\u4f34\u968f\u6211\u7684\u4e00\u4e9b\u5b66\u4e60\u4e60\u60ef(\u4e00)\uff1a\u5b66\u4e60\u4e0e\u601d\u8003"} 38 | {"url": "http://mindhacks.cn/2008/07/20/learning-habits-part2/", "created": "2008-07-20T21:16:00+00:00", "site": "mindhacks", "title": "\u4e00\u76f4\u4ee5\u6765\u4f34\u968f\u6211\u7684\u4e00\u4e9b\u5b66\u4e60\u4e60\u60ef(\u4e8c)\uff1a\u65f6\u95f4\u7ba1\u7406"} 39 | {"url": "http://mindhacks.cn/2008/09/11/machine-learning-and-ai-resources/", "created": "2008-09-11T19:29:00+00:00", "site": "mindhacks", "title": "\u673a\u5668\u5b66\u4e60\u4e0e\u4eba\u5de5\u667a\u80fd\u5b66\u4e60\u8d44\u6e90\u5bfc\u5f15"} 40 | {"url": "http://mindhacks.cn/2008/09/17/learning-habits-part3/", "created": "2008-09-17T21:18:00+00:00", "site": "mindhacks", "title": "\u4e00\u76f4\u4ee5\u6765\u4f34\u968f\u6211\u7684\u4e00\u4e9b\u5b66\u4e60\u4e60\u60ef(\u4e09)\uff1a\u9605\u8bfb\u65b9\u6cd5"} 41 | {"url": "http://mindhacks.cn/2008/09/21/the-magical-bayesian-method/", "created": "2008-09-21T19:34:00+00:00", "site": "mindhacks", "title": "\u6570\u5b66\u4e4b\u7f8e\u756a\u5916\u7bc7\uff1a\u5e73\u51e1\u800c\u53c8\u795e\u5947\u7684\u8d1d\u53f6\u65af\u65b9\u6cd5"} 42 | {"url": "http://mindhacks.cn/2008/10/29/methodology-for-programmers/", "created": "2008-10-29T21:09:00+00:00", "site": "mindhacks", "title": "\u65b9\u6cd5\u8bba\u3001\u65b9\u6cd5\u8bba\u2014\u2014\u7a0b\u5e8f\u5458\u7684\u963f\u5580\u7409\u65af\u4e4b\u8e35"} 43 | {"url": "http://mindhacks.cn/2008/12/05/learning-habits-part4/", "created": "2008-12-05T21:21:00+00:00", "site": "mindhacks", "title": "\u4e00\u76f4\u4ee5\u6765\u4f34\u968f\u6211\u7684\u4e00\u4e9b\u5b66\u4e60\u4e60\u60ef(\u56db)\uff1a\u77e5\u8bc6\u7ed3\u6784"} 44 | {"url": "http://mindhacks.cn/2008/12/18/how-to-think-straight/", "created": "2008-12-18T20:10:00+00:00", "site": "mindhacks", "title": "\u5982\u4f55\u6e05\u6670\u5730\u601d\u8003\uff08\u8fd1\u4e00\u5e74\u6765\u4e1a\u4f59\u9605\u8bfb\u7684\u5173\u4e8e\u601d\u7ef4\u65b9\u9762\u7684\u77e5\u8bc6\u7ed3\u6784\u6574\u7406\uff09"} 45 | {"url": "http://mindhacks.cn/2009/01/14/make-yourself-irreplacable/", "created": "2009-01-14T21:28:00+00:00", "site": "mindhacks", "title": "\u4ec0\u4e48\u624d\u662f\u4f60\u7684\u4e0d\u53ef\u66ff\u4ee3\u6027\u548c\u6838\u5fc3\u7ade\u4e89\u529b"} 46 | {"url": "http://mindhacks.cn/2006/10/15/cantor-godel-turing-an-eternal-golden-diagonal/", "created": "2006-10-15T19:16:00+00:00", "site": "mindhacks", "title": 
"\u5eb7\u6258\u5c14\u3001\u54e5\u5fb7\u5c14\u3001\u56fe\u7075\u2014\u2014\u6c38\u6052\u7684\u91d1\u8272\u5bf9\u89d2\u7ebf(rev#2)"} 47 | {"url": "http://mindhacks.cn/2007/05/24/learn-to-focus/", "created": "2007-05-24T20:30:00+00:00", "site": "mindhacks", "title": "\u5b66\u4e60\u5bc6\u5ea6\u4e0e\u4e13\u6ce8\u529b"} 48 | {"url": "http://mindhacks.cn/2007/12/02/probability-theory-in-evolution/", "created": "2007-12-02T18:55:00+00:00", "site": "mindhacks", "title": "\u6570\u5b66\u4e4b\u7f8e\u756a\u5916\u7bc7\uff1a\u8fdb\u5316\u8bba\u4e2d\u7684\u6982\u7387\u8bba"} 49 | {"url": "http://mindhacks.cn/2008/03/03/failing-to-see-the-big-picture/", "created": "2008-03-03T15:42:00+00:00", "site": "mindhacks", "title": "Failing To See the Big Picture \u2013 Mistakes we make when learning programming"} 50 | {"url": "http://mindhacks.cn/2008/04/08/reading-method/", "created": "2008-04-08T21:00:00+00:00", "site": "mindhacks", "title": "\u9605\u8bfb\u4e0e\u601d\u8003"} 51 | {"url": "http://mindhacks.cn/2008/04/18/learning-from-polya/", "created": "2008-04-18T21:37:00+00:00", "site": "mindhacks", "title": "\u8ddf\u6ce2\u5229\u4e9a\u5b66\u89e3\u9898(rev#3)"} 52 | {"url": "http://mindhacks.cn/2008/06/05/how-memory-works/", "created": "2008-06-05T21:40:00+00:00", "site": "mindhacks", "title": "\u5b66\u4e60\u4e0e\u8bb0\u5fc6"} 53 | {"url": "http://mindhacks.cn/2008/06/13/why-is-quicksort-so-quick/", "created": "2008-06-13T19:53:00+00:00", "site": "mindhacks", "title": "\u6570\u5b66\u4e4b\u7f8e\u756a\u5916\u7bc7\uff1a\u5feb\u6392\u4e3a\u4ec0\u4e48\u90a3\u6837\u5feb"} 54 | -------------------------------------------------------------------------------- /newsspider/.scrapy/scrapyd/logs/default/all/b29488fe87d411e2adba7e97b6ad9650.log: -------------------------------------------------------------------------------- 1 | 2013-03-08 17:43:58+0800 [scrapy] INFO: Scrapy 0.17.0 started (bot: Baiduspider) 2 | 2013-03-08 17:43:58+0800 [scrapy] DEBUG: Enabled extensions: FeedExporter, LogStats, TelnetConsole, CloseSpider, WebService, CoreStats, SpiderState 3 | 2013-03-08 17:43:58+0800 [scrapy] DEBUG: Enabled downloader middlewares: HttpAuthMiddleware, DownloadTimeoutMiddleware, UserAgentMiddleware, RetryMiddleware, DefaultHeadersMiddleware, MetaRefreshMiddleware, HttpCompressionMiddleware, RedirectMiddleware, CookiesMiddleware, ChunkedTransferMiddleware, DownloaderStats 4 | 2013-03-08 17:43:58+0800 [scrapy] DEBUG: Enabled spider middlewares: HttpErrorMiddleware, OffsiteMiddleware, RefererMiddleware, UrlLengthMiddleware, DepthMiddleware 5 | 2013-03-08 17:43:58+0800 [scrapy] DEBUG: Enabled item pipelines: NewsspiderPipeline 6 | -------------------------------------------------------------------------------- /newsspider/.scrapy/scrapyd/logs/default/mindhacks/ff9b53c687d411e2adba7e97b6ad9650.log: -------------------------------------------------------------------------------- 1 | 2013-03-08 17:46:08+0800 [scrapy] INFO: Scrapy 0.17.0 started (bot: Baiduspider) 2 | 2013-03-08 17:46:08+0800 [scrapy] DEBUG: Enabled extensions: FeedExporter, LogStats, TelnetConsole, CloseSpider, WebService, CoreStats, SpiderState 3 | 2013-03-08 17:46:08+0800 [scrapy] DEBUG: Enabled downloader middlewares: HttpAuthMiddleware, DownloadTimeoutMiddleware, UserAgentMiddleware, RetryMiddleware, DefaultHeadersMiddleware, MetaRefreshMiddleware, HttpCompressionMiddleware, RedirectMiddleware, CookiesMiddleware, ChunkedTransferMiddleware, DownloaderStats 4 | 2013-03-08 17:46:08+0800 [scrapy] DEBUG: Enabled spider middlewares: HttpErrorMiddleware, 
OffsiteMiddleware, RefererMiddleware, UrlLengthMiddleware, DepthMiddleware 5 | 2013-03-08 17:46:08+0800 [scrapy] DEBUG: Enabled item pipelines: NewsspiderPipeline 6 | 2013-03-08 17:46:08+0800 [mindhacks] INFO: Spider opened 7 | 2013-03-08 17:46:08+0800 [mindhacks] INFO: Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min) 8 | 2013-03-08 17:46:08+0800 [scrapy] DEBUG: Telnet console listening on 0.0.0.0:6023 9 | 2013-03-08 17:46:08+0800 [scrapy] DEBUG: Web service listening on 0.0.0.0:6080 10 | 2013-03-08 17:46:08+0800 [mindhacks] DEBUG: Redirecting (301) to from 11 | 2013-03-08 17:46:08+0800 [mindhacks] DEBUG: Crawled (200) (referer: None) 12 | 2013-03-08 17:46:09+0800 [mindhacks] DEBUG: Crawled (200) (referer: http://mindhacks.cn/) 13 | 2013-03-08 17:46:09+0800 [mindhacks] DEBUG: Scraped from <200 http://mindhacks.cn/2011/11/04/how-to-interview-a-person-for-two-years/> 14 | {'created': u'2011-11-04T01:18:00+00:00', 15 | 'site': 'mindhacks', 16 | 'title': u'\u600e\u6837\u82b1\u4e24\u5e74\u65f6\u95f4\u53bb\u9762\u8bd5\u4e00\u4e2a\u4eba', 17 | 'url': u'http://mindhacks.cn/2011/11/04/how-to-interview-a-person-for-two-years/'} 18 | 2013-03-08 17:46:09+0800 [mindhacks] DEBUG: Crawled (200) (referer: http://mindhacks.cn/) 19 | 2013-03-08 17:46:09+0800 [mindhacks] DEBUG: Scraped from <200 http://mindhacks.cn/2012/06/04/escape-from-your-shawshank-part5-the-invisible-cage/> 20 | {'created': u'2012-06-04T23:10:49+00:00', 21 | 'site': 'mindhacks', 22 | 'title': u'\u9003\u51fa\u4f60\u7684\u8096\u7533\u514b\uff08\u4e94\uff09\uff1a\u770b\u4e0d\u89c1\u7684\u7262\u7b3c\uff08\u4e0a\uff09', 23 | 'url': u'http://mindhacks.cn/2012/06/04/escape-from-your-shawshank-part5-the-invisible-cage/'} 24 | 2013-03-08 17:46:09+0800 [mindhacks] DEBUG: Crawled (200) (referer: http://mindhacks.cn/) 25 | 2013-03-08 17:46:10+0800 [mindhacks] DEBUG: Scraped from <200 http://mindhacks.cn/2012/08/27/modern-cpp-practices/> 26 | {'created': u'2012-08-27T14:09:47+00:00', 27 | 'site': 'mindhacks', 28 | 'title': u'C++11\uff08\u53ca\u73b0\u4ee3C++\u98ce\u683c\uff09\u548c\u5feb\u901f\u8fed\u4ee3\u5f0f\u5f00\u53d1', 29 | 'url': u'http://mindhacks.cn/2012/08/27/modern-cpp-practices/'} 30 | 2013-03-08 17:46:10+0800 [mindhacks] DEBUG: Crawled (200) (referer: http://mindhacks.cn/) 31 | 2013-03-08 17:46:10+0800 [mindhacks] DEBUG: Scraped from <200 http://mindhacks.cn/2009/07/06/why-you-should-do-it-yourself/> 32 | {'created': u'2009-07-06T21:48:07+00:00', 33 | 'site': 'mindhacks', 34 | 'title': u'[BetterExplained]\u9047\u5230\u95ee\u9898\u4e3a\u4ec0\u4e48\u5e94\u8be5\u81ea\u5df1\u52a8\u624b', 35 | 'url': u'http://mindhacks.cn/2009/07/06/why-you-should-do-it-yourself/'} 36 | 2013-03-08 17:46:10+0800 [mindhacks] DEBUG: Crawled (200) (referer: http://mindhacks.cn/) 37 | 2013-03-08 17:46:10+0800 [mindhacks] DEBUG: Scraped from <200 http://mindhacks.cn/2009/10/05/im-a-tiny-bird-book-review/> 38 | {'created': u'2009-10-05T01:18:30+00:00', 39 | 'site': 'mindhacks', 40 | 'title': u'\u4e0d\u662f\u4e66\u8bc4 \uff1a\u300a\u6211\u662f\u4e00\u53eaIT\u5c0f\u5c0f\u9e1f\u300b', 41 | 'url': u'http://mindhacks.cn/2009/10/05/im-a-tiny-bird-book-review/'} 42 | 2013-03-08 17:46:10+0800 [mindhacks] DEBUG: Crawled (200) (referer: http://mindhacks.cn/) 43 | 2013-03-08 17:46:11+0800 [mindhacks] DEBUG: Scraped from <200 http://mindhacks.cn/2009/12/20/dark-time/> 44 | {'created': u'2009-12-20T13:39:00+00:00', 45 | 'site': 'mindhacks', 46 | 'title': u'\u6697\u65f6\u95f4', 47 | 'url': u'http://mindhacks.cn/2009/12/20/dark-time/'} 48 | 2013-03-08 
17:46:11+0800 [mindhacks] DEBUG: Crawled (200) (referer: http://mindhacks.cn/) 49 | 2013-03-08 17:46:11+0800 [mindhacks] DEBUG: Scraped from <200 http://mindhacks.cn/2010/03/18/escape-from-your-shawshank-part3/> 50 | {'created': u'2010-03-18T00:28:14+00:00', 51 | 'site': 'mindhacks', 52 | 'title': u'\u9003\u51fa\u4f60\u7684\u8096\u7533\u514b\uff08\u4e09\uff09\uff1a\u9047\u89c120\u4e07\u5e74\u524d\u7684\u81ea\u5df1', 53 | 'url': u'http://mindhacks.cn/2010/03/18/escape-from-your-shawshank-part3/'} 54 | 2013-03-08 17:46:11+0800 [mindhacks] DEBUG: Crawled (200) (referer: http://mindhacks.cn/) 55 | 2013-03-08 17:46:11+0800 [mindhacks] DEBUG: Scraped from <200 http://mindhacks.cn/2010/11/14/the-importance-of-knowing-why-part2/> 56 | {'created': u'2010-11-14T17:41:00+00:00', 57 | 'site': 'mindhacks', 58 | 'title': u'\u77e5\u5176\u6240\u4ee5\u7136\uff08\u7eed\uff09', 59 | 'url': u'http://mindhacks.cn/2010/11/14/the-importance-of-knowing-why-part2/'} 60 | 2013-03-08 17:46:12+0800 [mindhacks] DEBUG: Crawled (200) (referer: http://mindhacks.cn/) 61 | 2013-03-08 17:46:12+0800 [mindhacks] DEBUG: Scraped from <200 http://mindhacks.cn/2011/01/23/escape-from-your-shawshank-4/> 62 | {'created': u'2011-01-23T20:28:34+00:00', 63 | 'site': 'mindhacks', 64 | 'title': u'\u9003\u51fa\u4f60\u7684\u8096\u7533\u514b\uff08\u56db\uff09\uff1a\u7406\u667a\u4e0e\u60c5\u611f', 65 | 'url': u'http://mindhacks.cn/2011/01/23/escape-from-your-shawshank-4/'} 66 | 2013-03-08 17:46:12+0800 [mindhacks] DEBUG: Crawled (200) (referer: http://mindhacks.cn/) 67 | 2013-03-08 17:46:12+0800 [mindhacks] DEBUG: Scraped from <200 http://mindhacks.cn/2011/07/10/the-importance-of-knowing-why-part3/> 68 | {'created': u'2011-07-10T00:24:32+00:00', 69 | 'site': 'mindhacks', 70 | 'title': u'\u77e5\u5176\u6240\u4ee5\u7136\uff08\u4e09\uff09\uff1a\u4e3a\u4ec0\u4e48\u7b97\u6cd5\u8fd9\u4e48\u96be\uff1f', 71 | 'url': u'http://mindhacks.cn/2011/07/10/the-importance-of-knowing-why-part3/'} 72 | 2013-03-08 17:46:12+0800 [mindhacks] DEBUG: Crawled (200) (referer: http://mindhacks.cn/) 73 | 2013-03-08 17:46:12+0800 [mindhacks] DEBUG: Scraped from <200 http://mindhacks.cn/2011/11/04/how-to-interview-a-person-for-two-years/> 74 | {'created': u'2011-11-04T01:18:00+00:00', 75 | 'site': 'mindhacks', 76 | 'title': u'\u600e\u6837\u82b1\u4e24\u5e74\u65f6\u95f4\u53bb\u9762\u8bd5\u4e00\u4e2a\u4eba', 77 | 'url': u'http://mindhacks.cn/2011/11/04/how-to-interview-a-person-for-two-years/'} 78 | 2013-03-08 17:46:12+0800 [mindhacks] DEBUG: Crawled (200) (referer: http://mindhacks.cn/) 79 | 2013-03-08 17:46:12+0800 [mindhacks] DEBUG: Scraped from <200 http://mindhacks.cn/2012/06/04/escape-from-your-shawshank-part5-the-invisible-cage/> 80 | {'created': u'2012-06-04T23:10:49+00:00', 81 | 'site': 'mindhacks', 82 | 'title': u'\u9003\u51fa\u4f60\u7684\u8096\u7533\u514b\uff08\u4e94\uff09\uff1a\u770b\u4e0d\u89c1\u7684\u7262\u7b3c\uff08\u4e0a\uff09', 83 | 'url': u'http://mindhacks.cn/2012/06/04/escape-from-your-shawshank-part5-the-invisible-cage/'} 84 | 2013-03-08 17:46:13+0800 [mindhacks] DEBUG: Crawled (200) (referer: http://mindhacks.cn/) 85 | 2013-03-08 17:46:13+0800 [mindhacks] DEBUG: Scraped from <200 http://mindhacks.cn/2012/08/27/modern-cpp-practices/> 86 | {'created': u'2012-08-27T14:09:47+00:00', 87 | 'site': 'mindhacks', 88 | 'title': u'C++11\uff08\u53ca\u73b0\u4ee3C++\u98ce\u683c\uff09\u548c\u5feb\u901f\u8fed\u4ee3\u5f0f\u5f00\u53d1', 89 | 'url': u'http://mindhacks.cn/2012/08/27/modern-cpp-practices/'} 90 | 2013-03-08 17:46:13+0800 [mindhacks] DEBUG: Crawled (200) 
(referer: http://mindhacks.cn/) 91 | 2013-03-08 17:46:14+0800 [mindhacks] DEBUG: Crawled (200) (referer: http://mindhacks.cn/) 92 | 2013-03-08 17:46:14+0800 [mindhacks] DEBUG: Scraped from <200 http://mindhacks.cn/2009/02/15/why-you-should-start-blogging-now/> 93 | {'created': u'2009-02-15T19:57:26+00:00', 94 | 'site': 'mindhacks', 95 | 'title': u'[BetterExplained]\u4e3a\u4ec0\u4e48\u4f60\u5e94\u8be5\uff08\u4ece\u73b0\u5728\u5f00\u59cb\u5c31\uff09\u5199\u535a\u5ba2', 96 | 'url': u'http://mindhacks.cn/2009/02/15/why-you-should-start-blogging-now/'} 97 | 2013-03-08 17:46:14+0800 [mindhacks] DEBUG: Crawled (200) (referer: http://mindhacks.cn/) 98 | 2013-03-08 17:46:14+0800 [mindhacks] DEBUG: Scraped from <200 http://mindhacks.cn/2009/03/09/first-principles-of-programming/> 99 | {'created': u'2009-03-09T15:12:00+00:00', 100 | 'site': 'mindhacks', 101 | 'title': u'\u7f16\u7a0b\u7684\u9996\u8981\u539f\u5219(s)\u662f\u4ec0\u4e48\uff1f', 102 | 'url': u'http://mindhacks.cn/2009/03/09/first-principles-of-programming/'} 103 | 2013-03-08 17:46:14+0800 [mindhacks] DEBUG: Crawled (200) (referer: http://mindhacks.cn/) 104 | 2013-03-08 17:46:14+0800 [mindhacks] DEBUG: Scraped from <200 http://mindhacks.cn/2009/03/15/preconception-explained/> 105 | {'created': u'2009-03-15T18:49:27+00:00', 106 | 'site': 'mindhacks', 107 | 'title': u'\u9003\u51fa\u4f60\u7684\u8096\u7533\u514b\uff08\u4e8c\uff09\uff1a\u4ec1\u8005\u89c1\u4ec1\u667a\u8005\u89c1\u667a\uff1f\u4ece\u89c6\u89c9\u9519\u89c9\u5230\u504f\u89c1', 108 | 'url': u'http://mindhacks.cn/2009/03/15/preconception-explained/'} 109 | 2013-03-08 17:46:14+0800 [mindhacks] DEBUG: Crawled (200) (referer: http://mindhacks.cn/) 110 | 2013-03-08 17:46:14+0800 [mindhacks] DEBUG: Scraped from <200 http://mindhacks.cn/2009/03/28/effective-learning-and-memorization/> 111 | {'created': u'2009-03-28T19:23:02+00:00', 112 | 'site': 'mindhacks', 113 | 'title': u'[BetterExplained]\u5982\u4f55\u6709\u6548\u5730\u8bb0\u5fc6\u4e0e\u5b66\u4e60', 114 | 'url': u'http://mindhacks.cn/2009/03/28/effective-learning-and-memorization/'} 115 | 2013-03-08 17:46:15+0800 [mindhacks] DEBUG: Crawled (200) (referer: http://mindhacks.cn/) 116 | 2013-03-08 17:46:15+0800 [mindhacks] DEBUG: Scraped from <200 http://mindhacks.cn/2009/05/17/seven-years-in-nju/> 117 | {'created': u'2009-05-17T23:57:30+00:00', 118 | 'site': 'mindhacks', 119 | 'title': u'\u6211\u5728\u5357\u5927\u7684\u4e03\u5e74', 120 | 'url': u'http://mindhacks.cn/2009/05/17/seven-years-in-nju/'} 121 | 2013-03-08 17:46:15+0800 [mindhacks] DEBUG: Crawled (200) (referer: http://mindhacks.cn/) 122 | 2013-03-08 17:46:15+0800 [mindhacks] DEBUG: Scraped from <200 http://mindhacks.cn/2009/07/06/why-you-should-do-it-yourself/> 123 | {'created': u'2009-07-06T21:48:07+00:00', 124 | 'site': 'mindhacks', 125 | 'title': u'[BetterExplained]\u9047\u5230\u95ee\u9898\u4e3a\u4ec0\u4e48\u5e94\u8be5\u81ea\u5df1\u52a8\u624b', 126 | 'url': u'http://mindhacks.cn/2009/07/06/why-you-should-do-it-yourself/'} 127 | 2013-03-08 17:46:15+0800 [mindhacks] DEBUG: Crawled (200) (referer: http://mindhacks.cn/) 128 | 2013-03-08 17:46:15+0800 [mindhacks] DEBUG: Scraped from <200 http://mindhacks.cn/2009/10/05/im-a-tiny-bird-book-review/> 129 | {'created': u'2009-10-05T01:18:30+00:00', 130 | 'site': 'mindhacks', 131 | 'title': u'\u4e0d\u662f\u4e66\u8bc4 \uff1a\u300a\u6211\u662f\u4e00\u53eaIT\u5c0f\u5c0f\u9e1f\u300b', 132 | 'url': u'http://mindhacks.cn/2009/10/05/im-a-tiny-bird-book-review/'} 133 | 2013-03-08 17:46:16+0800 [mindhacks] DEBUG: Crawled (200) (referer: 
http://mindhacks.cn/) 134 | 2013-03-08 17:46:16+0800 [mindhacks] DEBUG: Scraped from <200 http://mindhacks.cn/2009/12/20/dark-time/> 135 | {'created': u'2009-12-20T13:39:00+00:00', 136 | 'site': 'mindhacks', 137 | 'title': u'\u6697\u65f6\u95f4', 138 | 'url': u'http://mindhacks.cn/2009/12/20/dark-time/'} 139 | 2013-03-08 17:46:16+0800 [mindhacks] DEBUG: Crawled (200) (referer: http://mindhacks.cn/) 140 | 2013-03-08 17:46:16+0800 [mindhacks] DEBUG: Scraped from <200 http://mindhacks.cn/2010/03/18/escape-from-your-shawshank-part3/> 141 | {'created': u'2010-03-18T00:28:14+00:00', 142 | 'site': 'mindhacks', 143 | 'title': u'\u9003\u51fa\u4f60\u7684\u8096\u7533\u514b\uff08\u4e09\uff09\uff1a\u9047\u89c120\u4e07\u5e74\u524d\u7684\u81ea\u5df1', 144 | 'url': u'http://mindhacks.cn/2010/03/18/escape-from-your-shawshank-part3/'} 145 | 2013-03-08 17:46:16+0800 [mindhacks] DEBUG: Crawled (200) (referer: http://mindhacks.cn/) 146 | 2013-03-08 17:46:16+0800 [mindhacks] DEBUG: Scraped from <200 http://mindhacks.cn/2010/11/14/the-importance-of-knowing-why-part2/> 147 | {'created': u'2010-11-14T17:41:00+00:00', 148 | 'site': 'mindhacks', 149 | 'title': u'\u77e5\u5176\u6240\u4ee5\u7136\uff08\u7eed\uff09', 150 | 'url': u'http://mindhacks.cn/2010/11/14/the-importance-of-knowing-why-part2/'} 151 | 2013-03-08 17:46:17+0800 [mindhacks] DEBUG: Crawled (200) (referer: http://mindhacks.cn/) 152 | 2013-03-08 17:46:17+0800 [mindhacks] DEBUG: Scraped from <200 http://mindhacks.cn/2011/01/23/escape-from-your-shawshank-4/> 153 | {'created': u'2011-01-23T20:28:34+00:00', 154 | 'site': 'mindhacks', 155 | 'title': u'\u9003\u51fa\u4f60\u7684\u8096\u7533\u514b\uff08\u56db\uff09\uff1a\u7406\u667a\u4e0e\u60c5\u611f', 156 | 'url': u'http://mindhacks.cn/2011/01/23/escape-from-your-shawshank-4/'} 157 | 2013-03-08 17:46:17+0800 [mindhacks] DEBUG: Crawled (200) (referer: http://mindhacks.cn/) 158 | 2013-03-08 17:46:17+0800 [mindhacks] DEBUG: Crawled (200) (referer: http://mindhacks.cn/page/2/) 159 | 2013-03-08 17:46:17+0800 [mindhacks] DEBUG: Scraped from <200 http://mindhacks.cn/2011/07/10/the-importance-of-knowing-why-part3/> 160 | {'created': u'2011-07-10T00:24:32+00:00', 161 | 'site': 'mindhacks', 162 | 'title': u'\u77e5\u5176\u6240\u4ee5\u7136\uff08\u4e09\uff09\uff1a\u4e3a\u4ec0\u4e48\u7b97\u6cd5\u8fd9\u4e48\u96be\uff1f', 163 | 'url': u'http://mindhacks.cn/2011/07/10/the-importance-of-knowing-why-part3/'} 164 | 2013-03-08 17:46:17+0800 [mindhacks] DEBUG: Crawled (200) (referer: http://mindhacks.cn/page/2/) 165 | 2013-03-08 17:46:17+0800 [mindhacks] DEBUG: Scraped from <200 http://mindhacks.cn/2009/01/16/hammers-and-nails/> 166 | {'created': u'2009-01-16T21:25:00+00:00', 167 | 'site': 'mindhacks', 168 | 'title': u'\u9524\u5b50\u548c\u9489\u5b50', 169 | 'url': u'http://mindhacks.cn/2009/01/16/hammers-and-nails/'} 170 | 2013-03-08 17:46:18+0800 [mindhacks] DEBUG: Crawled (200) (referer: http://mindhacks.cn/page/2/) 171 | 2013-03-08 17:46:18+0800 [mindhacks] DEBUG: Scraped from <200 http://mindhacks.cn/2009/01/18/escape-from-your-shawshank-part1/> 172 | {'created': u'2009-01-18T21:32:00+00:00', 173 | 'site': 'mindhacks', 174 | 'title': u'\u9003\u51fa\u4f60\u7684\u8096\u7533\u514b\uff08\u4e00\uff09\uff1a\u4e3a\u4ec0\u4e48\u4e00\u5b9a\u8981\u4eb2\u8eab\u7ecf\u5386\u4e86\u4e4b\u540e\u624d\u80fd\u660e\u767d\uff1f', 175 | 'url': u'http://mindhacks.cn/2009/01/18/escape-from-your-shawshank-part1/'} 176 | 2013-03-08 17:46:18+0800 [mindhacks] DEBUG: Crawled (200) (referer: http://mindhacks.cn/page/2/) 177 | 2013-03-08 17:46:18+0800 
[mindhacks] DEBUG: Scraped from <200 http://mindhacks.cn/2009/02/07/independence-day/> 178 | {'created': u'2009-02-07T12:57:34+00:00', 179 | 'site': 'mindhacks', 180 | 'title': u'\u72ec\u7acb\u65e5', 181 | 'url': u'http://mindhacks.cn/2009/02/07/independence-day/'} 182 | 2013-03-08 17:46:18+0800 [mindhacks] DEBUG: Crawled (200) (referer: http://mindhacks.cn/page/2/) 183 | 2013-03-08 17:46:18+0800 [mindhacks] DEBUG: Scraped from <200 http://mindhacks.cn/2009/02/07/better-explained-conflicts-in-intimate-relationship/> 184 | {'created': u'2009-02-07T20:35:17+00:00', 185 | 'site': 'mindhacks', 186 | 'title': u'[BetterExplained]\u4eb2\u5bc6\u5173\u7cfb\u4e2d\u7684\u51b2\u7a81\u89e3\u51b3', 187 | 'url': u'http://mindhacks.cn/2009/02/07/better-explained-conflicts-in-intimate-relationship/'} 188 | 2013-03-08 17:46:19+0800 [mindhacks] DEBUG: Crawled (200) (referer: http://mindhacks.cn/page/2/) 189 | 2013-03-08 17:46:19+0800 [mindhacks] DEBUG: Scraped from <200 http://mindhacks.cn/2009/02/09/writing-is-better-thinking/> 190 | {'created': u'2009-02-09T22:24:00+00:00', 191 | 'site': 'mindhacks', 192 | 'title': u'[BetterExplained]\u4e66\u5199\u662f\u4e3a\u4e86\u66f4\u597d\u7684\u601d\u8003', 193 | 'url': u'http://mindhacks.cn/2009/02/09/writing-is-better-thinking/'} 194 | 2013-03-08 17:46:19+0800 [mindhacks] DEBUG: Crawled (200) (referer: http://mindhacks.cn/page/2/) 195 | 2013-03-08 17:46:19+0800 [mindhacks] DEBUG: Scraped from <200 http://mindhacks.cn/2009/02/15/why-you-should-start-blogging-now/> 196 | {'created': u'2009-02-15T19:57:26+00:00', 197 | 'site': 'mindhacks', 198 | 'title': u'[BetterExplained]\u4e3a\u4ec0\u4e48\u4f60\u5e94\u8be5\uff08\u4ece\u73b0\u5728\u5f00\u59cb\u5c31\uff09\u5199\u535a\u5ba2', 199 | 'url': u'http://mindhacks.cn/2009/02/15/why-you-should-start-blogging-now/'} 200 | 2013-03-08 17:46:19+0800 [mindhacks] DEBUG: Crawled (200) (referer: http://mindhacks.cn/page/2/) 201 | 2013-03-08 17:46:19+0800 [mindhacks] DEBUG: Scraped from <200 http://mindhacks.cn/2009/03/09/first-principles-of-programming/> 202 | {'created': u'2009-03-09T15:12:00+00:00', 203 | 'site': 'mindhacks', 204 | 'title': u'\u7f16\u7a0b\u7684\u9996\u8981\u539f\u5219(s)\u662f\u4ec0\u4e48\uff1f', 205 | 'url': u'http://mindhacks.cn/2009/03/09/first-principles-of-programming/'} 206 | 2013-03-08 17:46:20+0800 [mindhacks] DEBUG: Crawled (200) (referer: http://mindhacks.cn/page/2/) 207 | 2013-03-08 17:46:20+0800 [mindhacks] DEBUG: Scraped from <200 http://mindhacks.cn/2009/03/15/preconception-explained/> 208 | {'created': u'2009-03-15T18:49:27+00:00', 209 | 'site': 'mindhacks', 210 | 'title': u'\u9003\u51fa\u4f60\u7684\u8096\u7533\u514b\uff08\u4e8c\uff09\uff1a\u4ec1\u8005\u89c1\u4ec1\u667a\u8005\u89c1\u667a\uff1f\u4ece\u89c6\u89c9\u9519\u89c9\u5230\u504f\u89c1', 211 | 'url': u'http://mindhacks.cn/2009/03/15/preconception-explained/'} 212 | 2013-03-08 17:46:20+0800 [mindhacks] DEBUG: Crawled (200) (referer: http://mindhacks.cn/page/2/) 213 | 2013-03-08 17:46:20+0800 [mindhacks] DEBUG: Scraped from <200 http://mindhacks.cn/2009/03/28/effective-learning-and-memorization/> 214 | {'created': u'2009-03-28T19:23:02+00:00', 215 | 'site': 'mindhacks', 216 | 'title': u'[BetterExplained]\u5982\u4f55\u6709\u6548\u5730\u8bb0\u5fc6\u4e0e\u5b66\u4e60', 217 | 'url': u'http://mindhacks.cn/2009/03/28/effective-learning-and-memorization/'} 218 | 2013-03-08 17:46:20+0800 [mindhacks] DEBUG: Crawled (200) (referer: http://mindhacks.cn/page/2/) 219 | 2013-03-08 17:46:20+0800 [mindhacks] DEBUG: Scraped from <200 
http://mindhacks.cn/2009/05/17/seven-years-in-nju/> 220 | {'created': u'2009-05-17T23:57:30+00:00', 221 | 'site': 'mindhacks', 222 | 'title': u'\u6211\u5728\u5357\u5927\u7684\u4e03\u5e74', 223 | 'url': u'http://mindhacks.cn/2009/05/17/seven-years-in-nju/'} 224 | 2013-03-08 17:46:20+0800 [mindhacks] DEBUG: Crawled (200) (referer: http://mindhacks.cn/page/3/) 225 | 2013-03-08 17:46:21+0800 [mindhacks] DEBUG: Crawled (200) (referer: http://mindhacks.cn/page/3/) 226 | 2013-03-08 17:46:21+0800 [mindhacks] DEBUG: Scraped from <200 http://mindhacks.cn/2008/07/07/the-importance-of-knowing-why/> 227 | {'created': u'2008-07-07T21:05:00+00:00', 228 | 'site': 'mindhacks', 229 | 'title': u'\u77e5\u5176\u6240\u4ee5\u7136\uff08\u4ee5\u7b97\u6cd5\u5b66\u4e60\u4e3a\u4f8b\uff09', 230 | 'url': u'http://mindhacks.cn/2008/07/07/the-importance-of-knowing-why/'} 231 | 2013-03-08 17:46:21+0800 [mindhacks] DEBUG: Crawled (200) (referer: http://mindhacks.cn/page/3/) 232 | 2013-03-08 17:46:21+0800 [mindhacks] DEBUG: Scraped from <200 http://mindhacks.cn/2008/07/08/learning-habits-part1/> 233 | {'created': u'2008-07-08T21:13:00+00:00', 234 | 'site': 'mindhacks', 235 | 'title': u'\u4e00\u76f4\u4ee5\u6765\u4f34\u968f\u6211\u7684\u4e00\u4e9b\u5b66\u4e60\u4e60\u60ef(\u4e00)\uff1a\u5b66\u4e60\u4e0e\u601d\u8003', 236 | 'url': u'http://mindhacks.cn/2008/07/08/learning-habits-part1/'} 237 | 2013-03-08 17:46:21+0800 [mindhacks] DEBUG: Crawled (200) (referer: http://mindhacks.cn/page/3/) 238 | 2013-03-08 17:46:21+0800 [mindhacks] DEBUG: Scraped from <200 http://mindhacks.cn/2008/07/20/learning-habits-part2/> 239 | {'created': u'2008-07-20T21:16:00+00:00', 240 | 'site': 'mindhacks', 241 | 'title': u'\u4e00\u76f4\u4ee5\u6765\u4f34\u968f\u6211\u7684\u4e00\u4e9b\u5b66\u4e60\u4e60\u60ef(\u4e8c)\uff1a\u65f6\u95f4\u7ba1\u7406', 242 | 'url': u'http://mindhacks.cn/2008/07/20/learning-habits-part2/'} 243 | 2013-03-08 17:46:21+0800 [mindhacks] DEBUG: Crawled (200) (referer: http://mindhacks.cn/page/3/) 244 | 2013-03-08 17:46:22+0800 [mindhacks] DEBUG: Scraped from <200 http://mindhacks.cn/2008/09/11/machine-learning-and-ai-resources/> 245 | {'created': u'2008-09-11T19:29:00+00:00', 246 | 'site': 'mindhacks', 247 | 'title': u'\u673a\u5668\u5b66\u4e60\u4e0e\u4eba\u5de5\u667a\u80fd\u5b66\u4e60\u8d44\u6e90\u5bfc\u5f15', 248 | 'url': u'http://mindhacks.cn/2008/09/11/machine-learning-and-ai-resources/'} 249 | 2013-03-08 17:46:22+0800 [mindhacks] DEBUG: Crawled (200) (referer: http://mindhacks.cn/page/3/) 250 | 2013-03-08 17:46:22+0800 [mindhacks] DEBUG: Scraped from <200 http://mindhacks.cn/2008/09/17/learning-habits-part3/> 251 | {'created': u'2008-09-17T21:18:00+00:00', 252 | 'site': 'mindhacks', 253 | 'title': u'\u4e00\u76f4\u4ee5\u6765\u4f34\u968f\u6211\u7684\u4e00\u4e9b\u5b66\u4e60\u4e60\u60ef(\u4e09)\uff1a\u9605\u8bfb\u65b9\u6cd5', 254 | 'url': u'http://mindhacks.cn/2008/09/17/learning-habits-part3/'} 255 | 2013-03-08 17:46:22+0800 [mindhacks] DEBUG: Crawled (200) (referer: http://mindhacks.cn/page/3/) 256 | 2013-03-08 17:46:22+0800 [mindhacks] DEBUG: Scraped from <200 http://mindhacks.cn/2008/09/21/the-magical-bayesian-method/> 257 | {'created': u'2008-09-21T19:34:00+00:00', 258 | 'site': 'mindhacks', 259 | 'title': u'\u6570\u5b66\u4e4b\u7f8e\u756a\u5916\u7bc7\uff1a\u5e73\u51e1\u800c\u53c8\u795e\u5947\u7684\u8d1d\u53f6\u65af\u65b9\u6cd5', 260 | 'url': u'http://mindhacks.cn/2008/09/21/the-magical-bayesian-method/'} 261 | 2013-03-08 17:46:22+0800 [mindhacks] DEBUG: Crawled (200) (referer: http://mindhacks.cn/page/3/) 262 | 2013-03-08 
17:46:22+0800 [mindhacks] DEBUG: Scraped from <200 http://mindhacks.cn/2008/10/29/methodology-for-programmers/> 263 | {'created': u'2008-10-29T21:09:00+00:00', 264 | 'site': 'mindhacks', 265 | 'title': u'\u65b9\u6cd5\u8bba\u3001\u65b9\u6cd5\u8bba\u2014\u2014\u7a0b\u5e8f\u5458\u7684\u963f\u5580\u7409\u65af\u4e4b\u8e35', 266 | 'url': u'http://mindhacks.cn/2008/10/29/methodology-for-programmers/'} 267 | 2013-03-08 17:46:23+0800 [mindhacks] DEBUG: Crawled (200) (referer: http://mindhacks.cn/page/3/) 268 | 2013-03-08 17:46:23+0800 [mindhacks] DEBUG: Scraped from <200 http://mindhacks.cn/2008/12/05/learning-habits-part4/> 269 | {'created': u'2008-12-05T21:21:00+00:00', 270 | 'site': 'mindhacks', 271 | 'title': u'\u4e00\u76f4\u4ee5\u6765\u4f34\u968f\u6211\u7684\u4e00\u4e9b\u5b66\u4e60\u4e60\u60ef(\u56db)\uff1a\u77e5\u8bc6\u7ed3\u6784', 272 | 'url': u'http://mindhacks.cn/2008/12/05/learning-habits-part4/'} 273 | 2013-03-08 17:46:23+0800 [mindhacks] DEBUG: Crawled (200) (referer: http://mindhacks.cn/page/3/) 274 | 2013-03-08 17:46:23+0800 [mindhacks] DEBUG: Scraped from <200 http://mindhacks.cn/2008/12/18/how-to-think-straight/> 275 | {'created': u'2008-12-18T20:10:00+00:00', 276 | 'site': 'mindhacks', 277 | 'title': u'\u5982\u4f55\u6e05\u6670\u5730\u601d\u8003\uff08\u8fd1\u4e00\u5e74\u6765\u4e1a\u4f59\u9605\u8bfb\u7684\u5173\u4e8e\u601d\u7ef4\u65b9\u9762\u7684\u77e5\u8bc6\u7ed3\u6784\u6574\u7406\uff09', 278 | 'url': u'http://mindhacks.cn/2008/12/18/how-to-think-straight/'} 279 | 2013-03-08 17:46:23+0800 [mindhacks] DEBUG: Crawled (200) (referer: http://mindhacks.cn/page/3/) 280 | 2013-03-08 17:46:23+0800 [mindhacks] DEBUG: Scraped from <200 http://mindhacks.cn/2009/01/14/make-yourself-irreplacable/> 281 | {'created': u'2009-01-14T21:28:00+00:00', 282 | 'site': 'mindhacks', 283 | 'title': u'\u4ec0\u4e48\u624d\u662f\u4f60\u7684\u4e0d\u53ef\u66ff\u4ee3\u6027\u548c\u6838\u5fc3\u7ade\u4e89\u529b', 284 | 'url': u'http://mindhacks.cn/2009/01/14/make-yourself-irreplacable/'} 285 | 2013-03-08 17:46:24+0800 [mindhacks] DEBUG: Crawled (200) (referer: http://mindhacks.cn/page/4/) 286 | 2013-03-08 17:46:24+0800 [mindhacks] DEBUG: Scraped from <200 http://mindhacks.cn/2006/10/15/cantor-godel-turing-an-eternal-golden-diagonal/> 287 | {'created': u'2006-10-15T19:16:00+00:00', 288 | 'site': 'mindhacks', 289 | 'title': u'\u5eb7\u6258\u5c14\u3001\u54e5\u5fb7\u5c14\u3001\u56fe\u7075\u2014\u2014\u6c38\u6052\u7684\u91d1\u8272\u5bf9\u89d2\u7ebf(rev#2)', 290 | 'url': u'http://mindhacks.cn/2006/10/15/cantor-godel-turing-an-eternal-golden-diagonal/'} 291 | 2013-03-08 17:46:24+0800 [mindhacks] DEBUG: Crawled (200) (referer: http://mindhacks.cn/page/4/) 292 | 2013-03-08 17:46:24+0800 [mindhacks] DEBUG: Scraped from <200 http://mindhacks.cn/2007/05/24/learn-to-focus/> 293 | {'created': u'2007-05-24T20:30:00+00:00', 294 | 'site': 'mindhacks', 295 | 'title': u'\u5b66\u4e60\u5bc6\u5ea6\u4e0e\u4e13\u6ce8\u529b', 296 | 'url': u'http://mindhacks.cn/2007/05/24/learn-to-focus/'} 297 | 2013-03-08 17:46:24+0800 [mindhacks] DEBUG: Crawled (200) (referer: http://mindhacks.cn/page/4/) 298 | 2013-03-08 17:46:24+0800 [mindhacks] DEBUG: Scraped from <200 http://mindhacks.cn/2007/12/02/probability-theory-in-evolution/> 299 | {'created': u'2007-12-02T18:55:00+00:00', 300 | 'site': 'mindhacks', 301 | 'title': u'\u6570\u5b66\u4e4b\u7f8e\u756a\u5916\u7bc7\uff1a\u8fdb\u5316\u8bba\u4e2d\u7684\u6982\u7387\u8bba', 302 | 'url': u'http://mindhacks.cn/2007/12/02/probability-theory-in-evolution/'} 303 | 2013-03-08 17:46:24+0800 [mindhacks] DEBUG: Crawled 
(200) (referer: http://mindhacks.cn/page/4/) 304 | 2013-03-08 17:46:25+0800 [mindhacks] DEBUG: Scraped from <200 http://mindhacks.cn/2008/03/03/failing-to-see-the-big-picture/> 305 | {'created': u'2008-03-03T15:42:00+00:00', 306 | 'site': 'mindhacks', 307 | 'title': u'Failing To See the Big Picture \u2013 Mistakes we make when learning programming', 308 | 'url': u'http://mindhacks.cn/2008/03/03/failing-to-see-the-big-picture/'} 309 | 2013-03-08 17:46:25+0800 [mindhacks] DEBUG: Crawled (200) (referer: http://mindhacks.cn/page/4/) 310 | 2013-03-08 17:46:25+0800 [mindhacks] DEBUG: Scraped from <200 http://mindhacks.cn/2008/04/08/reading-method/> 311 | {'created': u'2008-04-08T21:00:00+00:00', 312 | 'site': 'mindhacks', 313 | 'title': u'\u9605\u8bfb\u4e0e\u601d\u8003', 314 | 'url': u'http://mindhacks.cn/2008/04/08/reading-method/'} 315 | 2013-03-08 17:46:25+0800 [mindhacks] DEBUG: Crawled (200) (referer: http://mindhacks.cn/page/4/) 316 | 2013-03-08 17:46:25+0800 [mindhacks] DEBUG: Scraped from <200 http://mindhacks.cn/2008/04/18/learning-from-polya/> 317 | {'created': u'2008-04-18T21:37:00+00:00', 318 | 'site': 'mindhacks', 319 | 'title': u'\u8ddf\u6ce2\u5229\u4e9a\u5b66\u89e3\u9898(rev#3)', 320 | 'url': u'http://mindhacks.cn/2008/04/18/learning-from-polya/'} 321 | 2013-03-08 17:46:25+0800 [mindhacks] DEBUG: Crawled (200) (referer: http://mindhacks.cn/page/4/) 322 | 2013-03-08 17:46:25+0800 [mindhacks] DEBUG: Scraped from <200 http://mindhacks.cn/2008/06/05/how-memory-works/> 323 | {'created': u'2008-06-05T21:40:00+00:00', 324 | 'site': 'mindhacks', 325 | 'title': u'\u5b66\u4e60\u4e0e\u8bb0\u5fc6', 326 | 'url': u'http://mindhacks.cn/2008/06/05/how-memory-works/'} 327 | 2013-03-08 17:46:26+0800 [mindhacks] DEBUG: Crawled (200) (referer: http://mindhacks.cn/page/4/) 328 | 2013-03-08 17:46:26+0800 [mindhacks] DEBUG: Scraped from <200 http://mindhacks.cn/2008/06/13/why-is-quicksort-so-quick/> 329 | {'created': u'2008-06-13T19:53:00+00:00', 330 | 'site': 'mindhacks', 331 | 'title': u'\u6570\u5b66\u4e4b\u7f8e\u756a\u5916\u7bc7\uff1a\u5feb\u6392\u4e3a\u4ec0\u4e48\u90a3\u6837\u5feb', 332 | 'url': u'http://mindhacks.cn/2008/06/13/why-is-quicksort-so-quick/'} 333 | 2013-03-08 17:46:26+0800 [mindhacks] INFO: Closing spider (finished) 334 | 2013-03-08 17:46:26+0800 [mindhacks] INFO: Stored jsonlines feed (53 items) in: /root/newsspider/.scrapy/scrapyd/items/default/mindhacks/ff9b53c687d411e2adba7e97b6ad9650.jl 335 | 2013-03-08 17:46:26+0800 [mindhacks] INFO: Dumping Scrapy stats: 336 | {'downloader/request_bytes': 17825, 337 | 'downloader/request_count': 58, 338 | 'downloader/request_method_count/GET': 58, 339 | 'downloader/response_bytes': 1501575, 340 | 'downloader/response_count': 58, 341 | 'downloader/response_status_count/200': 57, 342 | 'downloader/response_status_count/301': 1, 343 | 'finish_reason': 'finished', 344 | 'finish_time': datetime.datetime(2013, 3, 8, 9, 46, 26, 149294), 345 | 'item_scraped_count': 53, 346 | 'log_count/DEBUG': 117, 347 | 'log_count/INFO': 5, 348 | 'request_depth_max': 4, 349 | 'response_received_count': 57, 350 | 'scheduler/dequeued': 58, 351 | 'scheduler/dequeued/memory': 58, 352 | 'scheduler/enqueued': 58, 353 | 'scheduler/enqueued/memory': 58, 354 | 'start_time': datetime.datetime(2013, 3, 8, 9, 46, 8, 299627)} 355 | 2013-03-08 17:46:26+0800 [mindhacks] INFO: Spider closed (finished) 356 | -------------------------------------------------------------------------------- /newsspider/.scrapy/scrapyd/logs/default/somespider/28e7b04a8a0e11e2adba7e97b6ad9650.log: 
--------------------------------------------------------------------------------
2013-03-11 13:40:22+0800 [scrapy] INFO: Scrapy 0.17.0 started (bot: Baiduspider)
2013-03-11 13:40:23+0800 [scrapy] DEBUG: Enabled extensions: FeedExporter, LogStats, TelnetConsole, CloseSpider, WebService, CoreStats, SpiderState
2013-03-11 13:40:23+0800 [scrapy] DEBUG: Enabled downloader middlewares: HttpAuthMiddleware, DownloadTimeoutMiddleware, UserAgentMiddleware, RetryMiddleware, DefaultHeadersMiddleware, MetaRefreshMiddleware, HttpCompressionMiddleware, RedirectMiddleware, CookiesMiddleware, ChunkedTransferMiddleware, DownloaderStats
2013-03-11 13:40:23+0800 [scrapy] DEBUG: Enabled spider middlewares: HttpErrorMiddleware, OffsiteMiddleware, RefererMiddleware, UrlLengthMiddleware, DepthMiddleware
2013-03-11 13:40:23+0800 [scrapy] DEBUG: Enabled item pipelines: NewsspiderPipeline

--------------------------------------------------------------------------------
/newsspider/README.md:
--------------------------------------------------------------------------------
Tech news spider

TODOs

* Support more tech sites
* Extract abstracts for links
* Improve the front end for viewing crawled results

--------------------------------------------------------------------------------
/newsspider/dbs/default.db:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/feiskyer/scrapy-examples/3f72fe87d9272859f7bce31d0394ca078a6ab4a3/newsspider/dbs/default.db

--------------------------------------------------------------------------------
/newsspider/newsspider/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/feiskyer/scrapy-examples/3f72fe87d9272859f7bce31d0394ca078a6ab4a3/newsspider/newsspider/__init__.py

--------------------------------------------------------------------------------
/newsspider/newsspider/commands/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/feiskyer/scrapy-examples/3f72fe87d9272859f7bce31d0394ca078a6ab4a3/newsspider/newsspider/commands/__init__.py

--------------------------------------------------------------------------------
/newsspider/newsspider/commands/allcrawl.py:
--------------------------------------------------------------------------------
from scrapy.command import ScrapyCommand
import urllib
import urllib2
from scrapy import log

class AllCrawlCommand(ScrapyCommand):
    requires_project = True
    default_settings = {'LOG_ENABLED': False}

    def short_desc(self):
        return "Schedule a run for all available spiders (run scrapy server first)"

    def run(self, args, opts):
        # POST every known spider to the local scrapyd schedule endpoint
        url = 'http://localhost:6800/schedule.json'
        for s in self.crawler.spiders.list():
            values = {'project': 'default', 'spider': s}
            data = urllib.urlencode(values)
            req = urllib2.Request(url, data)
            response = urllib2.urlopen(req)
            log.msg(response)

--------------------------------------------------------------------------------
/newsspider/newsspider/items.py:
--------------------------------------------------------------------------------
# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html

from scrapy.item import Item, Field

class NewsspiderItem(Item):
    # define the fields for your item here
    title = Field()
    url = Field()
    site = Field()
    abstract = Field()
    created = Field()

--------------------------------------------------------------------------------
/newsspider/newsspider/middlewares.py:
--------------------------------------------------------------------------------
# base64 is needed only when the proxy requires authentication
import base64
import random

class ProxyMiddleware(object):
    # override process_request
    def process_request(self, request, spider):
        # pick a random proxy (one "host port" pair per line) from proxies.txt
        data = file('proxies.txt', 'r').readlines()
        length = len(data)
        index = random.randint(0, length - 1)
        item = data[index]
        arr = item.split()
        request.meta['proxy'] = "http://%s:%s" % (arr[0], arr[1])

        # Use the following lines if your proxy requires authentication:
        # proxy_user_pass = "USERNAME:PASSWORD"
        # setup basic authentication for the proxy
        # encoded_user_pass = base64.encodestring(proxy_user_pass)
        # request.headers['Proxy-Authorization'] = 'Basic ' + encoded_user_pass
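ProxyMiddleware splits each line of proxies.txt on whitespace and uses the
first two fields as host and port, so the file is expected to look roughly
like this (placeholder addresses; the repository's actual proxies.txt is not
shown here):

    10.0.0.1 8080
    10.0.0.2 3128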
for your item here like: 10 | title = Field() 11 | url = Field() 12 | site = Field() 13 | abstract = Field() 14 | created = Field() 15 | -------------------------------------------------------------------------------- /newsspider/newsspider/middlewares.py: -------------------------------------------------------------------------------- 1 | # base64 is imported because we'll need it only if the proxy 2 | # we are going to use requires authentication 3 | import base64 4 | import random 5 | 6 | class ProxyMiddleware(object): 7 | # override process_request 8 | def process_request(self, request, spider): 9 | data = open('proxies.txt', 'r').readlines() 10 | length = len(data) 11 | index = random.randint(0, length - 1) 12 | item = data[index] 13 | arr = item.split() 14 | request.meta['proxy'] = "http://%s:%s" % (arr[0], arr[1]) 15 | 16 | # Use the following lines if your proxy requires authentication 17 | # proxy_user_pass = "USERNAME:PASSWORD" 18 | # setup basic authentication for the proxy 19 | # encoded_user_pass = base64.encodestring(proxy_user_pass) 20 | # request.headers['Proxy-Authorization'] = 'Basic ' + encoded_user_pass 21 | 22 | -------------------------------------------------------------------------------- /newsspider/newsspider/pipelines.py: -------------------------------------------------------------------------------- 1 | # Define your item pipelines here 2 | # 3 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 4 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html 5 | import sys 6 | import MySQLdb 7 | import hashlib 8 | from scrapy.exceptions import DropItem 9 | from scrapy import log 10 | import time 11 | import os 12 | import uuid 13 | 14 | class NewsspiderPipeline(object): 15 | 16 | def __init__(self): 17 | self.conn = MySQLdb.connect(user='root', 18 | passwd='feisky', db='news', host='localhost', 19 | charset="utf8", 20 | use_unicode=True) 21 | self.cursor = self.conn.cursor() 22 | self.downloadPreview=False 23 | self.pngPath='/var/scrapy' 24 | self.filename='' 25 | 26 | def process_item(self, item, spider): 27 | settings = spider.settings 28 | if settings['DOWNLOAD_PREVIEW']: 29 | self.downloadPreview=True 30 | if settings['PNG_PATH']: 31 | self.pngPath=settings['PNG_PATH'] 32 | 33 | url = item.get('url', '') 34 | if len(url)==0: 35 | return item 36 | 37 | try: 38 | if self.downloadPreview: 39 | self.filename = "%s/%s.png" % (self.pngPath, str(uuid.uuid1())) 40 | cmd = os.popen(u'''/usr/bin/webkit2png -x 1366 768 -F javascript "%s" -o "%s"''' % 41 | (url, self.filename)) 42 | result = cmd.read() 43 | if 'Failed' in result: 44 | os.unlink(self.filename) 45 | cmd.close() 46 | 47 | if self.cursor.execute('select * from news where url=%s', 48 | (url,)) == 0: 49 | self.cursor.execute( 50 | """INSERT INTO news(title,url,site,abstract, created, file) 51 | VALUES (%s, %s, %s, %s, %s, %s)""", 52 | ( item.get('title','').encode('utf-8'), 53 | url, 54 | item.get('site', ''), 55 | item.get('abstract', '').encode('utf-8'), 56 | item.get('created', time.strftime('%Y-%m-%d %H:%M:%S')), 57 | self.filename.split('/')[-1]) ) 58 | self.conn.commit() 59 | else: 60 | log.msg('%s already exists' % url, level=log.WARNING) 61 | except MySQLdb.Error, e: 62 | log.msg("Error %d: %s" % (e.args[0], e.args[1]), level=log.ERROR) 63 | return item 64 | 65 | def finalize(self): 66 | if self.conn is not None: 67 | self.conn.commit() 68 | self.conn.close() 69 | self.conn=None 70 | 71 |
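Note: NewsspiderPipeline assumes a 'news' table already exists in the 'news' database; the repository ships no schema for it. A minimal one-off sketch for creating a compatible table follows -- the column names come from the INSERT statement above, but the column types are assumptions, not taken from the repo:

    import MySQLdb

    def create_news_table():
        # Credentials match the ones hard-coded in NewsspiderPipeline.__init__.
        conn = MySQLdb.connect(user='root', passwd='feisky', db='news',
                               host='localhost', charset='utf8')
        # Column types below are guesses sized for typical feed data.
        conn.cursor().execute("""
            CREATE TABLE IF NOT EXISTS news (
                title    VARCHAR(255),
                url      VARCHAR(512),
                site     VARCHAR(64),
                abstract TEXT,
                created  VARCHAR(64),
                file     VARCHAR(255)
            ) DEFAULT CHARSET=utf8""")
        conn.commit()
        conn.close()

    if __name__ == '__main__':
        create_news_table()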
-------------------------------------------------------------------------------- /newsspider/newsspider/settings.py: -------------------------------------------------------------------------------- 1 | # Scrapy settings for newsspider project 2 | # 3 | # For simplicity, this file contains only the most important settings by 4 | # default. All the other settings are documented here: 5 | # 6 | # http://doc.scrapy.org/en/latest/topics/settings.html 7 | # 8 | 9 | BOT_NAME = 'Newsspider' 10 | USER_AGENT = 'Newsspider+(+http://www.www.com/)' 11 | # Baiduspider+(+http://www.baidu.com/search/spider.htm") 12 | # Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html) 13 | # Googlebot/2.1 (+http://www.googlebot.com/bot.html) 14 | # Googlebot/2.1 (+http://www.google.com/bot.html) 15 | # Mozilla/5.0 (compatible; Yahoo! Slurp China; http://misc.yahoo.com.cn/help.html") 16 | # Mozilla/5.0 (compatible; Yahoo! Slurp; http://help.yahoo.com/help/us/ysearch/slurp") 17 | # iaskspider/2.0(+http://iask.com/help/help_index.html") 18 | # Mozilla/5.0 (compatible; iaskspider/1.0; MSIE 6.0) 19 | # Sogou web spider/3.0(+http://www.sogou.com/docs/help/webmasters.htm#07") 20 | # Sogou Push Spider/3.0(+http://www.sogou.com/docs/help/webmasters.htm#07") 21 | # Mozilla/5.0 (compatible; YodaoBot/1.0; http://www.yodao.com/help/webmaster/spider/"; ) 22 | # msnbot/1.0 (+http://search.msn.com/msnbot.htm") 23 | 24 | SPIDER_MODULES = ['newsspider.spiders'] 25 | NEWSPIDER_MODULE = 'newsspider.spiders' 26 | COMMANDS_MODULE = 'newsspider.commands' 27 | DUPEFILTER_CLASS = 'scrapy.dupefilter.RFPDupeFilter' 28 | ITEM_PIPELINES = ['newsspider.pipelines.NewsspiderPipeline'] 29 | SCHEDULER = 'scrapy.core.scheduler.Scheduler' 30 | 31 | DOWNLOADER_MIDDLEWARES = { 32 | # 'newsspider.middlewares.ProxyMiddleware': 100, 33 | 'scrapy.contrib.downloadermiddleware.robotstxt.RobotsTxtMiddleware': 100, 34 | # 'scrapy.contrib.downloadermiddleware.httpproxy.HttpProxyMiddleware': 110, # duplicate key; the entry at 750 (its default position) below is the one that takes effect 35 | 'scrapy.contrib.downloadermiddleware.httpauth.HttpAuthMiddleware': 300, 36 | 'scrapy.contrib.downloadermiddleware.downloadtimeout.DownloadTimeoutMiddleware': 350, 37 | 'scrapy.contrib.downloadermiddleware.useragent.UserAgentMiddleware': 400, 38 | 'scrapy.contrib.downloadermiddleware.retry.RetryMiddleware': 500, 39 | 'scrapy.contrib.downloadermiddleware.defaultheaders.DefaultHeadersMiddleware': 550, 40 | 'scrapy.contrib.downloadermiddleware.redirect.RedirectMiddleware': 600, 41 | 'scrapy.contrib.downloadermiddleware.cookies.CookiesMiddleware': 700, 42 | 'scrapy.contrib.downloadermiddleware.httpproxy.HttpProxyMiddleware': 750, 43 | 'scrapy.contrib.downloadermiddleware.httpcompression.HttpCompressionMiddleware': 800, 44 | 'scrapy.contrib.downloadermiddleware.chunked.ChunkedTransferMiddleware': 830, 45 | 'scrapy.contrib.downloadermiddleware.stats.DownloaderStats': 850, 46 | 'scrapy.contrib.downloadermiddleware.httpcache.HttpCacheMiddleware': 900, 47 | } 48 | 49 | CONCURRENT_ITEMS = 100 50 | CONCURRENT_REQUESTS = 16 51 | CONCURRENT_REQUESTS_PER_DOMAIN = 8 52 | # CONCURRENT_REQUESTS_PER_IP = 8 53 | ROBOTSTXT_OBEY = False 54 | 55 | DEPTH_LIMIT = 6 56 | DOWNLOAD_DELAY = 0.25 57 | RANDOMIZE_DOWNLOAD_DELAY = True 58 | DOWNLOAD_TIMEOUT = 30 59 | DNSCACHE_ENABLED = True 60 | 61 | 62 | #LOG_FILE = '' 63 | LOG_LEVEL = 'DEBUG' 64 | 65 | PNG_PATH = '/var/scrapy' 66 | DOWNLOAD_PREVIEW = False 67 | -------------------------------------------------------------------------------- /newsspider/newsspider/spiders/__init__.py:
-------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /newsspider/newsspider/spiders/cnblogs.py: -------------------------------------------------------------------------------- 1 | from scrapy.contrib.spiders import XMLFeedSpider 2 | from newsspider.items import NewsspiderItem 3 | from scrapy import log 4 | 5 | class FeedSpider(XMLFeedSpider): 6 | name = 'cnblogs' 7 | allowed_domains = ['cnblogs.com'] 8 | start_urls = ['http://feed.cnblogs.com/blog/u/53116/rss'] 9 | iterator = 'iternodes' # you can change this; see the docs 10 | itertag = 'item' # change it accordingly 11 | #namespaces = [ ('content', 'http://purl.org/rss/1.0/modules/content'), 12 | # ('dc', 'http://purl.org/dc/elements/1.1/') ] 13 | 14 | def parse_node(self, response, selector): 15 | #for prefix, uri in self.namespaces: 16 | # selector.register_namespace (prefix, uri) 17 | 18 | item = NewsspiderItem() 19 | item['url'] = selector.select('id/text()').extract()[0] 20 | item['title'] = selector.select('title/text()').extract()[0] 21 | item['created'] = selector.select('published/text()').extract()[0] 22 | item['abstract'] = selector.select('summary/text()').extract()[0] 23 | item['site'] = FeedSpider.name 24 | return item 25 | -------------------------------------------------------------------------------- /newsspider/newsspider/spiders/dbanotes.py: -------------------------------------------------------------------------------- 1 | #from scrapy.selector import HtmlXPathSelector 2 | #from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor 3 | from scrapy.contrib.spiders import CrawlSpider, Rule 4 | from newsspider.items import NewsspiderItem 5 | from BeautifulSoup import BeautifulSoup 6 | import time 7 | 8 | class DbanotesSpider(CrawlSpider): 9 | name = 'dbanotes' 10 | allowed_domains = ['news.dbanotes.net'] 11 | start_urls = ['http://news.dbanotes.net/'] 12 | 13 | #rules = ( 14 | # Rule(SgmlLinkExtractor(allow=r'Items/'), callback='parse_item', follow=True), 15 | #) 16 | 17 | def parse(self, response): 18 | #hxs = HtmlXPathSelector(response) 19 | #i = NewsspiderItem() 20 | #i['domain_id'] = hxs.select('//input[@id="sid"]/@value').extract() 21 | #i['name'] = hxs.select('//div[@id="name"]').extract() 22 | #i['description'] = hxs.select('//div[@id="description"]').extract() 23 | soup = BeautifulSoup(response.body) 24 | links = soup.findAll('td', {'class':'title'}) 25 | for link in links: 26 | linkinfo = link.findChild() 27 | if not linkinfo: continue 28 | title = linkinfo.text 29 | url = linkinfo.get('href', '') 30 | if not url.startswith('http'): 31 | if url.startswith('/'): 32 | url = 'http://news.dbanotes.net' + url 33 | else: 34 | url = 'http://news.dbanotes.net/' + url 35 | 36 | # deal with next page 37 | if title == 'More' or title == 'next': # next page 38 | yield self.make_requests_from_url(url).replace(callback=self.parse) 39 | # get a new news item 40 | elif len(title)>0 and len(url)>0: 41 | item = NewsspiderItem() 42 | item['title'] = title 43 | item['url'] = url 44 | item['site'] = 'dbanotes' 45 | yield item 46 | 47 | -------------------------------------------------------------------------------- /newsspider/newsspider/spiders/hackernews.py: -------------------------------------------------------------------------------- 1 | #from
scrapy.selector import HtmlXPathSelector 2 | #from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor 3 | from scrapy.contrib.spiders import CrawlSpider, Rule 4 | from newsspider.items import NewsspiderItem 5 | from BeautifulSoup import BeautifulSoup 6 | import time 7 | 8 | class HackernewsSpider(CrawlSpider): 9 | name = 'hackernews' 10 | allowed_domains = ['news.ycombinator.com'] 11 | start_urls = ['http://news.ycombinator.com/'] 12 | 13 | #rules = ( 14 | # Rule(SgmlLinkExtractor(allow=r'Items/'), callback='parse_item', follow=True), 15 | #) 16 | 17 | def parse(self, response): 18 | #hxs = HtmlXPathSelector(response) 19 | #i = NewsspiderItem() 20 | #i['domain_id'] = hxs.select('//input[@id="sid"]/@value').extract() 21 | #i['name'] = hxs.select('//div[@id="name"]').extract() 22 | #i['description'] = hxs.select('//div[@id="description"]').extract() 23 | soup=BeautifulSoup(response.body) 24 | links=soup.findAll('td', {'class':'title'}) 25 | for link in links: 26 | linkinfo = link.findChild() 27 | if not linkinfo: continue 28 | title = linkinfo.text 29 | url = linkinfo.get('href', '') 30 | if not url.startswith('http'): 31 | if url.startswith('/'): 32 | url = 'http://news.ycombinator.com' + url 33 | else: 34 | url = 'http://news.ycombinator.com/' + url 35 | 36 | # deal with next page 37 | if title == 'More' or title == 'next': # next page 38 | yield self.make_requests_from_url(url).replace(callback=self.parse) 39 | # get a new news item 40 | elif len(title)>0 and len(url)>0: 41 | item = NewsspiderItem() 42 | item['title']=title 43 | item['url']=url 44 | item['site']='hackernews' 45 | yield item 46 | 47 | -------------------------------------------------------------------------------- /newsspider/newsspider/spiders/jobbole.py: -------------------------------------------------------------------------------- 1 | from scrapy.contrib.spiders import XMLFeedSpider 2 | from newsspider.items import NewsspiderItem 3 | from scrapy import log 4 | 5 | class FeedSpider(XMLFeedSpider): 6 | name = 'jobbole' 7 | allowed_domains = ['blog.jobbole.com'] 8 | start_urls = ['http://blog.jobbole.com/feed/'] 9 | iterator = 'iternodes' # you can change this; see the docs 10 | itertag = 'item' # change it accordingly 11 | namespaces = [ ('content', 'http://purl.org/rss/1.0/modules/content'), 12 | ('dc', 'http://purl.org/dc/elements/1.1/') ] 13 | 14 | def parse_node(self, response, selector): 15 | #for prefix, uri in self.namespaces: 16 | # selector.register_namespace (prefix, uri) 17 | selector.remove_namespaces() 18 | item = NewsspiderItem() 19 | item['url'] = selector.select('link/text()').extract()[0] 20 | item['title'] = selector.select('title/text()').extract()[0] 21 | item['created'] = selector.select('pubDate/text()').extract()[0] 22 | item['abstract'] = selector.select('description/text()').extract()[0] 23 | item['site']= FeedSpider.name 24 | return item 25 | -------------------------------------------------------------------------------- /newsspider/newsspider/spiders/mindhacks.py: -------------------------------------------------------------------------------- 1 | from scrapy.selector import HtmlXPathSelector 2 | from scrapy.contrib.spiders import CrawlSpider, Rule 3 | from newsspider.items import NewsspiderItem 4 | 5 | class MindhacksSpider(CrawlSpider): 6 | name = 'mindhacks' 7 | allowed_domains = ['mindhacks.cn'] 8 | start_urls = ['http://www.mindhacks.cn/'] 9 | 10 | def parse(self, response): 11 | hxs = HtmlXPathSelector(response) 12 | sites = hxs.select('//h3/a/@href') 13 | 14 | for url in 
sites: 15 | yield self.make_requests_from_url(url.extract()).replace(callback=self.parse_post) 16 | 17 | # process next page 18 | page_links=hxs.select('//div[@class="wp-pagenavi"]/a[not(@title)]') 19 | for link in page_links: 20 | if link.select('text()').extract()[0] == u'\xbb': 21 | url = link.select('@href').extract()[0] 22 | yield self.make_requests_from_url(url) 23 | 24 | def parse_post(self, response): 25 | hxs = HtmlXPathSelector(response) 26 | title = hxs.select('//h1/a/text()').extract()[0] 27 | url = hxs.select('//h1/a/@href').extract()[0] 28 | created = hxs.select('//*[@class="published"]/@title').extract()[0] 29 | 30 | if len(title) >0 and len(url) > 0: 31 | item = NewsspiderItem() 32 | item['url'] = url 33 | item['title'] = title 34 | item['created'] = created 35 | item['site']= MindhacksSpider.name 36 | yield item 37 | -------------------------------------------------------------------------------- /newsspider/newsspider/spiders/reddit.py: -------------------------------------------------------------------------------- 1 | from scrapy.selector import HtmlXPathSelector 2 | #from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor 3 | from scrapy.contrib.spiders import CrawlSpider, Rule 4 | from newsspider.items import NewsspiderItem 5 | import time 6 | 7 | class RedditSpider(CrawlSpider): 8 | name = 'reddit' 9 | allowed_domains = ['reddit.com'] 10 | start_urls = ['http://www.reddit.com/r/programming/','http://www.reddit.com/'] 11 | 12 | def parse(self, response): 13 | hxs = HtmlXPathSelector(response) 14 | 15 | # deal with next page 16 | nextlink = hxs.select('//p[@class="nextprev"]//a').select('@href').extract()[0] 17 | if len(nextlink)>0: 18 | yield self.make_requests_from_url(nextlink) 19 | 20 | links = hxs.select('//*[@id="siteTable"]//div//p[1]/a') 21 | for link in links: 22 | url = link.select('@href').extract()[0] 23 | title = link.select('text()').extract()[0] 24 | if len(url)>0 and len(title)>0: 25 | item = NewsspiderItem() 26 | item['title']=title 27 | item['url']=url 28 | item['site']=RedditSpider.name 29 | yield item 30 | 31 | 32 | -------------------------------------------------------------------------------- /newsspider/proxies.txt: -------------------------------------------------------------------------------- 1 | 5.199.132.164 443 HTTP 2 | 64.208.21.16 80 HTTP 3 | 110.153.9.250 80 HTTP 4 | 110.139.206.93 8080 HTTP 5 | 72.247.48.10 80 HTTP 6 | 180.250.130.186 80 HTTP 7 | 114.80.149.183 80 HTTP 8 | 123.108.14.39 8080 HTTP 9 | 211.167.112.14 80 HTTP 10 | 89.218.100.178 9090 HTTP 11 | 202.149.78.234 8080 HTTP 12 | 101.255.33.250 80 HTTP 13 | 80.79.179.10 8181 HTTP 14 | 180.250.165.197 8080 HTTP 15 | 119.110.71.109 8080 HTTP 16 | 123.129.242.131 8081 HTTP 17 | 211.167.112.14 82 HTTP 18 | 89.218.100.218 9090 HTTP 19 | 202.201.1.119 8001 HTTP 20 | 80.90.12.36 8080 HTTP 21 | 186.47.84.139 8080 HTTP 22 | 125.141.206.36 8080 HTTP 23 | 211.239.84.130 443 HTTP 24 | 89.218.120.114 9090 HTTP 25 | 203.92.47.202 8082 HTTP 26 | 114.247.21.244 3131 HTTP 27 | 190.92.87.98 8080 HTTP 28 | 87.110.149.88 8080 HTTP 29 | 186.208.71.70 8080 HTTP 30 | 119.235.21.11 80 HTTP 31 | 148.236.5.91 8080 HTTP 32 | 218.28.112.114 809 HTTP 33 | 119.252.168.34 8080 HTTP 34 | 202.52.244.110 8080 HTTP 35 | 218.108.85.59 82 HTTP 36 | 14.31.11.70 9009 HTTP 37 | 186.225.212.245 8080 HTTP 38 | 120.85.132.234 80 HTTP 39 | 175.25.243.27 80 HTTP 40 | 218.61.8.124 88 HTTP 41 | 89.251.103.130 8080 HTTP 42 | 195.202.159.123 8080 HTTP 43 | 212.233.147.48 8080 HTTP 44 | 218.206.204.254 80 HTTP 45 | 
14.31.11.78 9009 HTTP 46 | 110.4.12.170 83 HTTP 47 | 186.226.98.254 8080 HTTP 48 | 124.227.191.68 9000 HTTP 49 | 190.29.22.247 8080 HTTP 50 | 218.89.165.131 6060 HTTP 51 | 91.218.84.195 80 HTTP 52 | 210.14.143.53 7020 HTTP 53 | 178.18.17.208 8080 HTTP 54 | 202.43.65.130 8080 HTTP 55 | 37.229.231.253 8080 HTTP 56 | 218.206.204.254 443 HTTP 57 | 58.210.247.18 1337 HTTP 58 | 112.213.118.48 80 HTTP 59 | 187.53.150.62 8080 HTTP 60 | 125.210.188.35 80 HTTP 61 | 195.69.191.203 80 HTTP 62 | 218.102.39.154 8080 HTTP 63 | 91.228.53.28 8080 HTTP 64 | 210.14.143.122 80 HTTP 65 | 202.43.188.5 8080 HTTP 66 | 202.46.146.22 8080 HTTP 67 | 92.126.217.47 80 HTTP 68 | 218.249.83.87 8080 HTTP 69 | 58.211.114.107 443 HTTP 70 | 113.160.50.51 80 HTTP 71 | 178.18.17.250 8080 HTTP 72 | 219.153.5.3 8181 HTTP 73 | 93.186.97.236 8080 HTTP 74 | 210.177.139.89 8080 HTTP 75 | 222.168.65.130 80 HTTP 76 | 103.28.227.78 8080 HTTP 77 | 219.150.254.158 8080 HTTP 78 | 58.221.129.158 1337 HTTP 79 | 119.82.253.88 8080 HTTP 80 | 187.85.89.167 8080 HTTP 81 | 182.93.206.92 8080 HTTP 82 | 202.106.179.141 10160 HTTP 83 | 219.223.252.150 56142 HTTP 84 | 95.77.97.146 8080 HTTP 85 | 211.99.28.21 808 HTTP 86 | 211.142.236.137 8080 HTTP 87 | 41.75.201.146 8080 HTTP 88 | 109.207.63.89 8090 HTTP 89 | 219.159.105.180 8080 HTTP 90 | 59.34.57.88 8080 HTTP 91 | 187.110.169.186 8080 HTTP 92 | 190.40.80.144 8080 HTTP 93 | 210.212.98.228 80 HTTP 94 | 220.246.4.74 8080 HTTP 95 | 211.142.236.133 8080 HTTP 96 | 218.22.71.122 8080 HTTP 97 | 58.210.212.107 80 HTTP 98 | 112.25.15.18 9098 HTTP 99 | 59.37.168.16 8081 HTTP 100 | 185.8.2.50 8080 HTTP 101 | 190.102.17.121 80 HTTP 102 | 218.106.99.22 888 HTTP 103 | 221.3.153.74 80 HTTP 104 | 106.3.98.79 80 HTTP 105 | 211.142.236.137 80 HTTP 106 | 2.133.93.170 9090 HTTP 107 | 78.9.164.162 8080 HTTP 108 | 119.6.73.235 80 HTTP 109 | 221.130.17.48 80 HTTP 110 | 59.57.15.71 80 HTTP 111 | 186.5.65.164 8080 HTTP 112 | 190.0.17.202 8080 HTTP 113 | 197.254.11.30 8080 HTTP 114 | 5.8.242.12 8080 HTTP 115 | 221.130.18.218 80 HTTP 116 | 110.74.220.50 8080 HTTP 117 | 211.144.72.153 80 HTTP 118 | 2.135.237.154 9090 HTTP 119 | 82.200.253.202 9090 HTTP 120 | 119.110.69.70 80 HTTP 121 | 221.130.17.139 80 HTTP 122 | 59.172.208.189 8080 HTTP 123 | 190.14.255.169 8080 HTTP 124 | 190.0.46.66 8080 HTTP 125 | 200.93.115.248 8080 HTTP 126 | 5.10.224.62 80 HTTP 127 | 221.130.18.253 80 HTTP 128 | 110.93.211.11 80 HTTP 129 | 82.209.195.5 8080 HTTP 130 | 125.39.68.195 80 HTTP 131 | 221.130.23.4 80 HTTP 132 | 59.172.208.190 8080 HTTP 133 | 198.15.119.111 8080 HTTP 134 | 190.0.61.194 8080 HTTP 135 | 200.107.32.127 8080 HTTP 136 | 5.10.224.62 8080 HTTP 137 | 221.130.23.4 80 HTTP 138 | 110.93.211.11 8080 HTTP 139 | 218.206.204.254 80 HTTP 140 | 2.135.238.26 9090 HTTP 141 | 180.250.192.222 8080 HTTP 142 | 221.130.23.5 80 HTTP 143 | 200.137.133.171 80 HTTP 144 | 31.170.178.2 8080 HTTP 145 | 221.130.23.6 80 HTTP 146 | 112.5.254.30 80 HTTP 147 | 218.206.204.254 443 HTTP 148 | 2.135.238.108 9090 HTTP 149 | 88.249.127.222 8080 HTTP 150 | 186.46.122.250 8080 HTTP 151 | 221.130.23.6 80 HTTP 152 | 61.166.55.153 11808 HTTP 153 | 201.218.63.4 8080 HTTP 154 | 190.211.97.71 8080 HTTP 155 | 200.213.4.4 8080 HTTP 156 | 41.78.26.45 8080 HTTP 157 | 221.130.23.29 80 HTTP 158 | 112.175.248.22 8080 HTTP 159 | 2.135.242.42 9090 HTTP 160 | 110.138.160.170 8080 HTTP 161 | 187.85.225.185 80 HTTP 162 | 221.130.23.8 80 HTTP 163 | 106.3.98.82 80 HTTP 164 | 201.63.184.5 8080 HTTP 165 | 46.249.66.50 8080 HTTP 166 | 222.169.11.34 8080 HTTP 167 | 114.113.221.72 54321 
HTTP 168 | 221.10.40.232 80 HTTP 169 | 2.135.243.42 9090 HTTP 170 | 110.138.163.58 8080 HTTP 171 | 221.130.23.78 80 HTTP 172 | 106.3.98.82 82 HTTP 173 | 202.171.253.98 80 HTTP 174 | 202.108.77.153 80 HTTP 175 | 77.236.209.236 8080 HTTP 176 | 116.68.171.70 8080 HTTP 177 | 221.10.40.232 82 HTTP 178 | 27.50.11.165 80 HTTP 179 | 118.96.137.140 8080 HTTP 180 | 5.10.224.58 80 HTTP 181 | 106.3.98.82 83 HTTP 182 | 202.171.253.103 80 HTTP 183 | 198.154.114.100 8080 HTTP 184 | 202.162.198.178 8080 HTTP 185 | 91.202.164.185 8080 HTTP 186 | 223.4.205.37 808 HTTP 187 | 117.34.72.51 808 HTTP 188 | 221.10.40.232 83 HTTP 189 | 36.73.40.189 8080 HTTP 190 | 200.208.251.218 8080 HTTP 191 | 72.64.146.136 43 HTTP 192 | 221.130.23.80 80 HTTP 193 | 112.5.254.19 80 HTTP 194 | 202.171.253.103 85 HTTP 195 | 200.27.114.228 8080 HTTP 196 | 202.182.49.41 8080 HTTP 197 | 103.10.22.226 8080 HTTP 198 | 58.252.56.148 8080 HTTP 199 | 118.97.206.28 8080 HTTP 200 | 221.130.18.76 80 HTTP 201 | 58.67.147.204 8080 HTTP 202 | 201.64.247.3 8080 HTTP 203 | 81.169.154.244 8080 HTTP 204 | 221.130.23.81 80 HTTP 205 | 112.5.254.20 80 HTTP 206 | 202.171.253.108 80 HTTP 207 | 200.27.114.233 8080 HTTP 208 | 110.138.208.50 8080 HTTP 209 | 122.72.15.231 80 HTTP 210 | 118.97.212.162 8080 HTTP 211 | 221.130.199.19 80 HTTP 212 | 77.89.233.54 8080 HTTP 213 | 202.46.85.107 8080 HTTP 214 | 125.69.132.100 8080 HTTP 215 | 221.130.23.82 80 HTTP 216 | 117.41.182.188 8080 HTTP 217 | 202.171.253.108 83 HTTP 218 | 200.54.92.187 80 HTTP 219 | 37.77.50.133 80 HTTP 220 | 111.161.30.228 80 HTTP 221 | 124.81.208.34 8080 HTTP 222 | 119.4.250.105 80 HTTP 223 | 221.130.199.98 80 HTTP 224 | 85.172.4.154 80 HTTP 225 | 202.93.136.98 8080 HTTP 226 | 221.130.23.91 80 HTTP 227 | 118.145.0.76 10086 HTTP 228 | 203.124.12.71 8080 HTTP 229 | 200.61.31.69 8080 HTTP 230 | 61.55.141.11 80 HTTP 231 | 114.113.221.77 54321 HTTP 232 | 180.243.92.86 8080 HTTP 233 | 119.7.221.135 81 HTTP 234 | 221.178.174.171 888 HTTP 235 | 87.236.233.92 8080 HTTP 236 | 203.91.43.43 9988 HTTP 237 | 190.3.108.211 8080 HTTP 238 | 221.181.192.91 80 HTTP 239 | 206.130.99.82 8080 HTTP 240 | 200.71.86.50 8080 HTTP 241 | 61.135.223.4 7000 HTTP 242 | 119.110.69.70 8080 HTTP 243 | 202.74.241.196 8080 HTTP 244 | 119.7.221.135 82 HTTP 245 | 222.124.35.117 8080 HTTP 246 | 110.139.60.228 8080 HTTP 247 | 190.72.150.144 8080 HTTP 248 | 221.215.155.38 8090 HTTP 249 | 120.203.214.162 80 HTTP 250 | 211.232.93.13 808 HTTP 251 | 200.75.51.151 8080 HTTP 252 | 78.133.155.54 8080 HTTP 253 | 180.87.197.91 8080 HTTP 254 | 1.63.18.22 8080 HTTP 255 | 119.7.221.137 82 HTTP 256 | 222.187.222.118 8080 HTTP 257 | 111.13.87.150 80 HTTP 258 | 219.76.104.17 8080 HTTP 259 | 200.208.251.220 8080 HTTP 260 | 222.89.55.123 8080 HTTP 261 | 120.203.214.176 80 HTTP 262 | 218.102.39.153 8080 HTTP 263 | 200.109.228.67 8080 HTTP 264 | 109.207.61.189 8090 HTTP 265 | 180.249.119.252 8080 HTTP 266 | 2.133.92.106 9090 HTTP 267 | 119.59.193.175 8080 HTTP 268 | 62.84.67.170 8080 HTTP 269 | 111.161.30.236 80 HTTP 270 | 49.212.167.222 80 HTTP 271 | 58.20.230.131 8080 HTTP 272 | 222.217.99.72 9000 HTTP 273 | 120.203.214.187 80 HTTP 274 | 200.195.176.77 8080 HTTP 275 | 113.53.254.124 8080 HTTP 276 | 2.133.92.122 9090 HTTP 277 | 119.145.2.18 80 HTTP 278 | 171.101.144.18 8080 HTTP 279 | 112.5.254.172 80 HTTP 280 | 103.247.16.241 8080 HTTP 281 | 61.235.69.243 8080 HTTP 282 | 222.217.99.177 9000 HTTP 283 | 2.135.243.84 9090 HTTP 284 | 200.202.240.174 80 HTTP 285 | 123.235.12.118 8080 HTTP 286 | 200.169.162.132 80 HTTP 287 | 2.133.92.157 80 HTTP 288 | 
119.233.255.51 80 HTTP 289 | 213.131.41.6 8080 HTTP 290 | 178.48.2.237 8080 HTTP 291 | 109.236.220.98 8080 HTTP 292 | 222.240.224.131 80 HTTP 293 | 123.134.95.142 80 HTTP 294 | 5.135.242.225 8080 HTTP 295 | 200.204.161.246 8080 HTTP 296 | 218.22.71.124 8080 HTTP 297 | 2.133.92.158 80 HTTP 298 | 119.233.255.60 80 HTTP 299 | 1.234.45.130 80 HTTP 300 | 180.247.120.217 8080 HTTP 301 | 116.255.234.73 3288 HTTP 302 | 61.177.248.202 1080 SOCKS4 303 | 201.56.208.233 8080 HTTP 304 | 177.83.122.189 8080 HTTP 305 | 218.22.71.126 8080 HTTP 306 | 2.133.93.82 9090 HTTP 307 | 119.235.21.10 8080 HTTP 308 | 27.116.21.163 8080 HTTP 309 | 124.207.170.230 8080 HTTP 310 | 121.204.0.2 80 HTTP 311 | 183.60.44.136 88 HTTP 312 | 49.0.96.1 8000 HTTP 313 | 201.86.70.162 80 HTTP 314 | 177.182.252.197 8080 HTTP 315 | 221.210.40.150 8080 HTTP 316 | 119.252.172.131 80 HTTP 317 | 49.0.110.1 8000 HTTP 318 | 124.240.187.81 82 HTTP 319 | 200.54.78.66 8080 HTTP 320 | 125.165.51.4 8080 HTTP 321 | 183.61.246.78 80 HTTP 322 | 62.201.207.14 8080 HTTP 323 | 201.249.192.74 8080 HTTP 324 | 190.116.87.4 8080 HTTP 325 | 2.135.238.92 9090 HTTP 326 | 120.194.100.46 8001 HTTP 327 | 58.215.88.12 80 HTTP 328 | 164.77.196.78 80 HTTP 329 | 202.154.225.229 8080 HTTP 330 | 186.5.102.162 8080 HTTP 331 | 114.80.136.112 7780 HTTP 332 | 183.129.249.82 80 HTTP 333 | 62.201.210.190 8080 HTTP 334 | 202.152.22.38 8080 HTTP 335 | 31.135.196.229 8080 HTTP 336 | 41.216.171.154 8080 HTTP 337 | 59.49.79.121 9527 HTTP 338 | 177.11.17.46 8080 HTTP 339 | 71.189.47.2 8081 HTTP 340 | 190.78.2.84 8080 HTTP 341 | 115.100.60.198 8000 HTTP 342 | 183.129.249.83 80 HTTP 343 | 63.141.216.176 80 HTTP 344 | 203.172.245.34 8080 HTTP 345 | 195.140.190.146 8080 HTTP 346 | 81.201.61.138 8080 HTTP 347 | 58.53.192.218 8123 HTTP 348 | 121.12.118.241 999 HTTP 349 | 59.57.15.71 80 HTTP 350 | 180.242.88.43 5311 HTTP 351 | 202.29.211.122 8080 HTTP 352 | 115.236.19.48 8080 HTTP 353 | 211.100.47.131 8990 HTTP 354 | 66.35.68.146 8080 HTTP 355 | 212.175.88.3 8080 HTTP 356 | 197.251.194.164 8080 HTTP 357 | 89.171.46.225 8080 HTTP 358 | 59.59.51.74 8001 HTTP 359 | 122.11.38.182 9090 HTTP 360 | 59.172.208.186 8080 HTTP 361 | 183.110.231.124 80 HTTP 362 | 202.202.1.189 80 HTTP 363 | 116.112.66.102 808 HTTP 364 | 211.100.47.244 8990 HTTP 365 | 74.221.211.117 8080 HTTP 366 | 213.110.196.11 80 HTTP 367 | 202.108.50.72 80 HTTP 368 | 94.137.239.19 81 HTTP 369 | 60.165.173.36 8003 HTTP 370 | 122.72.0.6 80 HTTP 371 | 61.156.217.166 8000 HTTP 372 | 187.20.25.42 8080 HTTP 373 | 203.93.104.20 80 HTTP 374 | 119.97.146.152 80 HTTP 375 | 211.100.52.42 8990 HTTP 376 | 77.65.22.245 8080 HTTP 377 | 217.117.14.247 80 HTTP 378 | 202.145.3.130 8080 HTTP 379 | 110.139.58.31 8080 HTTP 380 | 60.191.142.233 8360 HTTP 381 | 122.144.1.213 9999 HTTP 382 | 78.188.3.171 8080 HTTP 383 | 190.29.30.114 8080 HTTP 384 | 119.252.168.34 80 HTTP 385 | 120.194.100.42 8001 HTTP 386 | 211.142.236.133 80 HTTP 387 | 77.78.104.129 8080 HTTP 388 | 218.100.84.123 8080 HTTP 389 | 202.146.237.79 808 HTTP 390 | 114.113.221.70 54321 HTTP 391 | 61.136.93.38 8080 HTTP 392 | 122.252.181.20 8080 HTTP 393 | 78.188.47.21 8080 HTTP 394 | 190.128.170.18 8080 HTTP 395 | 178.233.149.172 8080 HTTP 396 | 120.203.214.182 80 HTTP 397 | 219.83.100.195 8080 HTTP 398 | 208.163.36.221 8080 HTTP 399 | 61.152.108.187 80 HTTP 400 | 123.30.174.61 8080 HTTP 401 | 83.17.80.124 8080 HTTP 402 | 200.60.11.20 8080 HTTP 403 | 177.70.17.154 8080 HTTP 404 | 187.5.122.231 8080 HTTP 405 | 122.72.2.180 80 HTTP 406 | 211.142.236.137 80 HTTP 407 | 77.238.209.194 8080 
HTTP 408 | 222.124.19.210 8080 HTTP 409 | 221.179.173.170 8080 HTTP 410 | 118.97.58.166 8080 HTTP 411 | 61.155.140.154 55808 HTTP 412 | 92.39.54.161 80 HTTP 413 | 200.137.133.169 80 HTTP 414 | 177.85.233.190 8080 HTTP 415 | 122.72.120.63 80 HTTP 416 | 211.142.236.137 8080 HTTP 417 | 78.159.235.3 8080 HTTP 418 | 222.124.147.105 8080 HTTP 419 | 36.73.42.103 8080 HTTP 420 | 101.255.33.254 80 HTTP 421 | 201.41.66.212 8080 HTTP 422 | 178.219.103.205 8080 HTTP 423 | 200.24.17.46 80 HTTP 424 | 122.72.124.2 80 HTTP 425 | 218.22.71.125 8080 HTTP 426 | 80.90.27.60 8080 HTTP 427 | 222.124.207.29 8080 HTTP 428 | 60.166.13.182 80 HTTP 429 | 122.224.5.210 443 HTTP 430 | 85.207.17.146 8080 HTTP 431 | 123.164.148.134 80 HTTP 432 | 103.28.113.134 8080 HTTP 433 | 202.29.60.220 8080 HTTP 434 | 180.250.130.186 8080 HTTP 435 | 202.181.176.3 80 HTTP 436 | 122.225.22.22 8080 HTTP 437 | 218.22.71.210 8080 HTTP 438 | 222.124.218.164 8080 HTTP 439 | 60.216.7.28 3079 HTTP 440 | 89.218.100.90 9090 HTTP 441 | 123.164.148.134 82 HTTP 442 | 103.247.37.86 8080 HTTP 443 | 202.152.40.202 8080 HTTP 444 | 195.191.250.229 80 HTTP 445 | 213.24.60.52 8080 HTTP 446 | 202.97.159.227 8080 HTTP 447 | 218.104.193.102 80 HTTP 448 | 81.213.157.71 80 HTTP 449 | 223.25.195.68 8080 HTTP 450 | 78.38.80.142 8080 HTTP 451 | 186.192.17.138 8080 HTTP 452 | 89.237.134.10 8080 HTTP 453 | 124.81.113.183 8080 HTTP 454 | 109.74.236.165 8080 HTTP 455 | 217.29.117.162 8080 HTTP 456 | 203.110.169.76 9128 HTTP 457 | 218.201.21.175 80 HTTP 458 | 82.200.236.58 9090 HTTP 459 | 72.64.146.136 8080 HTTP 460 | 81.90.224.209 8080 HTTP 461 | 189.29.118.245 8080 HTTP 462 | 103.23.139.97 8080 HTTP 463 | 125.39.238.242 8080 HTTP 464 | 109.224.5.194 80 HTTP 465 | 60.214.67.86 8080 HTTP 466 | 203.110.169.83 9128 HTTP 467 | 218.201.21.176 80 HTTP 468 | 110.74.222.117 8080 HTTP 469 | 95.129.199.70 8080 HTTP 470 | 190.79.44.28 8080 HTTP 471 | 211.151.171.207 80 HTTP 472 | 218.108.242.100 48814 HTTP 473 | 93.189.28.106 8080 HTTP 474 | 211.144.76.58 9000 HTTP 475 | 218.201.21.177 80 HTTP 476 | 82.200.254.114 9090 HTTP 477 | 122.72.76.122 80 HTTP 478 | 103.5.49.37 8080 HTTP 479 | 201.12.116.18 8080 HTTP 480 | 109.207.61.182 8090 HTTP 481 | 150.165.75.129 8080 HTTP 482 | 111.161.30.237 80 HTTP 483 | 212.76.180.50 8080 HTTP 484 | 72.64.146.135 8080 HTTP 485 | 110.139.151.124 8080 HTTP 486 | 211.154.151.218 88 HTTP 487 | 218.201.21.178 80 HTTP 488 | 82.200.254.146 9090 HTTP 489 | 122.72.76.130 80 HTTP 490 | 109.207.61.167 8090 HTTP 491 | 202.43.188.9 8080 HTTP 492 | 110.136.245.31 8080 HTTP 493 | 151.236.194.2 8080 HTTP 494 | 113.108.92.104 80 HTTP 495 | 218.56.161.14 8118 HTTP 496 | 116.77.35.118 80 HTTP 497 | 211.167.112.14 80 HTTP 498 | 218.204.39.164 80 HTTP 499 | 89.188.224.70 8080 HTTP 500 | 190.121.154.246 8080 HTTP 501 | 124.240.187.79 81 HTTP 502 | 202.43.188.15 8080 HTTP 503 | 112.175.18.180 80 HTTP 504 | 164.77.196.75 80 HTTP 505 | 114.32.95.96 8080 HTTP 506 | 219.76.104.1 80 HTTP 507 | 111.161.30.233 80 HTTP 508 | 211.167.112.14 82 HTTP 509 | 221.130.17.37 80 HTTP 510 | 89.218.68.13 80 HTTP 511 | 200.55.206.210 8080 HTTP 512 | 124.240.187.79 82 HTTP 513 | 113.142.8.205 8080 HTTP 514 | 177.125.167.253 8080 HTTP 515 | 219.76.104.1 8080 HTTP 516 | 118.97.255.107 8080 HTTP 517 | 211.167.112.15 80 HTTP 518 | 221.130.18.45 80 HTTP 519 | 89.218.68.34 9090 HTTP 520 | 219.154.46.138 8080 HTTP 521 | 186.16.203.50 8080 HTTP 522 | 202.102.48.205 8080 HTTP 523 | 113.195.134.231 8080 HTTP 524 | 178.169.97.35 54321 HTTP 525 | 116.112.66.102 808 HTTP 526 | 219.231.164.40 45238 
HTTP 527 | 176.33.138.156 8080 HTTP 528 | 211.167.112.15 82 HTTP 529 | 221.130.18.52 80 HTTP 530 | 89.218.68.130 9090 HTTP 531 | 197.251.194.126 8080 HTTP 532 | 203.93.28.166 8080 HTTP 533 | 183.110.231.240 80 HTTP 534 | 118.96.66.107 80 HTTP 535 | 219.242.50.50 8080 HTTP 536 | 211.167.112.16 80 HTTP 537 | 221.130.18.189 80 HTTP 538 | 89.218.68.132 80 HTTP 539 | 2.133.92.18 9090 HTTP 540 | 202.102.58.208 80 HTTP 541 | 218.108.242.105 41884 HTTP 542 | 116.226.46.19 8080 HTTP 543 | 183.221.250.137 80 HTTP 544 | 118.97.91.129 8080 HTTP 545 | 221.7.145.42 8080 HTTP 546 | 211.167.112.17 80 HTTP 547 | 221.130.18.253 80 HTTP 548 | 89.218.100.210 9090 HTTP 549 | 2.133.92.26 9090 HTTP 550 | 202.102.58.209 80 HTTP 551 | 27.116.21.162 8080 HTTP 552 | 118.26.231.104 5060 HTTP 553 | 183.221.250.141 80 HTTP 554 | 222.74.98.234 8080 HTTP 555 | 222.89.154.6 9000 HTTP 556 | 202.28.110.17 8080 HTTP 557 | 211.167.112.17 82 HTTP 558 | 221.176.14.72 80 HTTP 559 | 89.218.100.250 9090 HTTP 560 | 2.133.92.162 9090 HTTP 561 | 118.194.164.90 54321 HTTP 562 | 186.101.41.25 80 HTTP 563 | 118.195.65.243 80 HTTP 564 | 5.8.242.10 8080 HTTP 565 | 202.107.195.231 80 HTTP 566 | 218.28.254.77 880 HTTP 567 | 221.176.169.194 8001 HTTP 568 | 89.218.101.74 9090 HTTP 569 | 46.249.66.50 80 HTTP 570 | 119.7.221.137 81 HTTP 571 | 119.167.231.183 80 HTTP 572 | 41.78.25.69 8080 HTTP 573 | 210.212.152.5 80 HTTP 574 | 221.6.15.156 82 HTTP 575 | 221.181.192.25 80 HTTP 576 | 2.133.93.154 9090 HTTP 577 | 210.101.131.232 8080 HTTP 578 | 93.90.235.178 8080 HTTP 579 | 119.7.221.137 83 HTTP 580 | 186.101.65.115 80 HTTP 581 | 50.22.206.179 8080 HTTP 582 | 221.6.15.157 82 HTTP 583 | 93.94.180.15 8080 HTTP 584 | 2.133.94.42 9090 HTTP 585 | 218.94.1.166 82 HTTP 586 | 110.139.206.93 80 HTTP 587 | 119.62.128.172 80 HTTP 588 | 61.8.72.99 8080 HTTP 589 | 141.105.87.77 80 HTTP 590 | 41.215.3.98 80 HTTP 591 | 222.88.94.245 80 HTTP 592 | 101.255.36.30 808 HTTP 593 | 218.249.114.42 8088 HTTP 594 | 111.161.30.227 80 HTTP 595 | 121.14.9.76 80 HTTP 596 | 187.4.63.148 80 HTTP 597 | 68.71.76.242 8082 HTTP 598 | 197.251.194.121 8080 HTTP 599 | 119.187.148.102 8000 HTTP 600 | 222.92.141.155 8090 HTTP 601 | 101.255.60.158 8080 HTTP 602 | 2.135.237.194 9090 HTTP 603 | 111.161.30.232 80 HTTP 604 | 121.17.144.132 8080 HTTP 605 | 187.4.63.149 80 HTTP 606 | 121.52.144.245 80 HTTP 607 | 74.221.215.254 8080 HTTP 608 | 122.115.62.108 8081 HTTP 609 | 221.130.23.29 80 HTTP 610 | 222.187.222.118 8080 HTTP 611 | 103.11.99.162 8080 HTTP 612 | 2.135.237.250 9090 HTTP 613 | 2.133.94.26 9090 HTTP 614 | 112.125.120.145 10080 HTTP 615 | 122.4.78.26 34808 HTTP 616 | 122.72.0.145 80 HTTP 617 | 78.80.36.194 8080 HTTP 618 | 219.76.104.18 8080 HTTP 619 | 221.179.173.170 8080 HTTP 620 | 222.188.10.1 1080 SOCKS4 621 | 103.246.145.184 8080 HTTP 622 | 2.135.242.162 9090 HTTP 623 | 2.135.238.146 9090 HTTP 624 | 122.72.0.28 80 HTTP 625 | 187.33.208.250 8080 HTTP 626 | 122.72.2.180 80 HTTP 627 | 78.131.55.82 8080 HTTP 628 | 58.67.147.196 8080 HTTP 629 | 202.116.1.149 8128 HTTP 630 | 221.195.42.195 8080 HTTP 631 | 222.217.99.156 9000 HTTP 632 | 2.135.242.170 9090 HTTP 633 | 24.158.199.54 8082 HTTP 634 | 117.121.238.17 8080 HTTP 635 | 123.125.74.212 80 HTTP 636 | 187.115.65.187 80 HTTP 637 | 122.72.2.180 8080 HTTP 638 | 82.79.92.226 8080 HTTP 639 | 202.137.22.182 8080 HTTP 640 | 223.4.173.109 808 HTTP 641 | 2.135.242.186 9090 HTTP 642 | 118.96.192.84 8080 HTTP 643 | 124.240.187.79 80 HTTP 644 | 91.202.164.29 8080 HTTP 645 | 103.10.22.231 8080 HTTP 646 | 27.54.218.248 80 HTTP 647 | 
222.74.98.234 8080 HTTP 648 | 27.50.132.145 88 HTTP 649 | 2.135.242.226 9090 HTTP 650 | 190.67.169.194 8080 HTTP 651 | 124.240.187.79 83 HTTP 652 | 123.164.148.132 80 HTTP 653 | 101.255.33.249 80 HTTP 654 | 69.29.105.153 8080 HTTP 655 | 58.67.147.205 8080 HTTP 656 | 61.167.49.188 8080 HTTP 657 | 124.240.187.80 80 HTTP 658 | 190.66.22.53 8080 HTTP 659 | 109.207.61.170 8090 HTTP 660 | 180.248.156.56 8080 HTTP 661 | 114.113.221.69 54321 HTTP 662 | 124.95.142.94 80 HTTP 663 | 58.248.254.38 80 HTTP 664 | -------------------------------------------------------------------------------- /newsspider/query_db.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | #coding: utf-8 3 | 4 | def query(): 5 | import MySQLdb 6 | conn = MySQLdb.connect(host='localhost', user='root',passwd='feisky', 7 | db='news' , charset="utf8") 8 | try: 9 | cursor = conn.cursor() 10 | cursor.execute("select * from news order by created desc") 11 | data = cursor.fetchall() 12 | finally: 13 | conn.close() 14 | 15 | for d in data: 16 | print '%-60s %-30s' % (d[1],d[2]) 17 | 18 | if __name__ == '__main__': 19 | query() 20 | -------------------------------------------------------------------------------- /newsspider/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # http://doc.scrapy.org/en/latest/topics/scrapyd.html 5 | 6 | [settings] 7 | default = newsspider.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = newsspider 12 | -------------------------------------------------------------------------------- /newsspider/webkit2png: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # 3 | # webkit2png.py 4 | # 5 | # Creates screenshots of webpages using by QtWebkit. 6 | # 7 | # Copyright (c) 2008 Roland Tapken 8 | # 9 | # This program is free software; you can redistribute it and/or 10 | # modify it under the terms of the GNU General Public License 11 | # as published by the Free Software Foundation; either version 2 12 | # of the License, or (at your option) any later version. 13 | # 14 | # This program is distributed in the hope that it will be useful, 15 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 16 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 17 | # GNU General Public License for more details. 18 | # 19 | # You should have received a copy of the GNU General Public License 20 | # along with this program; if not, write to the Free Software 21 | # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA 22 | # 23 | # Nice ideas "todo": 24 | # - Add QTcpSocket support to create a "screenshot daemon" that 25 | # can handle multiple requests at the same time. 
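#
# Example invocation (the URL is a placeholder; the flags mirror the exact
# command NewsspiderPipeline shells out to when grabbing page previews):
#
#   /usr/bin/webkit2png -x 1366 768 -F javascript "http://example.com/" -o preview.png
#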
26 | 27 | from webkit2png import WebkitRenderer 28 | 29 | import sys 30 | import signal 31 | import os 32 | import urlparse 33 | import logging 34 | from optparse import OptionParser 35 | 36 | from PyQt4.QtCore import * 37 | from PyQt4.QtGui import * 38 | from PyQt4.QtWebKit import * 39 | from PyQt4.QtNetwork import * 40 | 41 | VERSION="20091224" 42 | LOG_FILENAME = 'webkit2png.log' 43 | logger = logging.getLogger('webkit2png'); 44 | 45 | def init_qtgui(display=None, style=None, qtargs=None): 46 | """Initiates the QApplication environment using the given args.""" 47 | if QApplication.instance(): 48 | logger.debug("QApplication has already been instantiated. \ 49 | Ignoring given arguments and returning existing QApplication.") 50 | return QApplication.instance() 51 | 52 | qtargs2 = [sys.argv[0]] 53 | 54 | if display: 55 | qtargs2.append('-display') 56 | qtargs2.append(display) 57 | # Also export DISPLAY var as this may be used 58 | # by flash plugin 59 | os.environ["DISPLAY"] = display 60 | 61 | if style: 62 | qtargs2.append('-style') 63 | qtargs2.append(style) 64 | 65 | qtargs2.extend(qtargs or []) 66 | 67 | return QApplication(qtargs2) 68 | 69 | 70 | if __name__ == '__main__': 71 | # This code will be executed if this module is run 'as-is'. 72 | 73 | # Enable HTTP proxy 74 | if 'http_proxy' in os.environ: 75 | proxy_url = urlparse.urlparse(os.environ.get('http_proxy')) 76 | proxy = QNetworkProxy(QNetworkProxy.HttpProxy, proxy_url.hostname, proxy_url.port) 77 | QNetworkProxy.setApplicationProxy(proxy) 78 | 79 | # Parse command line arguments. 80 | # Syntax: 81 | # $0 [--xvfb|--display=DISPLAY] [--debug] [--output=FILENAME] 82 | 83 | description = "Creates a screenshot of a website using QtWebkit." \ 84 | + "This program comes with ABSOLUTELY NO WARRANTY. " \ 85 | + "This is free software, and you are welcome to redistribute " \ 86 | + "it under the terms of the GNU General Public License v2." 
87 | 88 | parser = OptionParser(usage="usage: %prog [options] ", 89 | version="%prog " + VERSION + ", Copyright (c) Roland Tapken", 90 | description=description, add_help_option=True) 91 | parser.add_option("-x", "--xvfb", nargs=2, type="int", dest="xvfb", 92 | help="Start an 'xvfb' instance with the given desktop size.", metavar="WIDTH HEIGHT") 93 | parser.add_option("-g", "--geometry", dest="geometry", nargs=2, default=(0, 0), type="int", 94 | help="Geometry of the virtual browser window (0 means 'autodetect') [default: %default].", metavar="WIDTH HEIGHT") 95 | parser.add_option("-o", "--output", dest="output", 96 | help="Write output to FILE instead of STDOUT.", metavar="FILE") 97 | parser.add_option("-f", "--format", dest="format", default="png", 98 | help="Output image format [default: %default]", metavar="FORMAT") 99 | parser.add_option("--scale", dest="scale", nargs=2, type="int", 100 | help="Scale the image to this size", metavar="WIDTH HEIGHT") 101 | parser.add_option("--aspect-ratio", dest="ratio", type="choice", choices=["ignore", "keep", "expand", "crop"], 102 | help="One of 'ignore', 'keep', 'crop' or 'expand' [default: %default]") 103 | parser.add_option("-F", "--feature", dest="features", action="append", type="choice", 104 | choices=["javascript", "plugins"], 105 | help="Enable additional Webkit features ('javascript', 'plugins')", metavar="FEATURE") 106 | parser.add_option("-w", "--wait", dest="wait", default=0, type="int", 107 | help="Time to wait after loading before the screenshot is taken [default: %default]", metavar="SECONDS") 108 | parser.add_option("-t", "--timeout", dest="timeout", default=0, type="int", 109 | help="Time before the request will be canceled [default: %default]", metavar="SECONDS") 110 | parser.add_option("-W", "--window", dest="window", action="store_true", 111 | help="Grab whole window instead of frame (may be required for plugins)", default=False) 112 | parser.add_option("-T", "--transparent", dest="transparent", action="store_true", 113 | help="Render output on a transparent background (Be sure to have a transparent background defined in the html)", default=False) 114 | parser.add_option("", "--style", dest="style", 115 | help="Change the Qt look and feel to STYLE (e.G. 
'windows').", metavar="STYLE") 116 | parser.add_option("", "--encoded-url", dest="encoded_url", action="store_true", 117 | help="Treat URL as url-encoded", metavar="ENCODED_URL", default=False) 118 | parser.add_option("-d", "--display", dest="display", 119 | help="Connect to X server at DISPLAY.", metavar="DISPLAY") 120 | parser.add_option("--debug", action="store_true", dest="debug", 121 | help="Show debugging information.", default=False) 122 | parser.add_option("--log", action="store", dest="logfile", default=LOG_FILENAME, 123 | help="Select the log output file",) 124 | 125 | # Parse command line arguments and validate them (as far as we can) 126 | (options,args) = parser.parse_args() 127 | if len(args) != 1: 128 | parser.error("incorrect number of arguments") 129 | if options.display and options.xvfb: 130 | parser.error("options -x and -d are mutually exclusive") 131 | options.url = args[0] 132 | 133 | logging.basicConfig(filename=options.logfile,level=logging.WARN,) 134 | 135 | # Enable output of debugging information 136 | if options.debug: 137 | logger.setLevel(logging.DEBUG) 138 | 139 | if options.xvfb: 140 | # Start 'xvfb' instance by replacing the current process 141 | server_num = int(os.getpid() + 1e6) 142 | newArgs = ["xvfb-run", "--auto-servernum", "--server-num", str(server_num), "--server-args=-screen 0, %dx%dx24" % options.xvfb, sys.argv[0]] 143 | skipArgs = 0 144 | for i in range(1, len(sys.argv)): 145 | if skipArgs > 0: 146 | skipArgs -= 1 147 | elif sys.argv[i] in ["-x", "--xvfb"]: 148 | skipArgs = 2 # following: width and height 149 | else: 150 | newArgs.append(sys.argv[i]) 151 | logger.debug("Executing %s" % " ".join(newArgs)) 152 | try: 153 | os.execvp(newArgs[0],newArgs[1:]) 154 | except OSError: 155 | logger.error("Unable to find '%s'" % newArgs[0]) 156 | print >> sys.stderr, "Error - Unable to find '%s' for -x/--xvfb option" % newArgs[0] 157 | sys.exit(1) 158 | 159 | # Prepare output ("1" means STDOUT) 160 | if options.output is None: 161 | options.output = sys.stdout 162 | else: 163 | options.output = open(options.output, "w") 164 | 165 | logger.debug("Version %s, Python %s, Qt %s", VERSION, sys.version, qVersion()); 166 | 167 | # Technically, this is a QtGui application, because QWebPage requires it 168 | # to be. But because we will have no user interaction, and rendering can 169 | # not start before 'app.exec_()' is called, we have to trigger our "main" 170 | # by a timer event. 171 | def __main_qt(): 172 | # Render the page. 
173 | # If this method times out or loading failed, a 174 | # RuntimeException is thrown 175 | try: 176 | # Initialize WebkitRenderer object 177 | renderer = WebkitRenderer() 178 | renderer.logger = logger 179 | renderer.width = options.geometry[0] 180 | renderer.height = options.geometry[1] 181 | renderer.timeout = options.timeout 182 | renderer.wait = options.wait 183 | renderer.format = options.format 184 | renderer.grabWholeWindow = options.window 185 | renderer.renderTransparentBackground = options.transparent 186 | renderer.encodedUrl = options.encoded_url 187 | 188 | if options.scale: 189 | renderer.scaleRatio = options.ratio 190 | renderer.scaleToWidth = options.scale[0] 191 | renderer.scaleToHeight = options.scale[1] 192 | 193 | if options.features: 194 | if "javascript" in options.features: 195 | renderer.qWebSettings[QWebSettings.JavascriptEnabled] = True 196 | if "plugins" in options.features: 197 | renderer.qWebSettings[QWebSettings.PluginsEnabled] = True 198 | 199 | renderer.render_to_file(url=options.url, file_object=options.output) 200 | options.output.close() 201 | QApplication.exit(0) 202 | except RuntimeError, e: 203 | logger.error("main: %s" % e) 204 | print >> sys.stderr, e 205 | QApplication.exit(1) 206 | 207 | # Initialize Qt-Application, but make this script 208 | # abortable via CTRL-C 209 | app = init_qtgui(display = options.display, style=options.style) 210 | signal.signal(signal.SIGINT, signal.SIG_DFL) 211 | 212 | QTimer.singleShot(0, __main_qt) 213 | sys.exit(app.exec_()) -------------------------------------------------------------------------------- /newsspider/webkit2png.log: -------------------------------------------------------------------------------- 1 | WARNING:webkit2png:Failed to load http://www.nytimes.com/2013/03/10/opinion/sunday/living-with-less-a-lot-less.html?pagewanted=2 2 | WARNING:webkit2png:Failed to load http://aws.amazon.com/about-aws/whats-new/2013/03/11/announcing-aws-elastic-beanstalk-for-node-js/ 3 | WARNING:webkit2png:SSL: The host name did not match any of the valid hosts for this certificate 4 | WARNING:webkit2png:SSL: The host name did not match any of the valid hosts for this certificate 5 | WARNING:webkit2png:SSL: The host name did not match any of the valid hosts for this certificate 6 | -------------------------------------------------------------------------------- /newsspider/webkit2png.py: -------------------------------------------------------------------------------- 1 | # 2 | # webkit2png.py 3 | # 4 | # Creates screenshots of webpages using by QtWebkit. 5 | # 6 | # Copyright (c) 2008 Roland Tapken 7 | # 8 | # This program is free software; you can redistribute it and/or 9 | # modify it under the terms of the GNU General Public License 10 | # as published by the Free Software Foundation; either version 2 11 | # of the License, or (at your option) any later version. 12 | # 13 | # This program is distributed in the hope that it will be useful, 14 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 15 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 16 | # GNU General Public License for more details. 17 | # 18 | # You should have received a copy of the GNU General Public License 19 | # along with this program; if not, write to the Free Software 20 | # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA 21 | # 22 | # Nice ideas "todo": 23 | # - Add QTcpSocket support to create a "screenshot daemon" that 24 | # can handle multiple requests at the same time. 
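#
# Minimal programmatic sketch (an illustration, not part of the original
# module; the URL and output name are placeholders). WebkitRenderer needs a
# running QApplication and must render from Qt's GUI thread, so the call is
# deferred with a timer, as in the webkit2png wrapper script above:
#
#   import sys
#   from PyQt4.QtCore import QTimer
#   from PyQt4.QtGui import QApplication
#
#   app = QApplication([sys.argv[0]])
#   renderer = WebkitRenderer(width=1366, height=768, format='png')
#
#   def main():
#       with open('out.png', 'wb') as f:
#           renderer.render_to_file(url='http://example.com/', file_object=f)
#       QApplication.exit(0)
#
#   QTimer.singleShot(0, main)
#   sys.exit(app.exec_())
#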
25 | 26 | import time 27 | 28 | from PyQt4.QtCore import * 29 | from PyQt4.QtGui import * 30 | from PyQt4.QtWebKit import * 31 | from PyQt4.QtNetwork import * 32 | 33 | # Class for Website-Rendering. Uses QWebPage, which 34 | # requires a running QtGui to work. 35 | class WebkitRenderer(QObject): 36 | """A class that helps to create 'screenshots' of webpages using 37 | Qt's QWebkit. Requires PyQt4 library. 38 | 39 | Use "render()" to get a 'QImage' object, render_to_bytes() to get the 40 | resulting image as 'str' object or render_to_file() to write the image 41 | directly into a 'file' resource. 42 | 43 | These methods have to be called from within Qt's main (GUI) thread. 44 | An example on how to use this is the __qt_main() method at the end 45 | of the libraries source file. More generic examples: 46 | 47 | def qt_main(): 48 | while go_on(): 49 | do_something_meaningful() 50 | while QApplication.hasPendingEvents(): 51 | QApplication.processEvents() 52 | QApplication.quit() 53 | 54 | app = init_qtgui() 55 | QTimer.singleShot(0, qt_main) 56 | sys.exit(app.exec_()) 57 | 58 | Or let Qt handle event processing using a QTimer instance: 59 | 60 | def qt_main_loop(): 61 | if not go_on(): 62 | QApplication.quit() 63 | return 64 | do_something_meaningful() 65 | 66 | app = init_qtgui() 67 | main_timer = QTimer() 68 | QObject.connect(main_timer, QtCore.SIGNAL("timeout()"), qt_main_loop) 69 | sys.exit(app.exec_()) 70 | 71 | Avaible properties: 72 | width -- The width of the "browser" window. 0 means autodetect (default). 73 | height -- The height of the window. 0 means autodetect (default). 74 | timeout -- Seconds after that the request is aborted (default: 0) 75 | wait -- Seconds to wait after loading has been finished (default: 0) 76 | scaleToWidth -- The resulting image is scaled to this width. 77 | scaleToHeight -- The resulting image is scaled to this height. 78 | scaleRatio -- The image is scaled using this method. Possible values are: 79 | keep 80 | expand 81 | crop 82 | ignore 83 | grabWhileWindow -- If this is True a screenshot of the whole window is taken. Otherwise only the current frame is rendered. This is required for plugins to be visible, but it is possible that another window overlays the current one while the screenshot is taken. To reduce this possibility, the window is activated just before it is rendered if this property is set to True (default: False). 84 | qWebSettings -- Settings that should be assigned to the created QWebPage instance. See http://doc.trolltech.com/4.6/qwebsettings.html for possible keys. Defaults: 85 | JavascriptEnabled: False 86 | PluginsEnabled: False 87 | PrivateBrowsingEnabled: True 88 | JavascriptCanOpenWindows: False 89 | """ 90 | 91 | def __init__(self,**kwargs): 92 | """Sets default values for the properties.""" 93 | 94 | if not QApplication.instance(): 95 | raise RuntimeError(self.__class__.__name__ + " requires a running QApplication instance") 96 | QObject.__init__(self) 97 | 98 | # Initialize default properties 99 | self.width = kwargs.get('width', 0) 100 | self.height = kwargs.get('height', 0) 101 | self.timeout = kwargs.get('timeout', 0) 102 | self.wait = kwargs.get('wait', 0) 103 | self.scaleToWidth = kwargs.get('scaleToWidth', 0) 104 | self.scaleToHeight = kwargs.get('scaleToHeight', 0) 105 | self.scaleRatio = kwargs.get('scaleRatio', 'keep') 106 | self.format = kwargs.get('format', 'png') 107 | self.logger = kwargs.get('logger', None) 108 | # Set this to true if you want to capture flash. 
109 | # Not that your desktop must be large enough for 110 | # fitting the whole window. 111 | self.grabWholeWindow = kwargs.get('grabWholeWindow', False) 112 | self.renderTransparentBackground = kwargs.get('renderTransparentBackground', False) 113 | self.ignoreAlert = kwargs.get('ignoreAlert', True) 114 | self.ignoreConfirm = kwargs.get('ignoreConfirm', True) 115 | self.ignorePrompt = kwargs.get('ignorePrompt', True) 116 | self.interruptJavaScript = kwargs.get('interruptJavaScript', True) 117 | self.encodedUrl = kwargs.get('encodedUrl', False) 118 | 119 | # Set some default options for QWebPage 120 | self.qWebSettings = { 121 | QWebSettings.JavascriptEnabled : False, 122 | QWebSettings.PluginsEnabled : False, 123 | QWebSettings.PrivateBrowsingEnabled : True, 124 | QWebSettings.JavascriptCanOpenWindows : False 125 | } 126 | 127 | 128 | def render(self, url): 129 | """Renders the given URL into a QImage object""" 130 | # We have to use this helper object because 131 | # QApplication.processEvents may be called, causing 132 | # this method to get called while it has not returned yet. 133 | helper = _WebkitRendererHelper(self) 134 | helper._window.resize( self.width, self.height ) 135 | image = helper.render(url) 136 | 137 | # Bind helper instance to this image to prevent the 138 | # object from being cleaned up (and with it the QWebPage, etc) 139 | # before the data has been used. 140 | image.helper = helper 141 | 142 | return image 143 | 144 | def render_to_file(self, url, file_object): 145 | """Renders the image into a File resource. 146 | Returns the size of the data that has been written. 147 | """ 148 | format = self.format # this may not be constant due to processEvents() 149 | image = self.render(url) 150 | qBuffer = QBuffer() 151 | image.save(qBuffer, format) 152 | file_object.write(qBuffer.buffer().data()) 153 | return qBuffer.size() 154 | 155 | def render_to_bytes(self, url): 156 | """Renders the image into an object of type 'str'""" 157 | format = self.format # this may not be constant due to processEvents() 158 | image = self.render(url) 159 | qBuffer = QBuffer() 160 | image.save(qBuffer, format) 161 | return qBuffer.buffer().data() 162 | 163 | class _WebkitRendererHelper(QObject): 164 | """This helper class is doing the real work. It is required to 165 | allow WebkitRenderer.render() to be called "asynchronously" 166 | (but always from Qt's GUI thread). 167 | """ 168 | 169 | def __init__(self, parent): 170 | """Copies the properties from the parent (WebkitRenderer) object, 171 | creates the required instances of QWebPage, QWebView and QMainWindow 172 | and registers some Slots. 
173 | """ 174 | QObject.__init__(self) 175 | 176 | # Copy properties from parent 177 | for key,value in parent.__dict__.items(): 178 | setattr(self,key,value) 179 | 180 | # Create and connect required PyQt4 objects 181 | self._page = CustomWebPage(logger=self.logger, ignore_alert=self.ignoreAlert, 182 | ignore_confirm=self.ignoreConfirm, ignore_prompt=self.ignorePrompt, 183 | interrupt_js=self.interruptJavaScript) 184 | self._view = QWebView() 185 | self._view.setPage(self._page) 186 | self._window = QMainWindow() 187 | self._window.setCentralWidget(self._view) 188 | 189 | # Import QWebSettings 190 | for key, value in self.qWebSettings.iteritems(): 191 | self._page.settings().setAttribute(key, value) 192 | 193 | # Connect required event listeners 194 | self.connect(self._page, SIGNAL("loadFinished(bool)"), self._on_load_finished) 195 | self.connect(self._page, SIGNAL("loadStarted()"), self._on_load_started) 196 | self.connect(self._page.networkAccessManager(), SIGNAL("sslErrors(QNetworkReply *,const QList&)"), self._on_ssl_errors) 197 | self.connect(self._page.networkAccessManager(), SIGNAL("finished(QNetworkReply *)"), self._on_each_reply) 198 | 199 | # The way we will use this, it seems to be unesseccary to have Scrollbars enabled 200 | self._page.mainFrame().setScrollBarPolicy(Qt.Horizontal, Qt.ScrollBarAlwaysOff) 201 | self._page.mainFrame().setScrollBarPolicy(Qt.Vertical, Qt.ScrollBarAlwaysOff) 202 | self._page.settings().setUserStyleSheetUrl(QUrl("data:text/css,html,body{overflow-y:hidden !important;}")) 203 | 204 | # Show this widget 205 | self._window.show() 206 | 207 | def __del__(self): 208 | """Clean up Qt4 objects. """ 209 | self._window.close() 210 | del self._window 211 | del self._view 212 | del self._page 213 | 214 | def render(self, url): 215 | """The real worker. Loads the page (_load_page) and awaits 216 | the end of the given 'delay'. While it is waiting outstanding 217 | QApplication events are processed. 218 | After the given delay, the Window or Widget (depends 219 | on the value of 'grabWholeWindow' is drawn into a QPixmap 220 | and postprocessed (_post_process_image). 221 | """ 222 | self._load_page(url, self.width, self.height, self.timeout) 223 | # Wait for end of timer. In this time, process 224 | # other outstanding Qt events. 225 | if self.wait > 0: 226 | if self.logger: self.logger.debug("Waiting %d seconds " % self.wait) 227 | waitToTime = time.time() + self.wait 228 | while time.time() < waitToTime: 229 | if QApplication.hasPendingEvents(): 230 | QApplication.processEvents() 231 | 232 | if self.renderTransparentBackground: 233 | # Another possible drawing solution 234 | image = QImage(self._page.viewportSize(), QImage.Format_ARGB32) 235 | image.fill(QColor(255,0,0,0).rgba()) 236 | 237 | # http://ariya.blogspot.com/2009/04/transparent-qwebview-and-qwebpage.html 238 | palette = self._view.palette() 239 | palette.setBrush(QPalette.Base, Qt.transparent) 240 | self._page.setPalette(palette) 241 | self._view.setAttribute(Qt.WA_OpaquePaintEvent, False) 242 | 243 | painter = QPainter(image) 244 | painter.setBackgroundMode(Qt.TransparentMode) 245 | self._page.mainFrame().render(painter) 246 | painter.end() 247 | else: 248 | if self.grabWholeWindow: 249 | # Note that this does not fully ensure that the 250 | # window still has the focus when the screen is 251 | # grabbed. This might result in a race condition. 
252 | self._view.activateWindow() 253 | image = QPixmap.grabWindow(self._window.winId()) 254 | else: 255 | image = QPixmap.grabWidget(self._window) 256 | 257 | return self._post_process_image(image) 258 | 259 | def _load_page(self, url, width, height, timeout): 260 | """ 261 | This method implements the logic for retrieving and displaying 262 | the requested page. 263 | """ 264 | 265 | # This is an event-based application. So we have to wait until 266 | # "loadFinished(bool)" is raised. 267 | cancelAt = time.time() + timeout 268 | self.__loading = True 269 | self.__loading_result = False # Default 270 | if self.encodedUrl: 271 | self._page.mainFrame().load(QUrl.fromEncoded(url)) 272 | else: 273 | self._page.mainFrame().load(QUrl(url)) 274 | while self.__loading: 275 | if timeout > 0 and time.time() >= cancelAt: 276 | raise RuntimeError("Request timed out on %s" % url) 277 | while QApplication.hasPendingEvents() and self.__loading: 278 | QCoreApplication.processEvents() 279 | 280 | if self.logger: self.logger.debug("Processing result") 281 | 282 | if self.__loading_result == False: 283 | if self.logger: self.logger.warning("Failed to load %s" % url) 284 | 285 | # Set initial viewport (the size of the "window") 286 | size = self._page.mainFrame().contentsSize() 287 | if self.logger: self.logger.debug("contentsSize: %s", size) 288 | if width > 0: 289 | size.setWidth(width) 290 | if height > 0: 291 | size.setHeight(height) 292 | 293 | self._window.resize(size) 294 | 295 | def _post_process_image(self, qImage): 296 | """If 'scaleToWidth' or 'scaleToHeight' are set to a value 297 | greater than zero this method will scale the image 298 | using the method defined in 'scaleRatio'. 299 | """ 300 | if self.scaleToWidth > 0 or self.scaleToHeight > 0: 301 | # Scale this image 302 | if self.scaleRatio == 'keep': 303 | ratio = Qt.KeepAspectRatio 304 | elif self.scaleRatio in ['expand', 'crop']: 305 | ratio = Qt.KeepAspectRatioByExpanding 306 | else: # 'ignore' 307 | ratio = Qt.IgnoreAspectRatio 308 | qImage = qImage.scaled(self.scaleToWidth, self.scaleToHeight, ratio) 309 | if self.scaleRatio == 'crop': 310 | qImage = qImage.copy(0, 0, self.scaleToWidth, self.scaleToHeight) 311 | return qImage 312 | 313 | def _on_each_reply(self,reply): 314 | """Logs each requested uri""" 315 | if self.logger: self.logger.debug("Received %s" % (reply.url().toString())) 316 | 317 | # Eventhandler for "loadStarted()" signal 318 | def _on_load_started(self): 319 | """Slot that sets the '__loading' property to true.""" 320 | if self.logger: self.logger.debug("loading started") 321 | self.__loading = True 322 | 323 | # Eventhandler for "loadFinished(bool)" signal 324 | def _on_load_finished(self, result): 325 | """Slot that sets the '__loading' property to false and stores 326 | the result code in '__loading_result'.
327 | """ 328 | if self.logger: self.logger.debug("loading finished with result %s", result) 329 | self.__loading = False 330 | self.__loading_result = result 331 | 332 | # Eventhandler for "sslErrors(QNetworkReply *,const QList&)" signal 333 | def _on_ssl_errors(self, reply, errors): 334 | """Slot that writes SSL warnings into the log but ignores them.""" 335 | for e in errors: 336 | if self.logger: self.logger.warn("SSL: " + e.errorString()) 337 | reply.ignoreSslErrors() 338 | 339 | 340 | class CustomWebPage(QWebPage): 341 | def __init__(self, **kwargs): 342 | super(CustomWebPage, self).__init__() 343 | self.logger = kwargs.get('logger', None) 344 | self.ignore_alert = kwargs.get('ignore_alert', True) 345 | self.ignore_confirm = kwargs.get('ignore_confirm', True) 346 | self.ignore_prompt = kwargs.get('ignore_prompt', True) 347 | self.interrupt_js = kwargs.get('interrupt_js', True) 348 | 349 | def javaScriptAlert(self, frame, message): 350 | if self.logger: self.logger.debug('Alert: %s', message) 351 | if not self.ignore_alert: 352 | return super(CustomWebPage, self).javaScriptAlert(frame, message) 353 | 354 | def javaScriptConfirm(self, frame, message): 355 | if self.logger: self.logger.debug('Confirm: %s', message) 356 | if not self.ignore_confirm: 357 | return super(CustomWebPage, self).javaScriptConfirm(frame, message) 358 | else: 359 | return False 360 | 361 | def javaScriptPrompt(self, frame, message, default, result): 362 | """This function is called whenever a JavaScript program running inside frame tries to prompt 363 | the user for input. The program may provide an optional message, msg, as well as a default value 364 | for the input in defaultValue. 365 | 366 | If the prompt was cancelled by the user the implementation should return false; 367 | otherwise the result should be written to result and true should be returned. 368 | If the prompt was not cancelled by the user, the implementation should return true and 369 | the result string must not be null. 370 | """ 371 | if self.logger: self.logger.debug('Prompt: %s (%s)' % (message, default)) 372 | if not self.ignore_prompt: 373 | return super(CustomWebPage, self).javaScriptPrompt(frame, message, default, result) 374 | else: 375 | return False 376 | 377 | def shouldInterruptJavaScript(self): 378 | """This function is called when a JavaScript program is running for a long period of time. 379 | If the user wanted to stop the JavaScript the implementation should return true; otherwise false. 
380 | """ 381 | if self.logger: self.logger.debug("WebKit ask to interrupt JavaScript") 382 | return self.interrupt_js 383 | -------------------------------------------------------------------------------- /proxycrawler/proxies.txt: -------------------------------------------------------------------------------- 1 | 5.199.132.164 443 HTTP 2 | 64.208.21.16 80 HTTP 3 | 110.153.9.250 80 HTTP 4 | 110.139.206.93 8080 HTTP 5 | 72.247.48.10 80 HTTP 6 | 180.250.130.186 80 HTTP 7 | 114.80.149.183 80 HTTP 8 | 123.108.14.39 8080 HTTP 9 | 211.167.112.14 80 HTTP 10 | 89.218.100.178 9090 HTTP 11 | 202.149.78.234 8080 HTTP 12 | 101.255.33.250 80 HTTP 13 | 80.79.179.10 8181 HTTP 14 | 180.250.165.197 8080 HTTP 15 | 119.110.71.109 8080 HTTP 16 | 123.129.242.131 8081 HTTP 17 | 211.167.112.14 82 HTTP 18 | 89.218.100.218 9090 HTTP 19 | 202.201.1.119 8001 HTTP 20 | 80.90.12.36 8080 HTTP 21 | 186.47.84.139 8080 HTTP 22 | 125.141.206.36 8080 HTTP 23 | 211.239.84.130 443 HTTP 24 | 89.218.120.114 9090 HTTP 25 | 203.92.47.202 8082 HTTP 26 | 114.247.21.244 3131 HTTP 27 | 190.92.87.98 8080 HTTP 28 | 87.110.149.88 8080 HTTP 29 | 186.208.71.70 8080 HTTP 30 | 119.235.21.11 80 HTTP 31 | 148.236.5.91 8080 HTTP 32 | 218.28.112.114 809 HTTP 33 | 119.252.168.34 8080 HTTP 34 | 202.52.244.110 8080 HTTP 35 | 218.108.85.59 82 HTTP 36 | 14.31.11.70 9009 HTTP 37 | 186.225.212.245 8080 HTTP 38 | 120.85.132.234 80 HTTP 39 | 175.25.243.27 80 HTTP 40 | 218.61.8.124 88 HTTP 41 | 89.251.103.130 8080 HTTP 42 | 195.202.159.123 8080 HTTP 43 | 212.233.147.48 8080 HTTP 44 | 218.206.204.254 80 HTTP 45 | 14.31.11.78 9009 HTTP 46 | 110.4.12.170 83 HTTP 47 | 186.226.98.254 8080 HTTP 48 | 124.227.191.68 9000 HTTP 49 | 190.29.22.247 8080 HTTP 50 | 218.89.165.131 6060 HTTP 51 | 91.218.84.195 80 HTTP 52 | 210.14.143.53 7020 HTTP 53 | 178.18.17.208 8080 HTTP 54 | 202.43.65.130 8080 HTTP 55 | 37.229.231.253 8080 HTTP 56 | 218.206.204.254 443 HTTP 57 | 58.210.247.18 1337 HTTP 58 | 112.213.118.48 80 HTTP 59 | 187.53.150.62 8080 HTTP 60 | 125.210.188.35 80 HTTP 61 | 195.69.191.203 80 HTTP 62 | 218.102.39.154 8080 HTTP 63 | 91.228.53.28 8080 HTTP 64 | 210.14.143.122 80 HTTP 65 | 202.43.188.5 8080 HTTP 66 | 202.46.146.22 8080 HTTP 67 | 92.126.217.47 80 HTTP 68 | 218.249.83.87 8080 HTTP 69 | 58.211.114.107 443 HTTP 70 | 113.160.50.51 80 HTTP 71 | 178.18.17.250 8080 HTTP 72 | 219.153.5.3 8181 HTTP 73 | 93.186.97.236 8080 HTTP 74 | 210.177.139.89 8080 HTTP 75 | 222.168.65.130 80 HTTP 76 | 103.28.227.78 8080 HTTP 77 | 219.150.254.158 8080 HTTP 78 | 58.221.129.158 1337 HTTP 79 | 119.82.253.88 8080 HTTP 80 | 187.85.89.167 8080 HTTP 81 | 182.93.206.92 8080 HTTP 82 | 202.106.179.141 10160 HTTP 83 | 219.223.252.150 56142 HTTP 84 | 95.77.97.146 8080 HTTP 85 | 211.99.28.21 808 HTTP 86 | 211.142.236.137 8080 HTTP 87 | 41.75.201.146 8080 HTTP 88 | 109.207.63.89 8090 HTTP 89 | 219.159.105.180 8080 HTTP 90 | 59.34.57.88 8080 HTTP 91 | 187.110.169.186 8080 HTTP 92 | 190.40.80.144 8080 HTTP 93 | 210.212.98.228 80 HTTP 94 | 220.246.4.74 8080 HTTP 95 | 211.142.236.133 8080 HTTP 96 | 218.22.71.122 8080 HTTP 97 | 58.210.212.107 80 HTTP 98 | 112.25.15.18 9098 HTTP 99 | 59.37.168.16 8081 HTTP 100 | 185.8.2.50 8080 HTTP 101 | 190.102.17.121 80 HTTP 102 | 218.106.99.22 888 HTTP 103 | 221.3.153.74 80 HTTP 104 | 106.3.98.79 80 HTTP 105 | 211.142.236.137 80 HTTP 106 | 2.133.93.170 9090 HTTP 107 | 78.9.164.162 8080 HTTP 108 | 119.6.73.235 80 HTTP 109 | 221.130.17.48 80 HTTP 110 | 59.57.15.71 80 HTTP 111 | 186.5.65.164 8080 HTTP 112 | 190.0.17.202 8080 HTTP 113 | 197.254.11.30 8080 HTTP 
114 | 5.8.242.12 8080 HTTP 115 | 221.130.18.218 80 HTTP 116 | 110.74.220.50 8080 HTTP 117 | 211.144.72.153 80 HTTP 118 | 2.135.237.154 9090 HTTP 119 | 82.200.253.202 9090 HTTP 120 | 119.110.69.70 80 HTTP 121 | 221.130.17.139 80 HTTP 122 | 59.172.208.189 8080 HTTP 123 | 190.14.255.169 8080 HTTP 124 | 190.0.46.66 8080 HTTP 125 | 200.93.115.248 8080 HTTP 126 | 5.10.224.62 80 HTTP 127 | 221.130.18.253 80 HTTP 128 | 110.93.211.11 80 HTTP 129 | 82.209.195.5 8080 HTTP 130 | 125.39.68.195 80 HTTP 131 | 221.130.23.4 80 HTTP 132 | 59.172.208.190 8080 HTTP 133 | 198.15.119.111 8080 HTTP 134 | 190.0.61.194 8080 HTTP 135 | 200.107.32.127 8080 HTTP 136 | 5.10.224.62 8080 HTTP 137 | 221.130.23.4 80 HTTP 138 | 110.93.211.11 8080 HTTP 139 | 218.206.204.254 80 HTTP 140 | 2.135.238.26 9090 HTTP 141 | 180.250.192.222 8080 HTTP 142 | 221.130.23.5 80 HTTP 143 | 200.137.133.171 80 HTTP 144 | 31.170.178.2 8080 HTTP 145 | 221.130.23.6 80 HTTP 146 | 112.5.254.30 80 HTTP 147 | 218.206.204.254 443 HTTP 148 | 2.135.238.108 9090 HTTP 149 | 88.249.127.222 8080 HTTP 150 | 186.46.122.250 8080 HTTP 151 | 221.130.23.6 80 HTTP 152 | 61.166.55.153 11808 HTTP 153 | 201.218.63.4 8080 HTTP 154 | 190.211.97.71 8080 HTTP 155 | 200.213.4.4 8080 HTTP 156 | 41.78.26.45 8080 HTTP 157 | 221.130.23.29 80 HTTP 158 | 112.175.248.22 8080 HTTP 159 | 2.135.242.42 9090 HTTP 160 | 110.138.160.170 8080 HTTP 161 | 187.85.225.185 80 HTTP 162 | 221.130.23.8 80 HTTP 163 | 106.3.98.82 80 HTTP 164 | 201.63.184.5 8080 HTTP 165 | 46.249.66.50 8080 HTTP 166 | 222.169.11.34 8080 HTTP 167 | 114.113.221.72 54321 HTTP 168 | 221.10.40.232 80 HTTP 169 | 2.135.243.42 9090 HTTP 170 | 110.138.163.58 8080 HTTP 171 | 221.130.23.78 80 HTTP 172 | 106.3.98.82 82 HTTP 173 | 202.171.253.98 80 HTTP 174 | 202.108.77.153 80 HTTP 175 | 77.236.209.236 8080 HTTP 176 | 116.68.171.70 8080 HTTP 177 | 221.10.40.232 82 HTTP 178 | 27.50.11.165 80 HTTP 179 | 118.96.137.140 8080 HTTP 180 | 5.10.224.58 80 HTTP 181 | 106.3.98.82 83 HTTP 182 | 202.171.253.103 80 HTTP 183 | 198.154.114.100 8080 HTTP 184 | 202.162.198.178 8080 HTTP 185 | 91.202.164.185 8080 HTTP 186 | 223.4.205.37 808 HTTP 187 | 117.34.72.51 808 HTTP 188 | 221.10.40.232 83 HTTP 189 | 36.73.40.189 8080 HTTP 190 | 200.208.251.218 8080 HTTP 191 | 72.64.146.136 43 HTTP 192 | 221.130.23.80 80 HTTP 193 | 112.5.254.19 80 HTTP 194 | 202.171.253.103 85 HTTP 195 | 200.27.114.228 8080 HTTP 196 | 202.182.49.41 8080 HTTP 197 | 103.10.22.226 8080 HTTP 198 | 58.252.56.148 8080 HTTP 199 | 118.97.206.28 8080 HTTP 200 | 221.130.18.76 80 HTTP 201 | 58.67.147.204 8080 HTTP 202 | 201.64.247.3 8080 HTTP 203 | 81.169.154.244 8080 HTTP 204 | 221.130.23.81 80 HTTP 205 | 112.5.254.20 80 HTTP 206 | 202.171.253.108 80 HTTP 207 | 200.27.114.233 8080 HTTP 208 | 110.138.208.50 8080 HTTP 209 | 122.72.15.231 80 HTTP 210 | 118.97.212.162 8080 HTTP 211 | 221.130.199.19 80 HTTP 212 | 77.89.233.54 8080 HTTP 213 | 202.46.85.107 8080 HTTP 214 | 125.69.132.100 8080 HTTP 215 | 221.130.23.82 80 HTTP 216 | 117.41.182.188 8080 HTTP 217 | 202.171.253.108 83 HTTP 218 | 200.54.92.187 80 HTTP 219 | 37.77.50.133 80 HTTP 220 | 111.161.30.228 80 HTTP 221 | 124.81.208.34 8080 HTTP 222 | 119.4.250.105 80 HTTP 223 | 221.130.199.98 80 HTTP 224 | 85.172.4.154 80 HTTP 225 | 202.93.136.98 8080 HTTP 226 | 221.130.23.91 80 HTTP 227 | 118.145.0.76 10086 HTTP 228 | 203.124.12.71 8080 HTTP 229 | 200.61.31.69 8080 HTTP 230 | 61.55.141.11 80 HTTP 231 | 114.113.221.77 54321 HTTP 232 | 180.243.92.86 8080 HTTP 233 | 119.7.221.135 81 HTTP 234 | 221.178.174.171 888 HTTP 235 | 87.236.233.92 
8080 HTTP 236 | 203.91.43.43 9988 HTTP 237 | 190.3.108.211 8080 HTTP 238 | 221.181.192.91 80 HTTP 239 | 206.130.99.82 8080 HTTP 240 | 200.71.86.50 8080 HTTP 241 | 61.135.223.4 7000 HTTP 242 | 119.110.69.70 8080 HTTP 243 | 202.74.241.196 8080 HTTP 244 | 119.7.221.135 82 HTTP 245 | 222.124.35.117 8080 HTTP 246 | 110.139.60.228 8080 HTTP 247 | 190.72.150.144 8080 HTTP 248 | 221.215.155.38 8090 HTTP 249 | 120.203.214.162 80 HTTP 250 | 211.232.93.13 808 HTTP 251 | 200.75.51.151 8080 HTTP 252 | 78.133.155.54 8080 HTTP 253 | 180.87.197.91 8080 HTTP 254 | 1.63.18.22 8080 HTTP 255 | 119.7.221.137 82 HTTP 256 | 222.187.222.118 8080 HTTP 257 | 111.13.87.150 80 HTTP 258 | 219.76.104.17 8080 HTTP 259 | 200.208.251.220 8080 HTTP 260 | 222.89.55.123 8080 HTTP 261 | 120.203.214.176 80 HTTP 262 | 218.102.39.153 8080 HTTP 263 | 200.109.228.67 8080 HTTP 264 | 109.207.61.189 8090 HTTP 265 | 180.249.119.252 8080 HTTP 266 | 2.133.92.106 9090 HTTP 267 | 119.59.193.175 8080 HTTP 268 | 62.84.67.170 8080 HTTP 269 | 111.161.30.236 80 HTTP 270 | 49.212.167.222 80 HTTP 271 | 58.20.230.131 8080 HTTP 272 | 222.217.99.72 9000 HTTP 273 | 120.203.214.187 80 HTTP 274 | 200.195.176.77 8080 HTTP 275 | 113.53.254.124 8080 HTTP 276 | 2.133.92.122 9090 HTTP 277 | 119.145.2.18 80 HTTP 278 | 171.101.144.18 8080 HTTP 279 | 112.5.254.172 80 HTTP 280 | 103.247.16.241 8080 HTTP 281 | 61.235.69.243 8080 HTTP 282 | 222.217.99.177 9000 HTTP 283 | 2.135.243.84 9090 HTTP 284 | 200.202.240.174 80 HTTP 285 | 123.235.12.118 8080 HTTP 286 | 200.169.162.132 80 HTTP 287 | 2.133.92.157 80 HTTP 288 | 119.233.255.51 80 HTTP 289 | 213.131.41.6 8080 HTTP 290 | 178.48.2.237 8080 HTTP 291 | 109.236.220.98 8080 HTTP 292 | 222.240.224.131 80 HTTP 293 | 123.134.95.142 80 HTTP 294 | 5.135.242.225 8080 HTTP 295 | 200.204.161.246 8080 HTTP 296 | 218.22.71.124 8080 HTTP 297 | 2.133.92.158 80 HTTP 298 | 119.233.255.60 80 HTTP 299 | 1.234.45.130 80 HTTP 300 | 180.247.120.217 8080 HTTP 301 | 116.255.234.73 3288 HTTP 302 | 61.177.248.202 1080 SOCKS4 303 | 201.56.208.233 8080 HTTP 304 | 177.83.122.189 8080 HTTP 305 | 218.22.71.126 8080 HTTP 306 | 2.133.93.82 9090 HTTP 307 | 119.235.21.10 8080 HTTP 308 | 27.116.21.163 8080 HTTP 309 | 124.207.170.230 8080 HTTP 310 | 121.204.0.2 80 HTTP 311 | 183.60.44.136 88 HTTP 312 | 49.0.96.1 8000 HTTP 313 | 201.86.70.162 80 HTTP 314 | 177.182.252.197 8080 HTTP 315 | 221.210.40.150 8080 HTTP 316 | 119.252.172.131 80 HTTP 317 | 49.0.110.1 8000 HTTP 318 | 124.240.187.81 82 HTTP 319 | 200.54.78.66 8080 HTTP 320 | 125.165.51.4 8080 HTTP 321 | 183.61.246.78 80 HTTP 322 | 62.201.207.14 8080 HTTP 323 | 201.249.192.74 8080 HTTP 324 | 190.116.87.4 8080 HTTP 325 | 2.135.238.92 9090 HTTP 326 | 120.194.100.46 8001 HTTP 327 | 58.215.88.12 80 HTTP 328 | 164.77.196.78 80 HTTP 329 | 202.154.225.229 8080 HTTP 330 | 186.5.102.162 8080 HTTP 331 | 114.80.136.112 7780 HTTP 332 | 183.129.249.82 80 HTTP 333 | 62.201.210.190 8080 HTTP 334 | 202.152.22.38 8080 HTTP 335 | 31.135.196.229 8080 HTTP 336 | 41.216.171.154 8080 HTTP 337 | 59.49.79.121 9527 HTTP 338 | 177.11.17.46 8080 HTTP 339 | 71.189.47.2 8081 HTTP 340 | 190.78.2.84 8080 HTTP 341 | 115.100.60.198 8000 HTTP 342 | 183.129.249.83 80 HTTP 343 | 63.141.216.176 80 HTTP 344 | 203.172.245.34 8080 HTTP 345 | 195.140.190.146 8080 HTTP 346 | 81.201.61.138 8080 HTTP 347 | 58.53.192.218 8123 HTTP 348 | 121.12.118.241 999 HTTP 349 | 59.57.15.71 80 HTTP 350 | 180.242.88.43 5311 HTTP 351 | 202.29.211.122 8080 HTTP 352 | 115.236.19.48 8080 HTTP 353 | 211.100.47.131 8990 HTTP 354 | 66.35.68.146 8080 HTTP 355 | 
212.175.88.3 8080 HTTP 356 | 197.251.194.164 8080 HTTP 357 | 89.171.46.225 8080 HTTP 358 | 59.59.51.74 8001 HTTP 359 | 122.11.38.182 9090 HTTP 360 | 59.172.208.186 8080 HTTP 361 | 183.110.231.124 80 HTTP 362 | 202.202.1.189 80 HTTP 363 | 116.112.66.102 808 HTTP 364 | 211.100.47.244 8990 HTTP 365 | 74.221.211.117 8080 HTTP 366 | 213.110.196.11 80 HTTP 367 | 202.108.50.72 80 HTTP 368 | 94.137.239.19 81 HTTP 369 | 60.165.173.36 8003 HTTP 370 | 122.72.0.6 80 HTTP 371 | 61.156.217.166 8000 HTTP 372 | 187.20.25.42 8080 HTTP 373 | 203.93.104.20 80 HTTP 374 | 119.97.146.152 80 HTTP 375 | 211.100.52.42 8990 HTTP 376 | 77.65.22.245 8080 HTTP 377 | 217.117.14.247 80 HTTP 378 | 202.145.3.130 8080 HTTP 379 | 110.139.58.31 8080 HTTP 380 | 60.191.142.233 8360 HTTP 381 | 122.144.1.213 9999 HTTP 382 | 78.188.3.171 8080 HTTP 383 | 190.29.30.114 8080 HTTP 384 | 119.252.168.34 80 HTTP 385 | 120.194.100.42 8001 HTTP 386 | 211.142.236.133 80 HTTP 387 | 77.78.104.129 8080 HTTP 388 | 218.100.84.123 8080 HTTP 389 | 202.146.237.79 808 HTTP 390 | 114.113.221.70 54321 HTTP 391 | 61.136.93.38 8080 HTTP 392 | 122.252.181.20 8080 HTTP 393 | 78.188.47.21 8080 HTTP 394 | 190.128.170.18 8080 HTTP 395 | 178.233.149.172 8080 HTTP 396 | 120.203.214.182 80 HTTP 397 | 219.83.100.195 8080 HTTP 398 | 208.163.36.221 8080 HTTP 399 | 61.152.108.187 80 HTTP 400 | 123.30.174.61 8080 HTTP 401 | 83.17.80.124 8080 HTTP 402 | 200.60.11.20 8080 HTTP 403 | 177.70.17.154 8080 HTTP 404 | 187.5.122.231 8080 HTTP 405 | 122.72.2.180 80 HTTP 406 | 211.142.236.137 80 HTTP 407 | 77.238.209.194 8080 HTTP 408 | 222.124.19.210 8080 HTTP 409 | 221.179.173.170 8080 HTTP 410 | 118.97.58.166 8080 HTTP 411 | 61.155.140.154 55808 HTTP 412 | 92.39.54.161 80 HTTP 413 | 200.137.133.169 80 HTTP 414 | 177.85.233.190 8080 HTTP 415 | 122.72.120.63 80 HTTP 416 | 211.142.236.137 8080 HTTP 417 | 78.159.235.3 8080 HTTP 418 | 222.124.147.105 8080 HTTP 419 | 36.73.42.103 8080 HTTP 420 | 101.255.33.254 80 HTTP 421 | 201.41.66.212 8080 HTTP 422 | 178.219.103.205 8080 HTTP 423 | 200.24.17.46 80 HTTP 424 | 122.72.124.2 80 HTTP 425 | 218.22.71.125 8080 HTTP 426 | 80.90.27.60 8080 HTTP 427 | 222.124.207.29 8080 HTTP 428 | 60.166.13.182 80 HTTP 429 | 122.224.5.210 443 HTTP 430 | 85.207.17.146 8080 HTTP 431 | 123.164.148.134 80 HTTP 432 | 103.28.113.134 8080 HTTP 433 | 202.29.60.220 8080 HTTP 434 | 180.250.130.186 8080 HTTP 435 | 202.181.176.3 80 HTTP 436 | 122.225.22.22 8080 HTTP 437 | 218.22.71.210 8080 HTTP 438 | 222.124.218.164 8080 HTTP 439 | 60.216.7.28 3079 HTTP 440 | 89.218.100.90 9090 HTTP 441 | 123.164.148.134 82 HTTP 442 | 103.247.37.86 8080 HTTP 443 | 202.152.40.202 8080 HTTP 444 | 195.191.250.229 80 HTTP 445 | 213.24.60.52 8080 HTTP 446 | 202.97.159.227 8080 HTTP 447 | 218.104.193.102 80 HTTP 448 | 81.213.157.71 80 HTTP 449 | 223.25.195.68 8080 HTTP 450 | 78.38.80.142 8080 HTTP 451 | 186.192.17.138 8080 HTTP 452 | 89.237.134.10 8080 HTTP 453 | 124.81.113.183 8080 HTTP 454 | 109.74.236.165 8080 HTTP 455 | 217.29.117.162 8080 HTTP 456 | 203.110.169.76 9128 HTTP 457 | 218.201.21.175 80 HTTP 458 | 82.200.236.58 9090 HTTP 459 | 72.64.146.136 8080 HTTP 460 | 81.90.224.209 8080 HTTP 461 | 189.29.118.245 8080 HTTP 462 | 103.23.139.97 8080 HTTP 463 | 125.39.238.242 8080 HTTP 464 | 109.224.5.194 80 HTTP 465 | 60.214.67.86 8080 HTTP 466 | 203.110.169.83 9128 HTTP 467 | 218.201.21.176 80 HTTP 468 | 110.74.222.117 8080 HTTP 469 | 95.129.199.70 8080 HTTP 470 | 190.79.44.28 8080 HTTP 471 | 211.151.171.207 80 HTTP 472 | 218.108.242.100 48814 HTTP 473 | 93.189.28.106 8080 HTTP 474 
| 211.144.76.58 9000 HTTP 475 | 218.201.21.177 80 HTTP 476 | 82.200.254.114 9090 HTTP 477 | 122.72.76.122 80 HTTP 478 | 103.5.49.37 8080 HTTP 479 | 201.12.116.18 8080 HTTP 480 | 109.207.61.182 8090 HTTP 481 | 150.165.75.129 8080 HTTP 482 | 111.161.30.237 80 HTTP 483 | 212.76.180.50 8080 HTTP 484 | 72.64.146.135 8080 HTTP 485 | 110.139.151.124 8080 HTTP 486 | 211.154.151.218 88 HTTP 487 | 218.201.21.178 80 HTTP 488 | 82.200.254.146 9090 HTTP 489 | 122.72.76.130 80 HTTP 490 | 109.207.61.167 8090 HTTP 491 | 202.43.188.9 8080 HTTP 492 | 110.136.245.31 8080 HTTP 493 | 151.236.194.2 8080 HTTP 494 | 113.108.92.104 80 HTTP 495 | 218.56.161.14 8118 HTTP 496 | 116.77.35.118 80 HTTP 497 | 211.167.112.14 80 HTTP 498 | 218.204.39.164 80 HTTP 499 | 89.188.224.70 8080 HTTP 500 | 190.121.154.246 8080 HTTP 501 | 124.240.187.79 81 HTTP 502 | 202.43.188.15 8080 HTTP 503 | 112.175.18.180 80 HTTP 504 | 164.77.196.75 80 HTTP 505 | 114.32.95.96 8080 HTTP 506 | 219.76.104.1 80 HTTP 507 | 111.161.30.233 80 HTTP 508 | 211.167.112.14 82 HTTP 509 | 221.130.17.37 80 HTTP 510 | 89.218.68.13 80 HTTP 511 | 200.55.206.210 8080 HTTP 512 | 124.240.187.79 82 HTTP 513 | 113.142.8.205 8080 HTTP 514 | 177.125.167.253 8080 HTTP 515 | 219.76.104.1 8080 HTTP 516 | 118.97.255.107 8080 HTTP 517 | 211.167.112.15 80 HTTP 518 | 221.130.18.45 80 HTTP 519 | 89.218.68.34 9090 HTTP 520 | 219.154.46.138 8080 HTTP 521 | 186.16.203.50 8080 HTTP 522 | 202.102.48.205 8080 HTTP 523 | 113.195.134.231 8080 HTTP 524 | 178.169.97.35 54321 HTTP 525 | 116.112.66.102 808 HTTP 526 | 219.231.164.40 45238 HTTP 527 | 176.33.138.156 8080 HTTP 528 | 211.167.112.15 82 HTTP 529 | 221.130.18.52 80 HTTP 530 | 89.218.68.130 9090 HTTP 531 | 197.251.194.126 8080 HTTP 532 | 203.93.28.166 8080 HTTP 533 | 183.110.231.240 80 HTTP 534 | 118.96.66.107 80 HTTP 535 | 219.242.50.50 8080 HTTP 536 | 211.167.112.16 80 HTTP 537 | 221.130.18.189 80 HTTP 538 | 89.218.68.132 80 HTTP 539 | 2.133.92.18 9090 HTTP 540 | 202.102.58.208 80 HTTP 541 | 218.108.242.105 41884 HTTP 542 | 116.226.46.19 8080 HTTP 543 | 183.221.250.137 80 HTTP 544 | 118.97.91.129 8080 HTTP 545 | 221.7.145.42 8080 HTTP 546 | 211.167.112.17 80 HTTP 547 | 221.130.18.253 80 HTTP 548 | 89.218.100.210 9090 HTTP 549 | 2.133.92.26 9090 HTTP 550 | 202.102.58.209 80 HTTP 551 | 27.116.21.162 8080 HTTP 552 | 118.26.231.104 5060 HTTP 553 | 183.221.250.141 80 HTTP 554 | 222.74.98.234 8080 HTTP 555 | 222.89.154.6 9000 HTTP 556 | 202.28.110.17 8080 HTTP 557 | 211.167.112.17 82 HTTP 558 | 221.176.14.72 80 HTTP 559 | 89.218.100.250 9090 HTTP 560 | 2.133.92.162 9090 HTTP 561 | 118.194.164.90 54321 HTTP 562 | 186.101.41.25 80 HTTP 563 | 118.195.65.243 80 HTTP 564 | 5.8.242.10 8080 HTTP 565 | 202.107.195.231 80 HTTP 566 | 218.28.254.77 880 HTTP 567 | 221.176.169.194 8001 HTTP 568 | 89.218.101.74 9090 HTTP 569 | 46.249.66.50 80 HTTP 570 | 119.7.221.137 81 HTTP 571 | 119.167.231.183 80 HTTP 572 | 41.78.25.69 8080 HTTP 573 | 210.212.152.5 80 HTTP 574 | 221.6.15.156 82 HTTP 575 | 221.181.192.25 80 HTTP 576 | 2.133.93.154 9090 HTTP 577 | 210.101.131.232 8080 HTTP 578 | 93.90.235.178 8080 HTTP 579 | 119.7.221.137 83 HTTP 580 | 186.101.65.115 80 HTTP 581 | 50.22.206.179 8080 HTTP 582 | 221.6.15.157 82 HTTP 583 | 93.94.180.15 8080 HTTP 584 | 2.133.94.42 9090 HTTP 585 | 218.94.1.166 82 HTTP 586 | 110.139.206.93 80 HTTP 587 | 119.62.128.172 80 HTTP 588 | 61.8.72.99 8080 HTTP 589 | 141.105.87.77 80 HTTP 590 | 41.215.3.98 80 HTTP 591 | 222.88.94.245 80 HTTP 592 | 101.255.36.30 808 HTTP 593 | 218.249.114.42 8088 HTTP 594 | 111.161.30.227 80 
HTTP 595 | 121.14.9.76 80 HTTP 596 | 187.4.63.148 80 HTTP 597 | 68.71.76.242 8082 HTTP 598 | 197.251.194.121 8080 HTTP 599 | 119.187.148.102 8000 HTTP 600 | 222.92.141.155 8090 HTTP 601 | 101.255.60.158 8080 HTTP 602 | 2.135.237.194 9090 HTTP 603 | 111.161.30.232 80 HTTP 604 | 121.17.144.132 8080 HTTP 605 | 187.4.63.149 80 HTTP 606 | 121.52.144.245 80 HTTP 607 | 74.221.215.254 8080 HTTP 608 | 122.115.62.108 8081 HTTP 609 | 221.130.23.29 80 HTTP 610 | 222.187.222.118 8080 HTTP 611 | 103.11.99.162 8080 HTTP 612 | 2.135.237.250 9090 HTTP 613 | 2.133.94.26 9090 HTTP 614 | 112.125.120.145 10080 HTTP 615 | 122.4.78.26 34808 HTTP 616 | 122.72.0.145 80 HTTP 617 | 78.80.36.194 8080 HTTP 618 | 219.76.104.18 8080 HTTP 619 | 221.179.173.170 8080 HTTP 620 | 222.188.10.1 1080 SOCKS4 621 | 103.246.145.184 8080 HTTP 622 | 2.135.242.162 9090 HTTP 623 | 2.135.238.146 9090 HTTP 624 | 122.72.0.28 80 HTTP 625 | 187.33.208.250 8080 HTTP 626 | 122.72.2.180 80 HTTP 627 | 78.131.55.82 8080 HTTP 628 | 58.67.147.196 8080 HTTP 629 | 202.116.1.149 8128 HTTP 630 | 221.195.42.195 8080 HTTP 631 | 222.217.99.156 9000 HTTP 632 | 2.135.242.170 9090 HTTP 633 | 24.158.199.54 8082 HTTP 634 | 117.121.238.17 8080 HTTP 635 | 123.125.74.212 80 HTTP 636 | 187.115.65.187 80 HTTP 637 | 122.72.2.180 8080 HTTP 638 | 82.79.92.226 8080 HTTP 639 | 202.137.22.182 8080 HTTP 640 | 223.4.173.109 808 HTTP 641 | 2.135.242.186 9090 HTTP 642 | 118.96.192.84 8080 HTTP 643 | 124.240.187.79 80 HTTP 644 | 91.202.164.29 8080 HTTP 645 | 103.10.22.231 8080 HTTP 646 | 27.54.218.248 80 HTTP 647 | 222.74.98.234 8080 HTTP 648 | 27.50.132.145 88 HTTP 649 | 2.135.242.226 9090 HTTP 650 | 190.67.169.194 8080 HTTP 651 | 124.240.187.79 83 HTTP 652 | 123.164.148.132 80 HTTP 653 | 101.255.33.249 80 HTTP 654 | 69.29.105.153 8080 HTTP 655 | 58.67.147.205 8080 HTTP 656 | 61.167.49.188 8080 HTTP 657 | 124.240.187.80 80 HTTP 658 | 190.66.22.53 8080 HTTP 659 | 109.207.61.170 8090 HTTP 660 | 180.248.156.56 8080 HTTP 661 | 114.113.221.69 54321 HTTP 662 | 124.95.142.94 80 HTTP 663 | 58.248.254.38 80 HTTP 664 | -------------------------------------------------------------------------------- /proxycrawler/proxycrawler/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/feiskyer/scrapy-examples/3f72fe87d9272859f7bce31d0394ca078a6ab4a3/proxycrawler/proxycrawler/__init__.py -------------------------------------------------------------------------------- /proxycrawler/proxycrawler/items.py: -------------------------------------------------------------------------------- 1 | # Define here the models for your scraped items 2 | # 3 | # See documentation in: 4 | # http://doc.scrapy.org/topics/items.html 5 | 6 | from scrapy.item import Item, Field 7 | 8 | class ProxycrawlerItem(Item): 9 | address = Field() 10 | port = Field() 11 | protocol = Field() 12 | location = Field() 13 | 14 | type = Field() # 0: anonymity #1 nonanonymity 15 | delay = Field() # in second 16 | timestamp = Field() 17 | -------------------------------------------------------------------------------- /proxycrawler/proxycrawler/pipelines.py: -------------------------------------------------------------------------------- 1 | # Define your item pipelines here 2 | # 3 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 4 | # See: http://doc.scrapy.org/topics/item-pipeline.html 5 | # 6 | # Reference: http://www.cnblogs.com/igloo1986/archive/2012/08/25/2655597.html 7 | 8 | import urllib 9 | from scrapy.exceptions import DropItem 
10 | import socket 11 | 12 | class ProxycrawlerPipeline(object): 13 | def process_item(self, item, spider): 14 | protocol = item['protocol'] 15 | address = item['address'] 16 | port = item['port'] 17 | proxies = {protocol: '%s:%s' % (address, port)} 18 | 19 | # Check the proxy by fetching a small page through it 20 | try: 21 | socket.setdefaulttimeout(3) 22 | data = urllib.urlopen('http://ifconfig.me/ip', proxies=proxies).read() 23 | except: 24 | raise DropItem("Proxy %s:%s failed the connectivity check" % (address, port)) 25 | 26 | if data: 27 | line = '%s\t%s\t%s\n' % (address, port, protocol) 28 | open('proxies.txt', 'a+').write(line) 29 | return item 30 | else: 31 | raise DropItem("Invalid proxy %s:%s" % (address, port)) 32 | -------------------------------------------------------------------------------- /proxycrawler/proxycrawler/settings.py: -------------------------------------------------------------------------------- 1 | # Scrapy settings for proxycrawler project 2 | # 3 | # For simplicity, this file contains only the most important settings by 4 | # default. All the other settings are documented here: 5 | # 6 | # http://doc.scrapy.org/topics/settings.html 7 | # 8 | 9 | BOT_NAME = 'proxycrawler' 10 | 11 | SPIDER_MODULES = ['proxycrawler.spiders'] 12 | NEWSPIDER_MODULE = 'proxycrawler.spiders' 13 | ITEM_PIPELINES = ['proxycrawler.pipelines.ProxycrawlerPipeline'] 14 | 15 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 16 | # USER_AGENT = 'proxycrawler (+http://www.yourdomain.com)' 17 | -------------------------------------------------------------------------------- /proxycrawler/proxycrawler/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders.
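# As an illustrative sketch only (ExampleSpider and its URL are placeholders, not
# part of this project), a minimal spider module dropped into this package would
# look roughly like:
#
#     from scrapy.spider import BaseSpider
#
#     class ExampleSpider(BaseSpider):
#         name = 'example'
#         start_urls = ['http://www.example.com']
#
#         def parse(self, response):
#             pass  # extract items or follow links here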
5 | -------------------------------------------------------------------------------- /proxycrawler/proxycrawler/spiders/proxy.py: -------------------------------------------------------------------------------- 1 | # See http://www.cnblogs.com/igloo1986/archive/2012/08/25/2655597.html 2 | from scrapy.selector import HtmlXPathSelector 3 | from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor 4 | from scrapy.contrib.spiders import CrawlSpider, Rule 5 | from proxycrawler.items import ProxycrawlerItem 6 | import re 7 | 8 | class ProxySpider(CrawlSpider): 9 | name = 'proxy' 10 | allowed_domains = ['www.cnproxy.com'] 11 | indexes = [1,2,3,4,5,6,7,8,9,10] 12 | start_urls = [] 13 | for i in indexes: 14 | url = 'http://www.cnproxy.com/proxy%s.html' % i 15 | start_urls.append(url) 16 | start_urls.append('http://www.cnproxy.com/proxyedu1.html') 17 | start_urls.append('http://www.cnproxy.com/proxyedu2.html') 18 | 19 | def parse(self, response): 20 | hxs = HtmlXPathSelector(response) 21 | addresses = hxs.select('//tr[position()>1]/td[position()=1]').re('\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}') 22 | protocols = hxs.select('//tr[position()>1]/td[position()=2]').re('(.*)<\/td>') 23 | locations = hxs.select('//tr[position()>1]/td[position()=4]').re('(.*)<\/td>') 24 | ports_re = re.compile('write\(":"(.*)\)') # ports are hidden behind JavaScript like document.write(":"+z+m) 25 | raw_ports = ports_re.findall(response.body) 26 | port_map = {'z':'3','m':'4','k':'2','l':'9','d':'0','b':'5','i':'7','w':'6','r':'8','c':'1','+':''} # cnproxy's letter-to-digit code; '+' is the JS concatenation operator, so it is dropped 27 | ports = [] 28 | for port in raw_ports: 29 | tmp = port 30 | for key in port_map: 31 | tmp = tmp.replace(key, port_map[key]) 32 | ports.append(tmp) # e.g. 'z+m' decodes to '34' 33 | items = [] 34 | for i in range(len(addresses)): 35 | item = ProxycrawlerItem() 36 | item['address'] = addresses[i] 37 | item['protocol'] = protocols[i] 38 | item['location'] = locations[i] 39 | item['port'] = ports[i] 40 | items.append(item) 41 | return items 42 | -------------------------------------------------------------------------------- /proxycrawler/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # http://doc.scrapy.org/topics/scrapyd.html 5 | 6 | [settings] 7 | default = proxycrawler.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = proxycrawler 12 | -------------------------------------------------------------------------------- /scrapy-ws.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | Example script to control a Scrapy server using its JSON-RPC web service. 4 | 5 | It only provides a reduced functionality as its main purpose is to illustrate 6 | how to write a web service client. Feel free to improve or write your own. 7 | 8 | Also, keep in mind that the JSON-RPC API is not stable. The recommended way for 9 | controlling a Scrapy server is through the execution queue (see the "queue" 10 | command).
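Typical invocations, as a sketch: these assume the Scrapy JSON-RPC web service
is enabled and listening on the default localhost:6080 (override with -H/-P),
and that 'somespider' is the name of a currently running spider:

    scrapy-ws.py list-available
    scrapy-ws.py list-running
    scrapy-ws.py get-spider-stats somespider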
11 | 12 | """ 13 | 14 | import sys, optparse, urllib, json 15 | from urlparse import urljoin 16 | 17 | from scrapy.utils.jsonrpc import jsonrpc_client_call, JsonRpcError 18 | 19 | def get_commands(): 20 | return { 21 | 'help': cmd_help, 22 | 'stop': cmd_stop, 23 | 'list-available': cmd_list_available, 24 | 'list-running': cmd_list_running, 25 | 'list-resources': cmd_list_resources, 26 | 'get-global-stats': cmd_get_global_stats, 27 | 'get-spider-stats': cmd_get_spider_stats, 28 | } 29 | 30 | def cmd_help(args, opts): 31 | """help - list available commands""" 32 | print "Available commands:" 33 | for _, func in sorted(get_commands().items()): 34 | print " ", func.__doc__ 35 | 36 | def cmd_stop(args, opts): 37 | """stop - stop a running spider""" 38 | jsonrpc_call(opts, 'crawler/engine', 'close_spider', args[0]) 39 | 40 | def cmd_list_running(args, opts): 41 | """list-running - list running spiders""" 42 | for x in json_get(opts, 'crawler/engine/open_spiders'): 43 | print x 44 | 45 | def cmd_list_available(args, opts): 46 | """list-available - list name of available spiders""" 47 | for x in jsonrpc_call(opts, 'crawler/spiders', 'list'): 48 | print x 49 | 50 | def cmd_list_resources(args, opts): 51 | """list-resources - list available web service resources""" 52 | for x in json_get(opts, '')['resources']: 53 | print x 54 | 55 | def cmd_get_spider_stats(args, opts): 56 | """get-spider-stats - get stats of a running spider""" 57 | stats = jsonrpc_call(opts, 'stats', 'get_stats', args[0]) 58 | for name, value in stats.items(): 59 | print "%-40s %s" % (name, value) 60 | 61 | def cmd_get_global_stats(args, opts): 62 | """get-global-stats - get global stats""" 63 | stats = jsonrpc_call(opts, 'stats', 'get_stats') 64 | for name, value in stats.items(): 65 | print "%-40s %s" % (name, value) 66 | 67 | def get_wsurl(opts, path): 68 | return urljoin("http://%s:%s/"% (opts.host, opts.port), path) 69 | 70 | def jsonrpc_call(opts, path, method, *args, **kwargs): 71 | url = get_wsurl(opts, path) 72 | return jsonrpc_client_call(url, method, *args, **kwargs) 73 | 74 | def json_get(opts, path): 75 | url = get_wsurl(opts, path) 76 | return json.loads(urllib.urlopen(url).read()) 77 | 78 | def parse_opts(): 79 | usage = "%prog [options] [arg] ..." 80 | description = "Scrapy web service control script. Use '%prog help' " \ 81 | "to see the list of available commands." 
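# For reference, 'scrapy-ws.py help' simply prints the docstring of every command
# registered in get_commands() above, one per line, e.g. "help - list available
# commands" and "stop - stop a running spider".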
82 | op = optparse.OptionParser(usage=usage, description=description) 83 | op.add_option("-H", dest="host", default="localhost", \ 84 | help="Scrapy host to connect to") 85 | op.add_option("-P", dest="port", type="int", default=6080, \ 86 | help="Scrapy port to connect to") 87 | opts, args = op.parse_args() 88 | if not args: 89 | op.print_help() 90 | sys.exit(2) 91 | cmdname, cmdargs = args[0], args[1:] 92 | commands = get_commands() 93 | if cmdname not in commands: 94 | sys.stderr.write("Unknown command: %s\n\n" % cmdname) 95 | cmd_help(None, None) 96 | sys.exit(1) 97 | return commands[cmdname], cmdargs, opts 98 | 99 | def main(): 100 | cmd, args, opts = parse_opts() 101 | try: 102 | cmd(args, opts) 103 | except IndexError: 104 | print cmd.__doc__ 105 | except JsonRpcError, e: 106 | print str(e) 107 | if e.data: 108 | print "Server Traceback below:" 109 | print e.data 110 | 111 | 112 | if __name__ == '__main__': 113 | main() 114 | -------------------------------------------------------------------------------- /server/code.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | #coding=utf8 3 | import os, sys 4 | import web 5 | import view 6 | import config 7 | import db 8 | from view import render 9 | 10 | urls = ( 11 | '/', 'index', 12 | '/page/(\d+)', 'index' 13 | ) 14 | 15 | class index: 16 | def GET(self, page=1): 17 | page = int(page) 18 | limit = config.perpage 19 | offset = (page - 1)*limit 20 | counting = db.counting() 21 | pages = counting / limit # Python 2 integer division, e.g. 51 items at 25 per page gives 2 22 | if counting % limit > 0: 23 | pages += 1 # round up for a partial last page (51 items -> 3 pages) 24 | if page > pages: 25 | raise web.seeother('/') 26 | else: 27 | return render.base(view.listing(offset=offset,limit=limit), 28 | pages=pages, curpage=page) 29 | 30 | if __name__ == "__main__": 31 | if len(sys.argv)==1: 32 | port = os.environ.get("PORT", "8081") 33 | sys.argv.append(port) 34 | app = web.application(urls, globals()) 35 | app.internalerror = web.debugerror 36 | app.run() 37 | -------------------------------------------------------------------------------- /server/config.py: -------------------------------------------------------------------------------- 1 | #coding=utf8 2 | import web 3 | DB = web.database(dbn='mysql', db='news', user='root', pw='feisky') 4 | cache = False 5 | perpage = 25 6 | -------------------------------------------------------------------------------- /server/db.py: -------------------------------------------------------------------------------- 1 | #coding=utf8 2 | import config 3 | 4 | def listing(**k): 5 | return config.DB.select('news', order='created desc', **k) 6 | 7 | def counting(): 8 | return config.DB.query('select count(*) as count from news')[0].count 9 | -------------------------------------------------------------------------------- /server/sql/tables.sql: -------------------------------------------------------------------------------- 1 | CREATE TABLE items ( 2 | id serial primary key, 3 | author_id int references users, 4 | body text, 5 | created timestamp default current_timestamp 6 | ); 7 | -------------------------------------------------------------------------------- /server/static/favicon.ico: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/feiskyer/scrapy-examples/3f72fe87d9272859f7bce31d0394ca078a6ab4a3/server/static/favicon.ico -------------------------------------------------------------------------------- /server/templates/base.html:
-------------------------------------------------------------------------------- 1 | $def with (page, title=None, pages=1, curpage=1) 2 | 3 | 4 | $if title: : $title\ 5 | $else: Tech News 6 | 7 | 8 | 9 | 10 | Tech News Home 11 | $:page 12 | 13 | $if curpage!=1: 14 | Prev 15 | 16 | $if curpage+1 in range(1,pages+1): 17 | Next 18 | 19 | 20 | -------------------------------------------------------------------------------- /server/templates/item.html: -------------------------------------------------------------------------------- 1 | $def with (item) 2 | 3 | 4 | $item.title 5 | ($item.site) 6 | $datestr(item.created) 7 | 8 | 9 | 10 | $:item.abstract 11 |
12 | -------------------------------------------------------------------------------- /server/templates/listing.html: -------------------------------------------------------------------------------- 1 | $def with (items) 2 | 3 | $for item in items: 4 | $:render.item(item) -------------------------------------------------------------------------------- /server/view.py: -------------------------------------------------------------------------------- 1 | #coding=utf8 2 | import web 3 | import db 4 | import config 5 | import datetime 6 | 7 | def datestr(x): 8 | """ 9 | MySQL's creation DDL can't easily be set to UTC, so adjust the datestr 10 | function to local time, which we assume matches the database server's timezone. 11 | """ 12 | return web.datestr(x, datetime.datetime.now()) 13 | 14 | t_globals = dict( 15 | datestr=datestr, 16 | ) 17 | render = web.template.render('templates/', cache=config.cache, 18 | globals=t_globals) 19 | render._keywords['globals']['render'] = render 20 | 21 | def listing(**k): 22 | l = db.listing(**k) 23 | return render.listing(l) 24 | -------------------------------------------------------------------------------- /server/webkit2png: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # 3 | # webkit2png.py 4 | # 5 | # Creates screenshots of webpages using QtWebKit. 6 | # 7 | # Copyright (c) 2008 Roland Tapken 8 | # 9 | # This program is free software; you can redistribute it and/or 10 | # modify it under the terms of the GNU General Public License 11 | # as published by the Free Software Foundation; either version 2 12 | # of the License, or (at your option) any later version. 13 | # 14 | # This program is distributed in the hope that it will be useful, 15 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 16 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 17 | # GNU General Public License for more details. 18 | # 19 | # You should have received a copy of the GNU General Public License 20 | # along with this program; if not, write to the Free Software 21 | # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA 22 | # 23 | # Nice ideas "todo": 24 | # - Add QTcpSocket support to create a "screenshot daemon" that 25 | # can handle multiple requests at the same time. 26 | 27 | from webkit2png import WebkitRenderer 28 | 29 | import sys 30 | import signal 31 | import os 32 | import urlparse 33 | import logging 34 | from optparse import OptionParser 35 | 36 | from PyQt4.QtCore import * 37 | from PyQt4.QtGui import * 38 | from PyQt4.QtWebKit import * 39 | from PyQt4.QtNetwork import * 40 | 41 | VERSION = "20091224" 42 | LOG_FILENAME = 'webkit2png.log' 43 | logger = logging.getLogger('webkit2png') 44 | 45 | def init_qtgui(display=None, style=None, qtargs=None): 46 | """Initiates the QApplication environment using the given args.""" 47 | if QApplication.instance(): 48 | logger.debug("QApplication has already been instantiated.
\ 49 | Ignoring given arguments and returning existing QApplication.") 50 | return QApplication.instance() 51 | 52 | qtargs2 = [sys.argv[0]] 53 | 54 | if display: 55 | qtargs2.append('-display') 56 | qtargs2.append(display) 57 | # Also export DISPLAY var as this may be used 58 | # by flash plugin 59 | os.environ["DISPLAY"] = display 60 | 61 | if style: 62 | qtargs2.append('-style') 63 | qtargs2.append(style) 64 | 65 | qtargs2.extend(qtargs or []) 66 | 67 | return QApplication(qtargs2) 68 | 69 | 70 | if __name__ == '__main__': 71 | # This code will be executed if this module is run 'as-is'. 72 | 73 | # Enable HTTP proxy 74 | if 'http_proxy' in os.environ: 75 | proxy_url = urlparse.urlparse(os.environ.get('http_proxy')) 76 | proxy = QNetworkProxy(QNetworkProxy.HttpProxy, proxy_url.hostname, proxy_url.port) 77 | QNetworkProxy.setApplicationProxy(proxy) 78 | 79 | # Parse command line arguments. 80 | # Syntax: 81 | # $0 [--xvfb|--display=DISPLAY] [--debug] [--output=FILENAME] 82 | 83 | description = "Creates a screenshot of a website using QtWebKit. " \ 84 | + "This program comes with ABSOLUTELY NO WARRANTY. " \ 85 | + "This is free software, and you are welcome to redistribute " \ 86 | + "it under the terms of the GNU General Public License v2." 87 | 88 | parser = OptionParser(usage="usage: %prog [options] ", 89 | version="%prog " + VERSION + ", Copyright (c) Roland Tapken", 90 | description=description, add_help_option=True) 91 | parser.add_option("-x", "--xvfb", nargs=2, type="int", dest="xvfb", 92 | help="Start an 'xvfb' instance with the given desktop size.", metavar="WIDTH HEIGHT") 93 | parser.add_option("-g", "--geometry", dest="geometry", nargs=2, default=(0, 0), type="int", 94 | help="Geometry of the virtual browser window (0 means 'autodetect') [default: %default].", metavar="WIDTH HEIGHT") 95 | parser.add_option("-o", "--output", dest="output", 96 | help="Write output to FILE instead of STDOUT.", metavar="FILE") 97 | parser.add_option("-f", "--format", dest="format", default="png", 98 | help="Output image format [default: %default]", metavar="FORMAT") 99 | parser.add_option("--scale", dest="scale", nargs=2, type="int", 100 | help="Scale the image to this size", metavar="WIDTH HEIGHT") 101 | parser.add_option("--aspect-ratio", dest="ratio", type="choice", choices=["ignore", "keep", "expand", "crop"], 102 | help="One of 'ignore', 'keep', 'crop' or 'expand' [default: %default]") 103 | parser.add_option("-F", "--feature", dest="features", action="append", type="choice", 104 | choices=["javascript", "plugins"], 105 | help="Enable additional Webkit features ('javascript', 'plugins')", metavar="FEATURE") 106 | parser.add_option("-w", "--wait", dest="wait", default=0, type="int", 107 | help="Time to wait after loading before the screenshot is taken [default: %default]", metavar="SECONDS") 108 | parser.add_option("-t", "--timeout", dest="timeout", default=0, type="int", 109 | help="Time before the request will be canceled [default: %default]", metavar="SECONDS") 110 | parser.add_option("-W", "--window", dest="window", action="store_true", 111 | help="Grab whole window instead of frame (may be required for plugins)", default=False) 112 | parser.add_option("-T", "--transparent", dest="transparent", action="store_true", 113 | help="Render output on a transparent background (Be sure to have a transparent background defined in the html)", default=False) 114 | parser.add_option("", "--style", dest="style", 115 | help="Change the Qt look and feel to STYLE (e.g.
'windows').", metavar="STYLE") 116 | parser.add_option("", "--encoded-url", dest="encoded_url", action="store_true", 117 | help="Treat URL as url-encoded", metavar="ENCODED_URL", default=False) 118 | parser.add_option("-d", "--display", dest="display", 119 | help="Connect to X server at DISPLAY.", metavar="DISPLAY") 120 | parser.add_option("--debug", action="store_true", dest="debug", 121 | help="Show debugging information.", default=False) 122 | parser.add_option("--log", action="store", dest="logfile", default=LOG_FILENAME, 123 | help="Select the log output file",) 124 | 125 | # Parse command line arguments and validate them (as far as we can) 126 | (options,args) = parser.parse_args() 127 | if len(args) != 1: 128 | parser.error("incorrect number of arguments") 129 | if options.display and options.xvfb: 130 | parser.error("options -x and -d are mutually exclusive") 131 | options.url = args[0] 132 | 133 | logging.basicConfig(filename=options.logfile, level=logging.WARN) 134 | 135 | # Enable output of debugging information 136 | if options.debug: 137 | logger.setLevel(logging.DEBUG) 138 | 139 | if options.xvfb: 140 | # Start 'xvfb' instance by replacing the current process 141 | server_num = int(os.getpid() + 1e6) 142 | newArgs = ["xvfb-run", "--auto-servernum", "--server-num", str(server_num), "--server-args=-screen 0, %dx%dx24" % options.xvfb, sys.argv[0]] 143 | skipArgs = 0 144 | for i in range(1, len(sys.argv)): 145 | if skipArgs > 0: 146 | skipArgs -= 1 147 | elif sys.argv[i] in ["-x", "--xvfb"]: 148 | skipArgs = 2 # following: width and height 149 | else: 150 | newArgs.append(sys.argv[i]) 151 | logger.debug("Executing %s" % " ".join(newArgs)) 152 | try: 153 | os.execvp(newArgs[0], newArgs) # argv[0] must be included in the argument list 154 | except OSError: 155 | logger.error("Unable to find '%s'" % newArgs[0]) 156 | print >> sys.stderr, "Error - Unable to find '%s' for -x/--xvfb option" % newArgs[0] 157 | sys.exit(1) 158 | 159 | # Prepare output (defaults to STDOUT) 160 | if options.output is None: 161 | options.output = sys.stdout 162 | else: 163 | options.output = open(options.output, "wb") # binary mode: the output is image data 164 | 165 | logger.debug("Version %s, Python %s, Qt %s", VERSION, sys.version, qVersion()) 166 | 167 | # Technically, this is a QtGui application, because QWebPage requires it 168 | # to be. But because we will have no user interaction, and rendering can 169 | # not start before 'app.exec_()' is called, we have to trigger our "main" 170 | # by a timer event. 171 | def __main_qt(): 172 | # Render the page.
173 | # If this method times out or loading fails, a 174 | # RuntimeError is raised 175 | try: 176 | # Initialize WebkitRenderer object 177 | renderer = WebkitRenderer() 178 | renderer.logger = logger 179 | renderer.width = options.geometry[0] 180 | renderer.height = options.geometry[1] 181 | renderer.timeout = options.timeout 182 | renderer.wait = options.wait 183 | renderer.format = options.format 184 | renderer.grabWholeWindow = options.window 185 | renderer.renderTransparentBackground = options.transparent 186 | renderer.encodedUrl = options.encoded_url 187 | 188 | if options.scale: 189 | renderer.scaleRatio = options.ratio 190 | renderer.scaleToWidth = options.scale[0] 191 | renderer.scaleToHeight = options.scale[1] 192 | 193 | if options.features: 194 | if "javascript" in options.features: 195 | renderer.qWebSettings[QWebSettings.JavascriptEnabled] = True 196 | if "plugins" in options.features: 197 | renderer.qWebSettings[QWebSettings.PluginsEnabled] = True 198 | 199 | renderer.render_to_file(url=options.url, file_object=options.output) 200 | options.output.close() 201 | QApplication.exit(0) 202 | except RuntimeError, e: 203 | logger.error("main: %s" % e) 204 | print >> sys.stderr, e 205 | QApplication.exit(1) 206 | 207 | # Initialize Qt-Application, but make this script 208 | # abortable via CTRL-C 209 | app = init_qtgui(display=options.display, style=options.style) 210 | signal.signal(signal.SIGINT, signal.SIG_DFL) 211 | 212 | QTimer.singleShot(0, __main_qt) 213 | sys.exit(app.exec_()) -------------------------------------------------------------------------------- /server/webkit2png.py: -------------------------------------------------------------------------------- 1 | # 2 | # webkit2png.py 3 | # 4 | # Creates screenshots of webpages using QtWebKit. 5 | # 6 | # Copyright (c) 2008 Roland Tapken 7 | # 8 | # This program is free software; you can redistribute it and/or 9 | # modify it under the terms of the GNU General Public License 10 | # as published by the Free Software Foundation; either version 2 11 | # of the License, or (at your option) any later version. 12 | # 13 | # This program is distributed in the hope that it will be useful, 14 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 15 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 16 | # GNU General Public License for more details. 17 | # 18 | # You should have received a copy of the GNU General Public License 19 | # along with this program; if not, write to the Free Software 20 | # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA 21 | # 22 | # Nice ideas "todo": 23 | # - Add QTcpSocket support to create a "screenshot daemon" that 24 | # can handle multiple requests at the same time. 25 | 26 | import time 27 | 28 | from PyQt4.QtCore import * 29 | from PyQt4.QtGui import * 30 | from PyQt4.QtWebKit import * 31 | from PyQt4.QtNetwork import * 32 | 33 | # Class for Website-Rendering. Uses QWebPage, which 34 | # requires a running QtGui to work. 35 | class WebkitRenderer(QObject): 36 | """A class that helps to create 'screenshots' of webpages using 37 | Qt's QWebKit. Requires the PyQt4 library. 38 | 39 | Use "render()" to get a 'QImage' object, render_to_bytes() to get the 40 | resulting image as 'str' object or render_to_file() to write the image 41 | directly into a 'file' resource. 42 | 43 | These methods have to be called from within Qt's main (GUI) thread.
44 | An example of how to use this is the __qt_main() method at the end 45 | of the library's source file. More generic examples: 46 | 47 | def qt_main(): 48 | while go_on(): 49 | do_something_meaningful() 50 | while QApplication.hasPendingEvents(): 51 | QApplication.processEvents() 52 | QApplication.quit() 53 | 54 | app = init_qtgui() 55 | QTimer.singleShot(0, qt_main) 56 | sys.exit(app.exec_()) 57 | 58 | Or let Qt handle event processing using a QTimer instance: 59 | 60 | def qt_main_loop(): 61 | if not go_on(): 62 | QApplication.quit() 63 | return 64 | do_something_meaningful() 65 | 66 | app = init_qtgui() 67 | main_timer = QTimer() 68 | QObject.connect(main_timer, QtCore.SIGNAL("timeout()"), qt_main_loop) 69 | sys.exit(app.exec_()) 70 | 71 | Available properties: 72 | width -- The width of the "browser" window. 0 means autodetect (default). 73 | height -- The height of the window. 0 means autodetect (default). 74 | timeout -- Seconds after which the request is aborted (default: 0) 75 | wait -- Seconds to wait after loading has finished (default: 0) 76 | scaleToWidth -- The resulting image is scaled to this width. 77 | scaleToHeight -- The resulting image is scaled to this height. 78 | scaleRatio -- The image is scaled using this method. Possible values are: 79 | keep 80 | expand 81 | crop 82 | ignore 83 | grabWholeWindow -- If this is True a screenshot of the whole window is taken. Otherwise only the current frame is rendered. This is required for plugins to be visible, but it is possible that another window overlays the current one while the screenshot is taken. To reduce this possibility, the window is activated just before it is rendered if this property is set to True (default: False). 84 | qWebSettings -- Settings that should be assigned to the created QWebPage instance. See http://doc.trolltech.com/4.6/qwebsettings.html for possible keys. Defaults: 85 | JavascriptEnabled: False 86 | PluginsEnabled: False 87 | PrivateBrowsingEnabled: True 88 | JavascriptCanOpenWindows: False 89 | """ 90 | 91 | def __init__(self,**kwargs): 92 | """Sets default values for the properties.""" 93 | 94 | if not QApplication.instance(): 95 | raise RuntimeError(self.__class__.__name__ + " requires a running QApplication instance") 96 | QObject.__init__(self) 97 | 98 | # Initialize default properties 99 | self.width = kwargs.get('width', 0) 100 | self.height = kwargs.get('height', 0) 101 | self.timeout = kwargs.get('timeout', 0) 102 | self.wait = kwargs.get('wait', 0) 103 | self.scaleToWidth = kwargs.get('scaleToWidth', 0) 104 | self.scaleToHeight = kwargs.get('scaleToHeight', 0) 105 | self.scaleRatio = kwargs.get('scaleRatio', 'keep') 106 | self.format = kwargs.get('format', 'png') 107 | self.logger = kwargs.get('logger', None) 108 | # Set this to true if you want to capture flash. 109 | # Note that your desktop must be large enough for 110 | # fitting the whole window.
111 | self.grabWholeWindow = kwargs.get('grabWholeWindow', False) 112 | self.renderTransparentBackground = kwargs.get('renderTransparentBackground', False) 113 | self.ignoreAlert = kwargs.get('ignoreAlert', True) 114 | self.ignoreConfirm = kwargs.get('ignoreConfirm', True) 115 | self.ignorePrompt = kwargs.get('ignorePrompt', True) 116 | self.interruptJavaScript = kwargs.get('interruptJavaScript', True) 117 | self.encodedUrl = kwargs.get('encodedUrl', False) 118 | 119 | # Set some default options for QWebPage 120 | self.qWebSettings = { 121 | QWebSettings.JavascriptEnabled : False, 122 | QWebSettings.PluginsEnabled : False, 123 | QWebSettings.PrivateBrowsingEnabled : True, 124 | QWebSettings.JavascriptCanOpenWindows : False 125 | } 126 | 127 | 128 | def render(self, url): 129 | """Renders the given URL into a QImage object""" 130 | # We have to use this helper object because 131 | # QApplication.processEvents may be called, causing 132 | # this method to get called while it has not returned yet. 133 | helper = _WebkitRendererHelper(self) 134 | helper._window.resize( self.width, self.height ) 135 | image = helper.render(url) 136 | 137 | # Bind helper instance to this image to prevent the 138 | # object from being cleaned up (and with it the QWebPage, etc) 139 | # before the data has been used. 140 | image.helper = helper 141 | 142 | return image 143 | 144 | def render_to_file(self, url, file_object): 145 | """Renders the image into a File resource. 146 | Returns the size of the data that has been written. 147 | """ 148 | format = self.format # this may not be constant due to processEvents() 149 | image = self.render(url) 150 | qBuffer = QBuffer() 151 | image.save(qBuffer, format) 152 | file_object.write(qBuffer.buffer().data()) 153 | return qBuffer.size() 154 | 155 | def render_to_bytes(self, url): 156 | """Renders the image into an object of type 'str'""" 157 | format = self.format # this may not be constant due to processEvents() 158 | image = self.render(url) 159 | qBuffer = QBuffer() 160 | image.save(qBuffer, format) 161 | return qBuffer.buffer().data() 162 | 163 | class _WebkitRendererHelper(QObject): 164 | """This helper class is doing the real work. It is required to 165 | allow WebkitRenderer.render() to be called "asynchronously" 166 | (but always from Qt's GUI thread). 167 | """ 168 | 169 | def __init__(self, parent): 170 | """Copies the properties from the parent (WebkitRenderer) object, 171 | creates the required instances of QWebPage, QWebView and QMainWindow 172 | and registers some Slots. 
173 | """ 174 | QObject.__init__(self) 175 | 176 | # Copy properties from parent 177 | for key,value in parent.__dict__.items(): 178 | setattr(self,key,value) 179 | 180 | # Create and connect required PyQt4 objects 181 | self._page = CustomWebPage(logger=self.logger, ignore_alert=self.ignoreAlert, 182 | ignore_confirm=self.ignoreConfirm, ignore_prompt=self.ignorePrompt, 183 | interrupt_js=self.interruptJavaScript) 184 | self._view = QWebView() 185 | self._view.setPage(self._page) 186 | self._window = QMainWindow() 187 | self._window.setCentralWidget(self._view) 188 | 189 | # Import QWebSettings 190 | for key, value in self.qWebSettings.iteritems(): 191 | self._page.settings().setAttribute(key, value) 192 | 193 | # Connect required event listeners 194 | self.connect(self._page, SIGNAL("loadFinished(bool)"), self._on_load_finished) 195 | self.connect(self._page, SIGNAL("loadStarted()"), self._on_load_started) 196 | self.connect(self._page.networkAccessManager(), SIGNAL("sslErrors(QNetworkReply *,const QList&)"), self._on_ssl_errors) 197 | self.connect(self._page.networkAccessManager(), SIGNAL("finished(QNetworkReply *)"), self._on_each_reply) 198 | 199 | # The way we will use this, it seems to be unesseccary to have Scrollbars enabled 200 | self._page.mainFrame().setScrollBarPolicy(Qt.Horizontal, Qt.ScrollBarAlwaysOff) 201 | self._page.mainFrame().setScrollBarPolicy(Qt.Vertical, Qt.ScrollBarAlwaysOff) 202 | self._page.settings().setUserStyleSheetUrl(QUrl("data:text/css,html,body{overflow-y:hidden !important;}")) 203 | 204 | # Show this widget 205 | self._window.show() 206 | 207 | def __del__(self): 208 | """Clean up Qt4 objects. """ 209 | self._window.close() 210 | del self._window 211 | del self._view 212 | del self._page 213 | 214 | def render(self, url): 215 | """The real worker. Loads the page (_load_page) and awaits 216 | the end of the given 'delay'. While it is waiting outstanding 217 | QApplication events are processed. 218 | After the given delay, the Window or Widget (depends 219 | on the value of 'grabWholeWindow' is drawn into a QPixmap 220 | and postprocessed (_post_process_image). 221 | """ 222 | self._load_page(url, self.width, self.height, self.timeout) 223 | # Wait for end of timer. In this time, process 224 | # other outstanding Qt events. 225 | if self.wait > 0: 226 | if self.logger: self.logger.debug("Waiting %d seconds " % self.wait) 227 | waitToTime = time.time() + self.wait 228 | while time.time() < waitToTime: 229 | if QApplication.hasPendingEvents(): 230 | QApplication.processEvents() 231 | 232 | if self.renderTransparentBackground: 233 | # Another possible drawing solution 234 | image = QImage(self._page.viewportSize(), QImage.Format_ARGB32) 235 | image.fill(QColor(255,0,0,0).rgba()) 236 | 237 | # http://ariya.blogspot.com/2009/04/transparent-qwebview-and-qwebpage.html 238 | palette = self._view.palette() 239 | palette.setBrush(QPalette.Base, Qt.transparent) 240 | self._page.setPalette(palette) 241 | self._view.setAttribute(Qt.WA_OpaquePaintEvent, False) 242 | 243 | painter = QPainter(image) 244 | painter.setBackgroundMode(Qt.TransparentMode) 245 | self._page.mainFrame().render(painter) 246 | painter.end() 247 | else: 248 | if self.grabWholeWindow: 249 | # Note that this does not fully ensure that the 250 | # window still has the focus when the screen is 251 | # grabbed. This might result in a race condition. 
252 |                 self._view.activateWindow()
253 |                 image = QPixmap.grabWindow(self._window.winId())
254 |             else:
255 |                 image = QPixmap.grabWidget(self._window)
256 | 
257 |         return self._post_process_image(image)
258 | 
259 |     def _load_page(self, url, width, height, timeout):
260 |         """
261 |         This method implements the logic for retrieving and displaying
262 |         the requested page.
263 |         """
264 | 
265 |         # This is an event-based application. So we have to wait until
266 |         # "loadFinished(bool)" is raised.
267 |         cancelAt = time.time() + timeout
268 |         self.__loading = True
269 |         self.__loading_result = False # Default, updated by _on_load_finished
270 |         if self.encodedUrl:
271 |             self._page.mainFrame().load(QUrl.fromEncoded(url))
272 |         else:
273 |             self._page.mainFrame().load(QUrl(url))
274 |         while self.__loading:
275 |             if timeout > 0 and time.time() >= cancelAt:
276 |                 raise RuntimeError("Request timed out on %s" % url)
277 |             while QApplication.hasPendingEvents() and self.__loading:
278 |                 QCoreApplication.processEvents()
279 | 
280 |         if self.logger: self.logger.debug("Processing result")
281 | 
282 |         if not self.__loading_result:
283 |             if self.logger: self.logger.warning("Failed to load %s" % url)
284 | 
285 |         # Set initial viewport (the size of the "window")
286 |         size = self._page.mainFrame().contentsSize()
287 |         if self.logger: self.logger.debug("contentsSize: %s", size)
288 |         if width > 0:
289 |             size.setWidth(width)
290 |         if height > 0:
291 |             size.setHeight(height)
292 | 
293 |         self._window.resize(size)
294 | 
295 |     def _post_process_image(self, qImage):
296 |         """If 'scaleToWidth' or 'scaleToHeight' are set to a value
297 |         greater than zero this method will scale the image
298 |         using the method defined in 'scaleRatio'.
299 |         """
300 |         if self.scaleToWidth > 0 or self.scaleToHeight > 0:
301 |             # Scale this image
302 |             if self.scaleRatio == 'keep':
303 |                 ratio = Qt.KeepAspectRatio
304 |             elif self.scaleRatio in ['expand', 'crop']:
305 |                 ratio = Qt.KeepAspectRatioByExpanding
306 |             else: # 'ignore'
307 |                 ratio = Qt.IgnoreAspectRatio
308 |             qImage = qImage.scaled(self.scaleToWidth, self.scaleToHeight, ratio)
309 |             if self.scaleRatio == 'crop':
310 |                 qImage = qImage.copy(0, 0, self.scaleToWidth, self.scaleToHeight)
311 |         return qImage
312 | 
313 |     def _on_each_reply(self, reply):
314 |         """Logs each requested URI"""
315 |         if self.logger: self.logger.debug("Received %s" % reply.url().toString())
316 | 
317 |     # Eventhandler for "loadStarted()" signal
318 |     def _on_load_started(self):
319 |         """Slot that sets the '__loading' property to true."""
320 |         if self.logger: self.logger.debug("loading started")
321 |         self.__loading = True
322 | 
323 |     # Eventhandler for "loadFinished(bool)" signal
324 |     def _on_load_finished(self, result):
325 |         """Slot that sets the '__loading' property to false and stores
326 |         the result code in '__loading_result'.
327 |         """
328 |         if self.logger: self.logger.debug("loading finished with result %s", result)
329 |         self.__loading = False
330 |         self.__loading_result = result
331 | 
332 |     # Eventhandler for "sslErrors(QNetworkReply *,const QList&)" signal
333 |     def _on_ssl_errors(self, reply, errors):
334 |         """Slot that writes SSL warnings into the log but ignores them."""
335 |         for e in errors:
336 |             if self.logger: self.logger.warn("SSL: " + e.errorString())
337 |         reply.ignoreSslErrors()
338 | 
339 | 
340 | class CustomWebPage(QWebPage):
341 |     def __init__(self, **kwargs):
342 |         super(CustomWebPage, self).__init__()
343 |         self.logger = kwargs.get('logger', None)
344 |         self.ignore_alert = kwargs.get('ignore_alert', True)
345 |         self.ignore_confirm = kwargs.get('ignore_confirm', True)
346 |         self.ignore_prompt = kwargs.get('ignore_prompt', True)
347 |         self.interrupt_js = kwargs.get('interrupt_js', True)
348 | 
349 |     def javaScriptAlert(self, frame, message):
350 |         if self.logger: self.logger.debug('Alert: %s', message)
351 |         if not self.ignore_alert:
352 |             return super(CustomWebPage, self).javaScriptAlert(frame, message)
353 | 
354 |     def javaScriptConfirm(self, frame, message):
355 |         if self.logger: self.logger.debug('Confirm: %s', message)
356 |         if not self.ignore_confirm:
357 |             return super(CustomWebPage, self).javaScriptConfirm(frame, message)
358 |         else:
359 |             return False
360 | 
361 |     def javaScriptPrompt(self, frame, message, default, result):
362 |         """This function is called whenever a JavaScript program running inside
363 |         'frame' tries to prompt the user for input. The program may provide an
364 |         optional message as well as a default value for the input.
365 | 
366 |         If the prompt was cancelled by the user, the implementation should return
367 |         False; otherwise the result should be written to 'result', True should be
368 |         returned, and the result string must not be null.
369 |         """
370 | 
371 |         if self.logger: self.logger.debug('Prompt: %s (%s)' % (message, default))
372 |         if not self.ignore_prompt:
373 |             return super(CustomWebPage, self).javaScriptPrompt(frame, message, default, result)
374 |         else:
375 |             return False
376 | 
377 |     def shouldInterruptJavaScript(self):
378 |         """This function is called when a JavaScript program has been running for a
379 |         long period of time. Returning True tells WebKit to stop it; False lets it run.
380 | """ 381 | if self.logger: self.logger.debug("WebKit ask to interrupt JavaScript") 382 | return self.interrupt_js 383 | -------------------------------------------------------------------------------- /tutorial/README: -------------------------------------------------------------------------------- 1 | 2 | A simple tutorial from http://doc.scrapy.org/en/0.16/intro/tutorial.html -------------------------------------------------------------------------------- /tutorial/items.json: -------------------------------------------------------------------------------- 1 | [{"desc": ["\n "], "link": ["/"], "title": ["Top"]}, 2 | {"desc": [], "link": ["/Computers/"], "title": ["Computers"]}, 3 | {"desc": [], "link": ["/Computers/Programming/"], "title": ["Programming"]}, 4 | {"desc": [], "link": ["/Computers/Programming/Languages/"], "title": ["Languages"]}, 5 | {"desc": [], "link": ["/Computers/Programming/Languages/Python/"], "title": ["Python"]}, 6 | {"desc": ["\n \t", "\u00a0", "\n "], "link": [], "title": []}, 7 | {"desc": ["\n ", " \n ", "\n "], "link": ["/Computers/Programming/Languages/Python/Resources/"], "title": ["Computers: Programming: Languages: Python: Resources"]}, 8 | {"desc": ["\n ", " \n ", "\n "], "link": ["/Computers/Programming/Languages/Ruby/Books/"], "title": ["Computers: Programming: Languages: Ruby: Books"]}, 9 | {"desc": ["\n \t", "\n ", "\n\t\t\t\t\t"], "link": ["/World/Deutsch/Computer/Programmieren/Sprachen/Python/B%C3%BCcher/"], "title": ["German"]}, 10 | {"desc": ["\n\t\t\t\n \t", " \n\t\t\t\n\t\t\t\t\t\n - By Guido van Rossum, Fred L. Drake, Jr.; Network Theory Ltd., 2003, ISBN 0954161769. Printed edition of official tutorial, for v2.x, from Python.org. [Network Theory, online]\n \n "], "link": ["http://www.network-theory.co.uk/python/intro/"], "title": ["An Introduction to Python"]}, 11 | {"desc": ["\n\t\t\t\n \t", " \n\t\t\t\n\t\t\t\t\t\n - By Wesley J. Chun; Prentice Hall PTR, 2001, ISBN 0130260363. For experienced developers to improve extant skills; professional level examples. Starts by introducing syntax, objects, error handling, functions, classes, built-ins. [Prentice Hall]\n \n "], "link": ["http://www.pearsonhighered.com/educator/academic/product/0,,0130260363,00%2Ben-USS_01DBC.html"], "title": ["Core Python Programming"]}, 12 | {"desc": ["\n\t\t\t\n \t", " \n\t\t\t\n\t\t\t\t\t\n - The primary goal of this book is to promote object-oriented design using Python and to illustrate the use of the emerging object-oriented design patterns.\r\nA secondary goal of the book is to present mathematical tools just in time. Analysis techniques and proofs are presented as needed and in the proper context.\n \n "], "link": ["http://www.brpreiss.com/books/opus7/html/book.html"], "title": ["Data Structures and Algorithms with Object-Oriented Design Patterns in Python"]}, 13 | {"desc": ["\n\t\t\t\n \t", " \n\t\t\t\n\t\t\t\t\t\n - By Mark Pilgrim, Guide to Python 3 and its differences from Python 2. Each chapter starts with a real code sample and explains it fully. Has a comprehensive appendix of all the syntactic and semantic changes in Python 3\r\n\r\n\n \n "], "link": ["http://www.diveintopython.net/"], "title": ["Dive Into Python 3"]}, 14 | {"desc": ["\n\t\t\t\n \t", " \n\t\t\t\n\t\t\t\t\t\n - This book covers a wide range of topics. From raw TCP and UDP to encryption with TSL, and then to HTTP, SMTP, POP, IMAP, and ssh. 
It gives you a good understanding of each field and how to do everything on the network with Python.\n \n "], "link": ["http://rhodesmill.org/brandon/2011/foundations-of-python-network-programming/"], "title": ["Foundations of Python Network Programming"]}, 15 | {"desc": ["\n\t\t\t\n \t", " \n\t\t\t\n\t\t\t\t\t\n - Free Python books and tutorials.\n \n "], "link": ["http://www.techbooksforfree.com/perlpython.shtml"], "title": ["Free Python books"]}, 16 | {"desc": ["\n\t\t\t\n \t", " \n\t\t\t\n\t\t\t\t\t\n - Annotated list of free online books on Python scripting language. Topics range from beginner to advanced.\n \n "], "link": ["http://www.freetechbooks.com/python-f6.html"], "title": ["FreeTechBooks: Python Scripting Language"]}, 17 | {"desc": ["\n\t\t\t\n \t", " \n\t\t\t\n\t\t\t\t\t\n - By Allen B. Downey, Jeffrey Elkner, Chris Meyers; Green Tea Press, 2002, ISBN 0971677506. Teaches general principles of programming, via Python as subject language. Thorough, in-depth approach to many basic and intermediate programming topics. Full text online and downloads: HTML, PDF, PS, LaTeX. [Free, Green Tea Press]\n \n "], "link": ["http://greenteapress.com/thinkpython/"], "title": ["How to Think Like a Computer Scientist: Learning with Python"]}, 18 | {"desc": ["\n\t\t\t\n \t", " \n\t\t\t\n\t\t\t\t\t\n - Book by Alan Gauld with full text online. Introduction for those learning programming basics: terminology, concepts, methods to write code. Assumes no prior knowledge but basic computer skills.\n \n "], "link": ["http://www.freenetpages.co.uk/hp/alan.gauld/"], "title": ["Learn to Program Using Python"]}, 19 | {"desc": ["\n\t\t\t\n \t", " \n\t\t\t\n\t\t\t\t\t\n - By Rashi Gupta; John Wiley and Sons, 2002, ISBN 0471219754. Covers language basics, use for CGI scripting, GUI development, network programming; shows why it is one of more sophisticated of popular scripting languages. [Wiley]\n \n "], "link": ["http://www.wiley.com/WileyCDA/WileyTitle/productCd-0471219754.html"], "title": ["Making Use of Python"]}, 20 | {"desc": ["\n\t\t\t\n \t", " \n\t\t\t\n\t\t\t\t\t\n - By Magnus Lie Hetland; Apress LP, 2002, ISBN 1590590066. Readable guide to ideas most vital to new users, from basics common to high level languages, to more specific aspects, to a series of 10 ever more complex programs. [Apress]\n \n "], "link": ["http://hetland.org/writing/practical-python/"], "title": ["Practical Python"]}, 21 | {"desc": ["\n\t\t\t\n \t", " \n\t\t\t\n\t\t\t\t\t\n - By Rytis Sileika, ISBN13: 978-1-4302-2605-5, Uses real-world system administration examples like manage devices with SNMP and SOAP, build a distributed monitoring system, manage web applications and parse complex log files, monitor and manage MySQL databases.\r\n\n \n "], "link": ["http://www.sysadminpy.com/"], "title": ["Pro Python System Administration"]}, 22 | {"desc": ["\n\t\t\t\n \t", " \n\t\t\t\n\t\t\t\t\t\n - A Complete Introduction to the Python 3.\n \n "], "link": ["http://www.qtrac.eu/py3book.html"], "title": ["Programming in Python 3 (Second Edition)"]}, 23 | {"desc": ["\n\t\t\t\n \t", " \n\t\t\t\n\t\t\t\t\t\n - By Dave Brueck, Stephen Tanner; John Wiley and Sons, 2001, ISBN 0764548077. Full coverage, clear explanations, hands-on examples, full language reference; shows step by step how to use components, assemble them, form full-featured programs. 
[John Wiley and Sons]\n \n "], "link": ["http://www.wiley.com/WileyCDA/WileyTitle/productCd-0764548077.html"], "title": ["Python 2.1 Bible"]}, 24 | {"desc": ["\n\t\t\t\n \t", " \n\t\t\t\n\t\t\t\t\t\n - A step-by-step tutorial for OOP in Python 3, including discussion and examples of abstraction, encapsulation, information hiding, and raise, handle, define, and manipulate exceptions.\n \n "], "link": ["https://www.packtpub.com/python-3-object-oriented-programming/book"], "title": ["Python 3 Object Oriented Programming"]}, 25 | {"desc": ["\n\t\t\t\n \t", " \n\t\t\t\n\t\t\t\t\t\n - By Guido van Rossum, Fred L. Drake, Jr.; Network Theory Ltd., 2003, ISBN 0954161785. Printed edition of official language reference, for v2.x, from Python.org, describes syntax, built-in datatypes. [Network Theory, online]\n \n "], "link": ["http://www.network-theory.co.uk/python/language/"], "title": ["Python Language Reference Manual"]}, 26 | {"desc": ["\n\t\t\t\n \t", " \n\t\t\t\n\t\t\t\t\t\n - By Thomas W. Christopher; Prentice Hall PTR, 2002, ISBN 0130409561. Shows how to write large programs, introduces powerful design patterns that deliver high levels of robustness, scalability, reuse.\n \n "], "link": ["http://www.pearsonhighered.com/educator/academic/product/0,,0130409561,00%2Ben-USS_01DBC.html"], "title": ["Python Programming Patterns"]}, 27 | {"desc": ["\n\t\t\t\n \t", " \n\t\t\t\n\t\t\t\t\t\n - By Richard Hightower; Addison-Wesley, 2002, 0201616165. Begins with Python basics, many exercises, interactive sessions. Shows programming novices concepts and practical methods. Shows programming experts Python's abilities and ways to interface with Java APIs. [publisher website]\n \n "], "link": ["http://www.informit.com/store/product.aspx?isbn=0201616165&redir=1"], "title": ["Python Programming with the Java Class Libraries: A Tutorial for Building Web and Enterprise Applications with Jython"]}, 28 | {"desc": ["\n\t\t\t\n \t", " \n\t\t\t\n\t\t\t\t\t\n - By Chris Fehily; Peachpit Press, 2002, ISBN 0201748843. Task-based, step-by-step visual reference guide, many screen shots, for courses in digital graphics; Web design, scripting, development; multimedia, page layout, office tools, operating systems. [Prentice Hall]\n \n "], "link": ["http://www.pearsonhighered.com/educator/academic/product/0,,0201748843,00%2Ben-USS_01DBC.html"], "title": ["Python: Visual QuickStart Guide"]}, 29 | {"desc": ["\n\t\t\t\n \t", " \n\t\t\t\n\t\t\t\t\t\n - By Ivan Van Laningham; Sams Publishing, 2000, ISBN 0672317354. Split into 24 hands-on, 1 hour lessons; steps needed to learn topic: syntax, language features, OO design and programming, GUIs (Tkinter), system administration, CGI. [Sams Publishing]\n \n "], "link": ["http://www.informit.com/store/product.aspx?isbn=0672317354"], "title": ["Sams Teach Yourself Python in 24 Hours"]}, 30 | {"desc": ["\n\t\t\t\n \t", " \n\t\t\t\n\t\t\t\t\t\n - By David Mertz; Addison Wesley. Book in progress, full text, ASCII format. Asks for feedback. [author website, Gnosis Software, Inc.]\n \n "], "link": ["http://gnosis.cx/TPiP/"], "title": ["Text Processing in Python"]}, 31 | {"desc": ["\n\t\t\t\n \t", " \n\t\t\t\n\t\t\t\t\t\n - By Sean McGrath; Prentice Hall PTR, 2000, ISBN 0130211192, has CD-ROM. Methods to build XML applications fast, Python tutorial, DOM and SAX, new Pyxie open source XML processing library. 
[Prentice Hall PTR]\n \n "], "link": ["http://www.informit.com/store/product.aspx?isbn=0130211192"], "title": ["XML Processing with Python"]}, 32 | {"desc": ["\n "], "link": ["/"], "title": ["Top"]}, 33 | {"desc": [], "link": ["/Computers/"], "title": ["Computers"]}, 34 | {"desc": [], "link": ["/Computers/Programming/"], "title": ["Programming"]}, 35 | {"desc": [], "link": ["/Computers/Programming/Languages/"], "title": ["Languages"]}, 36 | {"desc": [], "link": ["/Computers/Programming/Languages/Python/"], "title": ["Python"]}, 37 | {"desc": ["\n \t", "\u00a0", "\n "], "link": [], "title": []}, 38 | {"desc": ["\n ", " \n ", "\n "], "link": ["/Computers/Programming/Resources/"], "title": ["Computers: Programming: Resources"]}, 39 | {"desc": ["\n\t\t\t\n \t", " \n\t\t\t\n\t\t\t\t\t\n - A directory of free Python and Zope hosting providers, with reviews and ratings.\n \n "], "link": ["http://www.oinko.net/freepython/"], "title": ["Free Python and Zope Hosting Directory"]}, 40 | {"desc": ["\n\t\t\t\n \t", " \n\t\t\t\n\t\t\t\t\t\n - Features Python books, resources, news and articles.\n \n "], "link": ["http://oreilly.com/python/"], "title": ["O'Reilly Python Center"]}, 41 | {"desc": ["\n\t\t\t\n \t", " \n\t\t\t\n\t\t\t\t\t\n - Resources for reporting bugs, accessing the Python source tree with CVS and taking part in the development of Python.\n \n "], "link": ["http://www.python.org/dev/"], "title": ["Python Developer's Guide"]}, 42 | {"desc": ["\n\t\t\t\n \t", " \n\t\t\t\n\t\t\t\t\t\n - Scripts, examples and news about Python programming for the Windows platform.\n \n "], "link": ["http://win32com.goermezer.de/"], "title": ["Social Bug"]}, 43 | {"desc": ["\n\t\t\t\n \t", " \n\t\t\t\n\t\t\t\t\t\n - Contains links to assorted resources from the Python universe, compiled by PythonWare.\n \n "], "link": ["http://www.pythonware.com/daily/"], "title": ["eff-bot's Daily Python URL"]}] -------------------------------------------------------------------------------- /tutorial/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # http://doc.scrapy.org/topics/scrapyd.html 5 | 6 | [settings] 7 | default = tutorial.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = tutorial 12 | -------------------------------------------------------------------------------- /tutorial/tutorial/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/feiskyer/scrapy-examples/3f72fe87d9272859f7bce31d0394ca078a6ab4a3/tutorial/tutorial/__init__.py -------------------------------------------------------------------------------- /tutorial/tutorial/items.py: -------------------------------------------------------------------------------- 1 | # Define here the models for your scraped items 2 | # 3 | # See documentation in: 4 | # http://doc.scrapy.org/topics/items.html 5 | 6 | from scrapy.item import Item, Field 7 | 8 | class TutorialItem(Item): 9 | # define the fields for your item here like: 10 | title=Field() 11 | link=Field() 12 | desc=Field() 13 | 14 | class DmozItem(Item): 15 | # define the fields for your item here like: 16 | title=Field() 17 | link=Field() 18 | desc=Field() 19 | 20 | -------------------------------------------------------------------------------- /tutorial/tutorial/pipelines.py: 
-------------------------------------------------------------------------------- 1 | # Define your item pipelines here 2 | # 3 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 4 | # See: http://doc.scrapy.org/topics/item-pipeline.html 5 | 6 | class TutorialPipeline(object): 7 | def process_item(self, item, spider): 8 | return item 9 | -------------------------------------------------------------------------------- /tutorial/tutorial/settings.py: -------------------------------------------------------------------------------- 1 | # Scrapy settings for tutorial project 2 | # 3 | # For simplicity, this file contains only the most important settings by 4 | # default. All the other settings are documented here: 5 | # 6 | # http://doc.scrapy.org/topics/settings.html 7 | # 8 | 9 | BOT_NAME = 'tutorial' 10 | 11 | SPIDER_MODULES = ['tutorial.spiders'] 12 | NEWSPIDER_MODULE = 'tutorial.spiders' 13 | 14 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 15 | #USER_AGENT = 'tutorial (+http://www.yourdomain.com)' 16 | -------------------------------------------------------------------------------- /tutorial/tutorial/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /tutorial/tutorial/spiders/dmoz_spider.py: -------------------------------------------------------------------------------- 1 | from scrapy.spider import BaseSpider 2 | from scrapy.selector import HtmlXPathSelector 3 | from tutorial.items import DmozItem 4 | 5 | class DmozSpider(BaseSpider): 6 | name = "dmoz" 7 | allowed_domains = ["dmoz.org"] 8 | start_urls = [ 9 | "http://www.dmoz.org/Computers/Programming/Languages/Python/Books/", 10 | "http://www.dmoz.org/Computers/Programming/Languages/Python/Resources/" 11 | ] 12 | 13 | def parse(self, response): 14 | hxs = HtmlXPathSelector(response) 15 | sites = hxs.select('//ul/li') 16 | items = [] 17 | for site in sites: 18 | item = DmozItem() 19 | item['title'] = site.select('a/text()').extract() 20 | item['link'] = site.select('a/@href').extract() 21 | item['desc'] = site.select('text()').extract() 22 | items.append(item) 23 | return items 24 | -------------------------------------------------------------------------------- /web.py/README: -------------------------------------------------------------------------------- 1 | web.py simple examples. 
2 | 
3 | Web site: `http://webpy.org`
4 | 
5 | Install guide:
6 | 
7 |     pip install web.py
8 |     pip install markdown
9 | 
10 | The web.py code examples are taken from `http://webpy.org/src/`
11 | 
-------------------------------------------------------------------------------- /web.py/simple-blog/blog.py: --------------------------------------------------------------------------------
1 | """ Basic blog using webpy 0.3 """
2 | import web
3 | import model
4 | 
5 | ### Url mappings
6 | 
7 | urls = (
8 |     '/', 'Index',
9 |     '/view/(\d+)', 'View',
10 |     '/new', 'New',
11 |     '/delete/(\d+)', 'Delete',
12 |     '/edit/(\d+)', 'Edit',
13 | )
14 | 
15 | 
16 | ### Templates
17 | t_globals = {
18 |     'datestr': web.datestr
19 | }
20 | render = web.template.render('templates', base='base', globals=t_globals)
21 | 
22 | class Index:
23 | 
24 |     def GET(self):
25 |         """ Show page """
26 |         posts = model.get_posts()
27 |         return render.index(posts)
28 | 
29 | 
30 | class View:
31 | 
32 |     def GET(self, id):
33 |         """ View single post """
34 |         post = model.get_post(int(id))
35 |         return render.view(post)
36 | 
37 | 
38 | class New:
39 | 
40 |     form = web.form.Form(
41 |         web.form.Textbox('title', web.form.notnull,
42 |             size=30,
43 |             description="Post title:"),
44 |         web.form.Textarea('content', web.form.notnull,
45 |             rows=30, cols=80,
46 |             description="Post content:"),
47 |         web.form.Button('Post entry'),
48 |     )
49 | 
50 |     def GET(self):
51 |         form = self.form()
52 |         return render.new(form)
53 | 
54 |     def POST(self):
55 |         form = self.form()
56 |         if not form.validates():
57 |             return render.new(form)
58 |         model.new_post(form.d.title, form.d.content)
59 |         raise web.seeother('/')
60 | 
61 | 
62 | class Delete:
63 | 
64 |     def POST(self, id):
65 |         model.del_post(int(id))
66 |         raise web.seeother('/')
67 | 
68 | 
69 | class Edit:
70 | 
71 |     def GET(self, id):
72 |         post = model.get_post(int(id))
73 |         form = New.form()
74 |         form.fill(post)
75 |         return render.edit(post, form)
76 | 
77 | 
78 |     def POST(self, id):
79 |         form = New.form()
80 |         post = model.get_post(int(id))
81 |         if not form.validates():
82 |             return render.edit(post, form)
83 |         model.update_post(int(id), form.d.title, form.d.content)
84 |         raise web.seeother('/')
85 | 
86 | 
87 | app = web.application(urls, globals())
88 | 
89 | if __name__ == '__main__':
90 |     app.run()
91 | 
-------------------------------------------------------------------------------- /web.py/simple-blog/model.py: --------------------------------------------------------------------------------
1 | import web, datetime
2 | 
3 | db = web.database(dbn='mysql', db='todo', user='root', passwd='feisky')
4 | 
5 | def get_posts():
6 |     return db.select('entries', order='id DESC')
7 | 
8 | def get_post(id):
9 |     try:
10 |         return db.select('entries', where='id=$id', vars=locals())[0]
11 |     except IndexError:
12 |         return None
13 | 
14 | def new_post(title, text):
15 |     db.insert('entries', title=title, content=text, posted_on=datetime.datetime.utcnow())
16 | 
17 | def del_post(id):
18 |     db.delete('entries', where="id=$id", vars=locals())
19 | 
20 | def update_post(id, title, text):
21 |     db.update('entries', where="id=$id", vars=locals(),
22 |         title=title, content=text)
23 | 
-------------------------------------------------------------------------------- /web.py/simple-blog/schema.sql: --------------------------------------------------------------------------------
1 | CREATE TABLE entries (
2 |     id INT AUTO_INCREMENT,
3 |     title TEXT,
4 |     content TEXT,
5 |     posted_on DATETIME,
6 |     primary key (id)
7 | );
8 | 
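
Note: all three web.py examples expect a local MySQL database named `todo` with the hard-coded credentials from `model.py`. For a quick run without MySQL, `web.database` can point at SQLite instead; a minimal sketch (the `blog.db` filename and the translated schema are illustrative, not part of this repo):

    import web
    # Same web.database interface that model.py uses; only the connection args change.
    db = web.database(dbn='sqlite', db='blog.db')
    db.query("CREATE TABLE IF NOT EXISTS entries ("
             "id INTEGER PRIMARY KEY AUTOINCREMENT, "
             "title TEXT, content TEXT, posted_on TIMESTAMP)")
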
-------------------------------------------------------------------------------- /web.py/simple-blog/templates/base.html: --------------------------------------------------------------------------------
1 | $def with (page)
2 | 
3 | <html>
4 | <head>
5 |     <title>My Blog</title>
6 |     <style>
7 |         #menu {
8 |             width: 200px;
9 |             float: right;
10 |         }
11 |     </style>
12 | </head>
13 | <body>
14 | 
15 | <ul id="menu">
16 |     <li><a href="/">Home</a></li>
17 |     <li><a href="/new">New Post</a></li>
18 | </ul>
19 | 
20 | $:page
21 | 
22 | </body>
23 | </html>
24 | 
-------------------------------------------------------------------------------- /web.py/simple-blog/templates/edit.html: --------------------------------------------------------------------------------
1 | $def with (post, form)
2 | 
3 | <h1>Edit $form.d.title</h1>
4 | 
5 | <form action="" method="post">
6 | $:form.render()
7 | </form>
8 | 
9 | 
10 | <h2>Delete post</h2>
11 | <form action="/delete/$post.id" method="post">
12 |     <input type="submit" value="Delete post"/>
13 | </form>
14 | 
-------------------------------------------------------------------------------- /web.py/simple-blog/templates/index.html: --------------------------------------------------------------------------------
1 | $def with (posts)
2 | 
3 | <h1>Blog posts</h1>
4 | 
5 | <ul>
6 | $for post in posts:
7 |     <li>
8 |         <a href="/view/$post.id">$post.title</a>
9 |         from $datestr(post.posted_on)
10 |         <a href="/edit/$post.id">Edit</a>
11 |     </li>
12 | </ul>
13 | 
-------------------------------------------------------------------------------- /web.py/simple-blog/templates/new.html: --------------------------------------------------------------------------------
1 | $def with (form)
2 | 
3 | 
4 | <h1>New Blog Post</h1>
5 | <form action="" method="post">
6 | $:form.render()
7 | </form>
8 | 
9 | 
-------------------------------------------------------------------------------- /web.py/simple-blog/templates/view.html: --------------------------------------------------------------------------------
1 | $def with (post)
2 | 
3 | <h1>$post.title</h1>
4 | $datestr(post.posted_on)
5 | 
6 | $post.content
7 | 
8 | 
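
A note on the Templetor syntax used by all templates in this repo: `$def with (...)` declares the template arguments, `$expr` interpolates with HTML escaping, and `$:expr` interpolates raw, which is why `base.html` embeds the rendered sub-page as `$:page` and the forms are emitted with `$:form.render()`. A small standalone sketch of the escaping difference (not part of the repo; output shown as expected under web.py's default websafe filter):

    import web
    # $name escapes its value; $:name would emit the markup verbatim.
    tmpl = web.template.Template("$def with (name)\nHello $name!")
    print tmpl('<b>world</b>')   # -> Hello &lt;b&gt;world&lt;/b&gt;!
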
-------------------------------------------------------------------------------- /web.py/simple-todo/model.py: --------------------------------------------------------------------------------
1 | import web
2 | 
3 | db=web.database(dbn="mysql", db="todo", user="root", passwd="feisky")
4 | 
5 | def get_todos():
6 |     return db.select('todo', order='id')
7 | 
8 | def new_todo(text):
9 |     db.insert('todo', title=text)
10 | 
11 | def del_todo(id):
12 |     db.delete('todo', where="id=$id", vars=locals())
13 | 
14 | 
-------------------------------------------------------------------------------- /web.py/simple-todo/schema.sql: --------------------------------------------------------------------------------
1 | CREATE TABLE todo (
2 |     id INT AUTO_INCREMENT,
3 |     title TEXT,
4 |     primary key (id)
5 | );
6 | 
-------------------------------------------------------------------------------- /web.py/simple-todo/templates/base.html: --------------------------------------------------------------------------------
1 | $def with (page)
2 | 
3 | <html>
4 | <head>
5 |     <title>Todo list</title>
6 | </head>
7 | <body>
8 | $:page
9 | </body>
10 | </html>
11 | 
12 | 
-------------------------------------------------------------------------------- /web.py/simple-todo/templates/index.html: --------------------------------------------------------------------------------
1 | $def with(todos, form)
2 | 
3 | <table>
4 |     <tr>
5 |         <th>What to do?</th>
6 |     </tr>
7 | 
8 | $for todo in todos:
9 |     <tr>
10 |         <td>$todo.title</td>
11 |         <td>
12 |             <form action="/del/$todo.id" method="post">
13 |                 <input type="submit" value="Delete"/>
14 |             </form>
15 |         </td>
16 |     </tr>
17 | </table>
18 | 
19 | <form action="" method="post">
20 | $:form.render()
21 | </form>
22 | 
23 | 
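
The Delete buttons in the table above post to `/del/$todo.id`; in `todo.py` (next file) the `urls` tuple maps that path to the `Delete` class, and the regex capture group `(\d+)` is passed to `Delete.POST` as a string. A hedged smoke test against a locally running instance (assumes the default dev server on port 8080 and an existing row with id 1):

    import urllib2
    # POST with an empty body; web.py replies with a 303 redirect to '/',
    # which urllib2 follows, so this prints the refreshed todo list.
    print urllib2.urlopen('http://localhost:8080/del/1', data='').read()
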
-------------------------------------------------------------------------------- /web.py/simple-todo/todo.py: --------------------------------------------------------------------------------
1 | import web
2 | import model
3 | 
4 | urls=(
5 |     '/', 'Index',
6 |     '/del/(\d+)', 'Delete'
7 | )
8 | 
9 | render = web.template.render('templates', base='base')
10 | 
11 | class Index:
12 |     form=web.form.Form(
13 |         web.form.Textbox('title', web.form.notnull,
14 |             description='I need to:'),
15 |         web.form.Button('Add todo'),
16 |     )
17 | 
18 |     def GET(self):
19 |         todos=model.get_todos()
20 |         form=self.form()
21 |         return render.index(todos, form)
22 | 
23 |     def POST(self):
24 |         form=self.form()
25 |         if not form.validates():
26 |             todos=model.get_todos()
27 |             return render.index(todos, form)
28 |         model.new_todo(form.d.title)
29 |         raise web.seeother('/')
30 | 
31 | class Delete:
32 |     def POST(self, id):
33 |         id=int(id)
34 |         model.del_todo(id)
35 |         raise web.seeother('/')
36 | 
37 | if __name__=='__main__':
38 |     app=web.application(urls, globals())
39 |     app.run()
40 | 
-------------------------------------------------------------------------------- /web.py/simple-wiki/model.py: --------------------------------------------------------------------------------
1 | import web
2 | 
3 | db = web.database(dbn='mysql', db='todo', user='root', passwd='feisky')
4 | 
5 | def get_pages():
6 |     return db.select('pages', order='id DESC')
7 | 
8 | def get_page_by_url(url):
9 |     try:
10 |         return db.select('pages', where='url=$url', vars=locals())[0]
11 |     except IndexError:
12 |         return None
13 | 
14 | def get_page_by_id(id):
15 |     try:
16 |         return db.select('pages', where='id=$id', vars=locals())[0]
17 |     except IndexError:
18 |         return None
19 | 
20 | def new_page(url, title, text):
21 |     db.insert('pages', url=url, title=title, content=text)
22 | 
23 | def del_page(id):
24 |     db.delete('pages', where="id=$id", vars=locals())
25 | 
26 | def update_page(id, url, title, text):
27 |     db.update('pages', where="id=$id", vars=locals(),
28 |         url=url, title=title, content=text)
29 | 
-------------------------------------------------------------------------------- /web.py/simple-wiki/schema.sql: --------------------------------------------------------------------------------
1 | CREATE TABLE pages (
2 |     id INT AUTO_INCREMENT,
3 |     url TEXT,
4 |     title TEXT,
5 |     content TEXT,
6 |     primary key (id)
7 | );
8 | 
-------------------------------------------------------------------------------- /web.py/simple-wiki/templates/base.html: --------------------------------------------------------------------------------
1 | $def with (page)
2 | 
3 | <html>
4 | <head>
5 |     $if page.has_key('title'):
6 |         <title>$page.title</title>
7 |     $else:
8 |         <title>My Wiki</title>
9 |     <style>
10 |         #menu { width: 200px; float: right; }
11 |     </style>
12 | </head>
13 | <body>
14 | <ul id="menu">
15 |     <li><a href="/">Home</a></li>
16 |     <li><a href="/new">New Page</a></li>
17 | </ul>
18 | 
19 | $:page
20 | 
21 | </body>
22 | </html>
23 | 
-------------------------------------------------------------------------------- /web.py/simple-wiki/templates/edit.html: --------------------------------------------------------------------------------
1 | $def with (page, form)
2 | 
3 | <h1>Edit $form.d.title</h1>
4 | 
5 | <form action="" method="post">
6 | $:form.render()
7 | </form>
8 | 
9 | 
10 | <h2>Delete page?</h2>
11 | <form action="/delete/$page.id" method="post">
12 |     <input type="submit" value="Delete page"/>
13 | </form>
14 | 
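
The handler classes in `todo.py` above (and in `wiki.py` further below) share one form pattern: instantiate the class-level `web.form.Form`, let `form.validates()` pull and check the POSTed input, re-render the page on failure, and read the clean values from `form.d` on success. A condensed sketch of that round-trip (the handler and form names are illustrative):

    import web

    form = web.form.Form(
        web.form.Textbox('title', web.form.notnull),
    )

    class Handler:
        def POST(self):
            f = form()                 # calling a Form returns a fresh copy
            if not f.validates():      # reads web.input(), runs validators
                return f.render()      # redisplay with error messages
            return 'saving: %s' % f.d.title
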
-------------------------------------------------------------------------------- /web.py/simple-wiki/templates/index.html: --------------------------------------------------------------------------------
1 | $def with (pages)
2 | 
3 | <h1>Webpy Wiki</h1>
4 | 
5 | <h2>Pages:</h2>
6 | 
7 | <ul>
8 | $for page in pages:
9 |     <li><a href="/$page.url">$page.url</a></li>
10 | </ul>
11 | 
12 | 
-------------------------------------------------------------------------------- /web.py/simple-wiki/templates/new.html: --------------------------------------------------------------------------------
1 | $def with (form)
2 | 
3 | 
4 | <h1>New Wiki Page</h1>
5 | <form action="" method="post">
6 | $:form.render()
7 | </form>
8 | 
-------------------------------------------------------------------------------- /web.py/simple-wiki/templates/view.html: --------------------------------------------------------------------------------
1 | $def with (page)
2 | 
3 | $var title: $page.title
4 | 
5 | <h1>$page.title</h1>
6 | 
7 | $:markdown(page.content)
8 | 
9 | 
10 | <a href="/edit/$page.id">Edit</a>
11 | 
-------------------------------------------------------------------------------- /web.py/simple-wiki/wiki.py: --------------------------------------------------------------------------------
1 | """ Basic wiki using webpy 0.3 """
2 | import web
3 | import model
4 | import markdown
5 | 
6 | ### Url mappings
7 | 
8 | urls = (
9 |     '/', 'Index',
10 |     '/new', 'New',
11 |     '/edit/(\d+)', 'Edit',
12 |     '/delete/(\d+)', 'Delete',
13 |     '/(.*)', 'Page',
14 | )
15 | 
16 | 
17 | ### Templates
18 | t_globals = {
19 |     'datestr': web.datestr,
20 |     'markdown': markdown.markdown,
21 | }
22 | render = web.template.render('templates', base='base', globals=t_globals)
23 | 
24 | 
25 | class Index:
26 | 
27 |     def GET(self):
28 |         """ Show page """
29 |         pages = model.get_pages()
30 |         return render.index(pages)
31 | 
32 | 
33 | class Page:
34 | 
35 |     def GET(self, url):
36 |         """ View single page """
37 |         page = model.get_page_by_url(url)
38 |         if not page:
39 |             raise web.seeother('/new?url=%s' % web.websafe(url))
40 |         return render.view(page)
41 | 
42 | 
43 | class New:
44 | 
45 |     def not_page_exists(url):
46 |         return not bool(model.get_page_by_url(url))
47 | 
48 |     page_exists_validator = web.form.Validator('Page already exists',
49 |         not_page_exists)
50 | 
51 |     form = web.form.Form(
52 |         web.form.Textbox('url', web.form.notnull, page_exists_validator,
53 |             size=30,
54 |             description="Location:"),
55 |         web.form.Textbox('title', web.form.notnull,
56 |             size=30,
57 |             description="Page title:"),
58 |         web.form.Textarea('content', web.form.notnull,
59 |             rows=30, cols=80,
60 |             description="Page content:", post="Use markdown syntax"),
61 |         web.form.Button('Create page'),
62 |     )
63 | 
64 |     def GET(self):
65 |         url = web.input(url='').url
66 |         form = self.form()
67 |         form.fill({'url':url})
68 |         return render.new(form)
69 | 
70 |     def POST(self):
71 |         form = self.form()
72 |         if not form.validates():
73 |             return render.new(form)
74 |         model.new_page(form.d.url, form.d.title, form.d.content)
75 |         raise web.seeother('/' + form.d.url)
76 | 
77 | 
78 | class Delete:
79 | 
80 |     def POST(self, id):
81 |         model.del_page(int(id))
82 |         raise web.seeother('/')
83 | 
84 | 
85 | class Edit:
86 | 
87 |     form = web.form.Form(
88 |         web.form.Textbox('url', web.form.notnull,
89 |             size=30,
90 |             description="Location:"),
91 |         web.form.Textbox('title', web.form.notnull,
92 |             size=30,
93 |             description="Page title:"),
94 |         web.form.Textarea('content', web.form.notnull,
95 |             rows=30, cols=80,
96 |             description="Page content:", post="Use markdown syntax"),
97 |         web.form.Button('Update page'),
98 |     )
99 | 
100 |     def GET(self, id):
101 |         page = model.get_page_by_id(int(id))
102 |         form = self.form()
103 |         form.fill(page)
104 |         return render.edit(page, form)
105 | 
106 | 
107 |     def POST(self, id):
108 |         form = self.form()
109 |         page = model.get_page_by_id(int(id))
110 |         if not form.validates():
111 |             return render.edit(page, form)
112 |         model.update_page(int(id), form.d.url, form.d.title, form.d.content)
113 |         raise web.seeother('/')
114 | 
115 | 
116 | app = web.application(urls, globals())
117 | 
118 | if __name__ == '__main__':
119 |     app.run()
120 | 
--------------------------------------------------------------------------------
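
Since `wiki.py` builds `app = web.application(urls, globals())` at import time, the app can also be exercised in-process via web.py's testing hook `app.request()`, without starting an HTTP server. A hedged sketch (assumes the MySQL tables from `schema.sql` exist and the credentials in `model.py` are valid):

    import wiki
    # Issue a fake GET against the WSGI app; no server needed.
    resp = wiki.app.request('/')
    print resp.status    # e.g. '200 OK'
    print resp.data      # the rendered index.html
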