├── README.md
├── blog_crawl
│   ├── README
│   ├── blog_crawl
│   │   ├── __init__.py
│   │   ├── items.py
│   │   ├── pipelines.py
│   │   ├── settings.py
│   │   └── spiders
│   │       ├── __init__.py
│   │       └── mindhacks_spider.py
│   └── scrapy.cfg
├── docs
│   ├── README
│   ├── scrapyd
│   └── webkit2png
├── newsspider
│   ├── .scrapy
│   │   └── scrapyd
│   │       ├── dbs
│   │       │   └── default.db
│   │       ├── items
│   │       │   └── default
│   │       │       ├── hackernews
│   │       │       │   ├── 325d2fa68a0e11e2adba7e97b6ad9650.jl
│   │       │       │   └── c58abb2c87d411e2adba7e97b6ad9650.jl
│   │       │       └── mindhacks
│   │       │           └── ff9b53c687d411e2adba7e97b6ad9650.jl
│   │       └── logs
│   │           └── default
│   │               ├── all
│   │               │   └── b29488fe87d411e2adba7e97b6ad9650.log
│   │               ├── hackernews
│   │               │   ├── 325d2fa68a0e11e2adba7e97b6ad9650.log
│   │               │   └── c58abb2c87d411e2adba7e97b6ad9650.log
│   │               ├── mindhacks
│   │               │   └── ff9b53c687d411e2adba7e97b6ad9650.log
│   │               └── somespider
│   │                   └── 28e7b04a8a0e11e2adba7e97b6ad9650.log
│   ├── README.md
│   ├── dbs
│   │   └── default.db
│   ├── newsspider
│   │   ├── __init__.py
│   │   ├── commands
│   │   │   ├── __init__.py
│   │   │   └── allcrawl.py
│   │   ├── items.py
│   │   ├── middlewares.py
│   │   ├── pipelines.py
│   │   ├── settings.py
│   │   └── spiders
│   │       ├── __init__.py
│   │       ├── cnblogs.py
│   │       ├── dbanotes.py
│   │       ├── hackernews.py
│   │       ├── jobbole.py
│   │       ├── mindhacks.py
│   │       └── reddit.py
│   ├── proxies.txt
│   ├── query_db.py
│   ├── scrapy.cfg
│   ├── webkit2png
│   ├── webkit2png.log
│   └── webkit2png.py
├── proxycrawler
│   ├── proxies.txt
│   ├── proxycrawler
│   │   ├── __init__.py
│   │   ├── items.py
│   │   ├── pipelines.py
│   │   ├── settings.py
│   │   └── spiders
│   │       ├── __init__.py
│   │       └── proxy.py
│   └── scrapy.cfg
├── scrapy-ws.py
├── server
│   ├── code.py
│   ├── config.py
│   ├── db.py
│   ├── sql
│   │   └── tables.sql
│   ├── static
│   │   └── favicon.ico
│   ├── templates
│   │   ├── base.html
│   │   ├── item.html
│   │   └── listing.html
│   ├── view.py
│   ├── webkit2png
│   └── webkit2png.py
├── tutorial
│   ├── README
│   ├── items.json
│   ├── scrapy.cfg
│   └── tutorial
│       ├── __init__.py
│       ├── items.py
│       ├── pipelines.py
│       ├── settings.py
│       └── spiders
│           ├── __init__.py
│           └── dmoz_spider.py
└── web.py
    ├── README
    ├── simple-blog
    │   ├── blog.py
    │   ├── model.py
    │   ├── schema.sql
    │   └── templates
    │       ├── base.html
    │       ├── edit.html
    │       ├── index.html
    │       ├── new.html
    │       └── view.html
    ├── simple-todo
    │   ├── model.py
    │   ├── schema.sql
    │   ├── templates
    │   │   ├── base.html
    │   │   └── index.html
    │   └── todo.py
    └── simple-wiki
        ├── model.py
        ├── schema.sql
        ├── templates
        │   ├── base.html
        │   ├── edit.html
        │   ├── index.html
        │   ├── new.html
        │   └── view.html
        └── wiki.py

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
Scrapy-Examples
===============
Some examples for Scrapy.

Install:

    pip install twisted scrapy
    pip install BeautifulSoup

    cp newsspider/webkit2png* /usr/bin

--------------------------------------------------------------------------------
/blog_crawl/README:
--------------------------------------------------------------------------------

Crawl posts from mindhacks.cn

This example demonstrates:

    Crawling post URLs from mindhacks.cn
    Saving post contents to sqlite3
    Following the next-page link

Reference:
http://blog.pluskid.org/?p=366

--------------------------------------------------------------------------------
/blog_crawl/blog_crawl/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/feiskyer/scrapy-examples/3f72fe87d9272859f7bce31d0394ca078a6ab4a3/blog_crawl/blog_crawl/__init__.py

--------------------------------------------------------------------------------
/blog_crawl/blog_crawl/items.py:
--------------------------------------------------------------------------------
# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/topics/items.html

from scrapy.item import Item, Field

class BlogCrawlItem(Item):
    # define the fields for your item here
    url = Field()
    raw = Field()

    def __str__(self):
        return self['url']

--------------------------------------------------------------------------------
/blog_crawl/blog_crawl/pipelines.py:
--------------------------------------------------------------------------------
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/topics/item-pipeline.html

import sqlite3
from os import path

from scrapy import signals
from scrapy.xlib.pydispatch import dispatcher

class BlogCrawlPipeline(object):
    def process_item(self, item, spider):
        return item


class SQLiteStorePipeline(object):
    filename = 'data.sqlite'

    def __init__(self):
        self.conn = None
        # open the database when the engine starts, close it when it stops
        dispatcher.connect(self.initialize, signals.engine_started)
        dispatcher.connect(self.finalize, signals.engine_stopped)

    def process_item(self, item, domain):
        try:
            self.conn.execute('insert into blog values(?,?,?)',
                              (item['url'], item['raw'], unicode(domain)))
        except sqlite3.Error:
            print 'Failed to insert item: ' + item['url']
        return item

    def initialize(self):
        if path.exists(self.filename):
            self.conn = sqlite3.connect(self.filename)
        else:
            self.conn = self.create_table(self.filename)

    def finalize(self):
        if self.conn is not None:
            self.conn.commit()
            self.conn.close()
            self.conn = None

    def create_table(self, filename):
        conn = sqlite3.connect(filename)
        conn.execute("""create table blog
                     (url text primary key, raw text, domain text)""")
        conn.commit()
        return conn
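Once a crawl has run, the stored rows can be inspected straight from the
data.sqlite file the pipeline creates. A minimal sketch (this helper script is
not part of the project):

    import sqlite3

    # The blog table created above has the schema (url, raw, domain).
    conn = sqlite3.connect('data.sqlite')
    for url, domain in conn.execute('select url, domain from blog'):
        print url, domain
    print conn.execute('select count(*) from blog').fetchone()[0], 'posts stored'
    conn.close()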
--------------------------------------------------------------------------------
/blog_crawl/blog_crawl/settings.py:
--------------------------------------------------------------------------------
# Scrapy settings for blog_crawl project
#
# For simplicity, this file contains only the most important settings by
# default. All the other settings are documented here:
#
#     http://doc.scrapy.org/topics/settings.html
#

BOT_NAME = 'blog_crawl'

SPIDER_MODULES = ['blog_crawl.spiders']
NEWSPIDER_MODULE = 'blog_crawl.spiders'

ITEM_PIPELINES = ['blog_crawl.pipelines.SQLiteStorePipeline']

# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'blog_crawl (+http://www.yourdomain.com)'

--------------------------------------------------------------------------------
/blog_crawl/blog_crawl/spiders/__init__.py:
--------------------------------------------------------------------------------
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.

--------------------------------------------------------------------------------
/blog_crawl/blog_crawl/spiders/mindhacks_spider.py:
--------------------------------------------------------------------------------
from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector
from blog_crawl.items import BlogCrawlItem

class MindhacksSpider(BaseSpider):
    name = "mindhacks.cn"
    allowed_domains = ["mindhacks.cn"]
    start_urls = ["http://mindhacks.cn/"]

    def parse(self, response):
        hxs = HtmlXPathSelector(response)
        sites = hxs.select('//h3/a/@href')
        items = []
        # save all urls
        #for site in sites:
        #    item = BlogCrawlItem()
        #    item['url'] = site.extract()
        #    items.append(item)

        # request each post, to be parsed by parse_post
        items.extend([self.make_requests_from_url(url.extract())
                          .replace(callback=self.parse_post)
                      for url in sites])

        # follow the next-page link (the raquo arrow in wp-pagenavi)
        page_links = hxs.select('//div[@class="wp-pagenavi"]/a[not(@title)]')
        for link in page_links:
            if link.select('text()').extract()[0] == u'\xbb':
                url = link.select('@href').extract()[0]
                items.append(self.make_requests_from_url(url))
        return items

    def parse_post(self, response):
        item = BlogCrawlItem()
        item['url'] = unicode(response.url)
        item['raw'] = response.body_as_unicode()
        return [item]

--------------------------------------------------------------------------------
/blog_crawl/scrapy.cfg:
--------------------------------------------------------------------------------
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# http://doc.scrapy.org/topics/scrapyd.html

[settings]
default = blog_crawl.settings

[deploy]
#url = http://localhost:6800/
project = blog_crawl

--------------------------------------------------------------------------------
/docs/README:
--------------------------------------------------------------------------------
Create a project:

    scrapy startproject tutorial

Generate a spider:

    scrapy genspider mydomain mydomain.com
    scrapy genspider -t crawl|xmlfeed|csvfeed mydomain mydomain.com

Start crawling:

    scrapy crawl dmoz

Scrapy shell:

    scrapy shell http://www.dmoz.org/Computers/Programming/Languages/Python/Books/

Export json:

    scrapy crawl dmoz -o items.json -t json
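For reference, the stock template behind `scrapy genspider mydomain
mydomain.com` produces a skeleton along these lines (a rough sketch of the
Scrapy 0.17-era output, not verbatim):

    from scrapy.spider import BaseSpider

    class MydomainSpider(BaseSpider):
        name = "mydomain"
        allowed_domains = ["mydomain.com"]
        start_urls = ["http://www.mydomain.com/"]

        def parse(self, response):
            # fill in the extraction logic here
            pass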
--------------------------------------------------------------------------------
/docs/scrapyd:
--------------------------------------------------------------------------------
Scrapy Service (scrapyd)

Run as a server:

    # scrapy server
    2013-03-11 13:36:27+0800 [-] Log opened.
    2013-03-11 13:36:27+0800 [-] Scrapyd web console available at http://0.0.0.0:6800/
    2013-03-11 13:36:27+0800 [Launcher] Scrapyd started: max_proc=8, runner='scrapyd.runner'
    2013-03-11 13:36:27+0800 [-] Site starting on 6800
    2013-03-11 13:36:27+0800 [-] Starting factory

Start a spider and query the service:

    # curl http://localhost:6800/listprojects.json
    {"status": "ok", "projects": ["default"]}

    # curl http://localhost:6800/schedule.json -d project=default -d spider=hackernews
    {"status": "ok", "jobid": "325d2fa68a0e11e2adba7e97b6ad9650"}

    # curl http://localhost:6800/cancel.json -d project=default -d job=325d2fa68a0e11e2adba7e97b6ad9650
    {"status": "ok", "prevstate": null}

    # curl http://localhost:6800/listversions.json?project=default
    {"status": "ok", "versions": []}

    # curl http://localhost:6800/listspiders.json?project=default
    {"status": "ok", "spiders": ["mindhacks", "hackernews"]}

    # curl http://localhost:6800/listjobs.json?project=default
    {"status": "ok", "running": [], "finished": [{"start_time": "2013-03-11 13:40:22.537393", "end_time": "2013-03-11 13:40:23.159254", "id": "28e7b04a8a0e11e2adba7e97b6ad9650", "spider": "somespider"}, {"start_time": "2013-03-11 13:40:37.538718", "end_time": "2013-03-11 13:40:58.144857", "id": "325d2fa68a0e11e2adba7e97b6ad9650", "spider": "hackernews"}], "pending": []}

--------------------------------------------------------------------------------
/docs/webkit2png:
--------------------------------------------------------------------------------
# Reference
https://github.com/AdamN/python-webkit2png/

# Install xvfb to provide a virtual X environment
apt-get install xvfb

# Install Chinese fonts
apt-get install xfonts-wqy

# Configure the fonts
fontconfig-voodoo -f -s zh_CN

# Get webkit2png
wget https://raw.github.com/adamn/python-webkit2png/master/scripts/webkit2png
wget https://raw.github.com/adamn/python-webkit2png/master/webkit2png/webkit2png.py
chmod +x webkit2png webkit2png.py

# Usage
./webkit2png -x 1366 768 http://www.sina.com.cn -o test2.png
./webkit2png -x 1366 768 -F javascript http://www.sina.com.cn -o test2.png

--------------------------------------------------------------------------------
/newsspider/.scrapy/scrapyd/dbs/default.db:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/feiskyer/scrapy-examples/3f72fe87d9272859f7bce31d0394ca078a6ab4a3/newsspider/.scrapy/scrapyd/dbs/default.db

--------------------------------------------------------------------------------
/newsspider/.scrapy/scrapyd/items/default/mindhacks/ff9b53c687d411e2adba7e97b6ad9650.jl:
--------------------------------------------------------------------------------
1 | {"url": "http://mindhacks.cn/2011/11/04/how-to-interview-a-person-for-two-years/", "created": "2011-11-04T01:18:00+00:00", "site": "mindhacks", "title": "\u600e\u6837\u82b1\u4e24\u5e74\u65f6\u95f4\u53bb\u9762\u8bd5\u4e00\u4e2a\u4eba"}
2 | {"url": "http://mindhacks.cn/2012/06/04/escape-from-your-shawshank-part5-the-invisible-cage/", "created": "2012-06-04T23:10:49+00:00", "site": "mindhacks", "title": "\u9003\u51fa\u4f60\u7684\u8096\u7533\u514b\uff08\u4e94\uff09\uff1a\u770b\u4e0d\u89c1\u7684\u7262\u7b3c\uff08\u4e0a\uff09"}
3 | {"url": "http://mindhacks.cn/2012/08/27/modern-cpp-practices/", "created": 
"2012-08-27T14:09:47+00:00", "site": "mindhacks", "title": "C++11\uff08\u53ca\u73b0\u4ee3C++\u98ce\u683c\uff09\u548c\u5feb\u901f\u8fed\u4ee3\u5f0f\u5f00\u53d1"} 4 | {"url": "http://mindhacks.cn/2009/07/06/why-you-should-do-it-yourself/", "created": "2009-07-06T21:48:07+00:00", "site": "mindhacks", "title": "[BetterExplained]\u9047\u5230\u95ee\u9898\u4e3a\u4ec0\u4e48\u5e94\u8be5\u81ea\u5df1\u52a8\u624b"} 5 | {"url": "http://mindhacks.cn/2009/10/05/im-a-tiny-bird-book-review/", "created": "2009-10-05T01:18:30+00:00", "site": "mindhacks", "title": "\u4e0d\u662f\u4e66\u8bc4 \uff1a\u300a\u6211\u662f\u4e00\u53eaIT\u5c0f\u5c0f\u9e1f\u300b"} 6 | {"url": "http://mindhacks.cn/2009/12/20/dark-time/", "created": "2009-12-20T13:39:00+00:00", "site": "mindhacks", "title": "\u6697\u65f6\u95f4"} 7 | {"url": "http://mindhacks.cn/2010/03/18/escape-from-your-shawshank-part3/", "created": "2010-03-18T00:28:14+00:00", "site": "mindhacks", "title": "\u9003\u51fa\u4f60\u7684\u8096\u7533\u514b\uff08\u4e09\uff09\uff1a\u9047\u89c120\u4e07\u5e74\u524d\u7684\u81ea\u5df1"} 8 | {"url": "http://mindhacks.cn/2010/11/14/the-importance-of-knowing-why-part2/", "created": "2010-11-14T17:41:00+00:00", "site": "mindhacks", "title": "\u77e5\u5176\u6240\u4ee5\u7136\uff08\u7eed\uff09"} 9 | {"url": "http://mindhacks.cn/2011/01/23/escape-from-your-shawshank-4/", "created": "2011-01-23T20:28:34+00:00", "site": "mindhacks", "title": "\u9003\u51fa\u4f60\u7684\u8096\u7533\u514b\uff08\u56db\uff09\uff1a\u7406\u667a\u4e0e\u60c5\u611f"} 10 | {"url": "http://mindhacks.cn/2011/07/10/the-importance-of-knowing-why-part3/", "created": "2011-07-10T00:24:32+00:00", "site": "mindhacks", "title": "\u77e5\u5176\u6240\u4ee5\u7136\uff08\u4e09\uff09\uff1a\u4e3a\u4ec0\u4e48\u7b97\u6cd5\u8fd9\u4e48\u96be\uff1f"} 11 | {"url": "http://mindhacks.cn/2011/11/04/how-to-interview-a-person-for-two-years/", "created": "2011-11-04T01:18:00+00:00", "site": "mindhacks", "title": "\u600e\u6837\u82b1\u4e24\u5e74\u65f6\u95f4\u53bb\u9762\u8bd5\u4e00\u4e2a\u4eba"} 12 | {"url": "http://mindhacks.cn/2012/06/04/escape-from-your-shawshank-part5-the-invisible-cage/", "created": "2012-06-04T23:10:49+00:00", "site": "mindhacks", "title": "\u9003\u51fa\u4f60\u7684\u8096\u7533\u514b\uff08\u4e94\uff09\uff1a\u770b\u4e0d\u89c1\u7684\u7262\u7b3c\uff08\u4e0a\uff09"} 13 | {"url": "http://mindhacks.cn/2012/08/27/modern-cpp-practices/", "created": "2012-08-27T14:09:47+00:00", "site": "mindhacks", "title": "C++11\uff08\u53ca\u73b0\u4ee3C++\u98ce\u683c\uff09\u548c\u5feb\u901f\u8fed\u4ee3\u5f0f\u5f00\u53d1"} 14 | {"url": "http://mindhacks.cn/2009/02/15/why-you-should-start-blogging-now/", "created": "2009-02-15T19:57:26+00:00", "site": "mindhacks", "title": "[BetterExplained]\u4e3a\u4ec0\u4e48\u4f60\u5e94\u8be5\uff08\u4ece\u73b0\u5728\u5f00\u59cb\u5c31\uff09\u5199\u535a\u5ba2"} 15 | {"url": "http://mindhacks.cn/2009/03/09/first-principles-of-programming/", "created": "2009-03-09T15:12:00+00:00", "site": "mindhacks", "title": "\u7f16\u7a0b\u7684\u9996\u8981\u539f\u5219(s)\u662f\u4ec0\u4e48\uff1f"} 16 | {"url": "http://mindhacks.cn/2009/03/15/preconception-explained/", "created": "2009-03-15T18:49:27+00:00", "site": "mindhacks", "title": "\u9003\u51fa\u4f60\u7684\u8096\u7533\u514b\uff08\u4e8c\uff09\uff1a\u4ec1\u8005\u89c1\u4ec1\u667a\u8005\u89c1\u667a\uff1f\u4ece\u89c6\u89c9\u9519\u89c9\u5230\u504f\u89c1"} 17 | {"url": "http://mindhacks.cn/2009/03/28/effective-learning-and-memorization/", "created": "2009-03-28T19:23:02+00:00", "site": "mindhacks", "title": 
"[BetterExplained]\u5982\u4f55\u6709\u6548\u5730\u8bb0\u5fc6\u4e0e\u5b66\u4e60"} 18 | {"url": "http://mindhacks.cn/2009/05/17/seven-years-in-nju/", "created": "2009-05-17T23:57:30+00:00", "site": "mindhacks", "title": "\u6211\u5728\u5357\u5927\u7684\u4e03\u5e74"} 19 | {"url": "http://mindhacks.cn/2009/07/06/why-you-should-do-it-yourself/", "created": "2009-07-06T21:48:07+00:00", "site": "mindhacks", "title": "[BetterExplained]\u9047\u5230\u95ee\u9898\u4e3a\u4ec0\u4e48\u5e94\u8be5\u81ea\u5df1\u52a8\u624b"} 20 | {"url": "http://mindhacks.cn/2009/10/05/im-a-tiny-bird-book-review/", "created": "2009-10-05T01:18:30+00:00", "site": "mindhacks", "title": "\u4e0d\u662f\u4e66\u8bc4 \uff1a\u300a\u6211\u662f\u4e00\u53eaIT\u5c0f\u5c0f\u9e1f\u300b"} 21 | {"url": "http://mindhacks.cn/2009/12/20/dark-time/", "created": "2009-12-20T13:39:00+00:00", "site": "mindhacks", "title": "\u6697\u65f6\u95f4"} 22 | {"url": "http://mindhacks.cn/2010/03/18/escape-from-your-shawshank-part3/", "created": "2010-03-18T00:28:14+00:00", "site": "mindhacks", "title": "\u9003\u51fa\u4f60\u7684\u8096\u7533\u514b\uff08\u4e09\uff09\uff1a\u9047\u89c120\u4e07\u5e74\u524d\u7684\u81ea\u5df1"} 23 | {"url": "http://mindhacks.cn/2010/11/14/the-importance-of-knowing-why-part2/", "created": "2010-11-14T17:41:00+00:00", "site": "mindhacks", "title": "\u77e5\u5176\u6240\u4ee5\u7136\uff08\u7eed\uff09"} 24 | {"url": "http://mindhacks.cn/2011/01/23/escape-from-your-shawshank-4/", "created": "2011-01-23T20:28:34+00:00", "site": "mindhacks", "title": "\u9003\u51fa\u4f60\u7684\u8096\u7533\u514b\uff08\u56db\uff09\uff1a\u7406\u667a\u4e0e\u60c5\u611f"} 25 | {"url": "http://mindhacks.cn/2011/07/10/the-importance-of-knowing-why-part3/", "created": "2011-07-10T00:24:32+00:00", "site": "mindhacks", "title": "\u77e5\u5176\u6240\u4ee5\u7136\uff08\u4e09\uff09\uff1a\u4e3a\u4ec0\u4e48\u7b97\u6cd5\u8fd9\u4e48\u96be\uff1f"} 26 | {"url": "http://mindhacks.cn/2009/01/16/hammers-and-nails/", "created": "2009-01-16T21:25:00+00:00", "site": "mindhacks", "title": "\u9524\u5b50\u548c\u9489\u5b50"} 27 | {"url": "http://mindhacks.cn/2009/01/18/escape-from-your-shawshank-part1/", "created": "2009-01-18T21:32:00+00:00", "site": "mindhacks", "title": "\u9003\u51fa\u4f60\u7684\u8096\u7533\u514b\uff08\u4e00\uff09\uff1a\u4e3a\u4ec0\u4e48\u4e00\u5b9a\u8981\u4eb2\u8eab\u7ecf\u5386\u4e86\u4e4b\u540e\u624d\u80fd\u660e\u767d\uff1f"} 28 | {"url": "http://mindhacks.cn/2009/02/07/independence-day/", "created": "2009-02-07T12:57:34+00:00", "site": "mindhacks", "title": "\u72ec\u7acb\u65e5"} 29 | {"url": "http://mindhacks.cn/2009/02/07/better-explained-conflicts-in-intimate-relationship/", "created": "2009-02-07T20:35:17+00:00", "site": "mindhacks", "title": "[BetterExplained]\u4eb2\u5bc6\u5173\u7cfb\u4e2d\u7684\u51b2\u7a81\u89e3\u51b3"} 30 | {"url": "http://mindhacks.cn/2009/02/09/writing-is-better-thinking/", "created": "2009-02-09T22:24:00+00:00", "site": "mindhacks", "title": "[BetterExplained]\u4e66\u5199\u662f\u4e3a\u4e86\u66f4\u597d\u7684\u601d\u8003"} 31 | {"url": "http://mindhacks.cn/2009/02/15/why-you-should-start-blogging-now/", "created": "2009-02-15T19:57:26+00:00", "site": "mindhacks", "title": "[BetterExplained]\u4e3a\u4ec0\u4e48\u4f60\u5e94\u8be5\uff08\u4ece\u73b0\u5728\u5f00\u59cb\u5c31\uff09\u5199\u535a\u5ba2"} 32 | {"url": "http://mindhacks.cn/2009/03/09/first-principles-of-programming/", "created": "2009-03-09T15:12:00+00:00", "site": "mindhacks", "title": "\u7f16\u7a0b\u7684\u9996\u8981\u539f\u5219(s)\u662f\u4ec0\u4e48\uff1f"} 33 | {"url": 
"http://mindhacks.cn/2009/03/15/preconception-explained/", "created": "2009-03-15T18:49:27+00:00", "site": "mindhacks", "title": "\u9003\u51fa\u4f60\u7684\u8096\u7533\u514b\uff08\u4e8c\uff09\uff1a\u4ec1\u8005\u89c1\u4ec1\u667a\u8005\u89c1\u667a\uff1f\u4ece\u89c6\u89c9\u9519\u89c9\u5230\u504f\u89c1"} 34 | {"url": "http://mindhacks.cn/2009/03/28/effective-learning-and-memorization/", "created": "2009-03-28T19:23:02+00:00", "site": "mindhacks", "title": "[BetterExplained]\u5982\u4f55\u6709\u6548\u5730\u8bb0\u5fc6\u4e0e\u5b66\u4e60"} 35 | {"url": "http://mindhacks.cn/2009/05/17/seven-years-in-nju/", "created": "2009-05-17T23:57:30+00:00", "site": "mindhacks", "title": "\u6211\u5728\u5357\u5927\u7684\u4e03\u5e74"} 36 | {"url": "http://mindhacks.cn/2008/07/07/the-importance-of-knowing-why/", "created": "2008-07-07T21:05:00+00:00", "site": "mindhacks", "title": "\u77e5\u5176\u6240\u4ee5\u7136\uff08\u4ee5\u7b97\u6cd5\u5b66\u4e60\u4e3a\u4f8b\uff09"} 37 | {"url": "http://mindhacks.cn/2008/07/08/learning-habits-part1/", "created": "2008-07-08T21:13:00+00:00", "site": "mindhacks", "title": "\u4e00\u76f4\u4ee5\u6765\u4f34\u968f\u6211\u7684\u4e00\u4e9b\u5b66\u4e60\u4e60\u60ef(\u4e00)\uff1a\u5b66\u4e60\u4e0e\u601d\u8003"} 38 | {"url": "http://mindhacks.cn/2008/07/20/learning-habits-part2/", "created": "2008-07-20T21:16:00+00:00", "site": "mindhacks", "title": "\u4e00\u76f4\u4ee5\u6765\u4f34\u968f\u6211\u7684\u4e00\u4e9b\u5b66\u4e60\u4e60\u60ef(\u4e8c)\uff1a\u65f6\u95f4\u7ba1\u7406"} 39 | {"url": "http://mindhacks.cn/2008/09/11/machine-learning-and-ai-resources/", "created": "2008-09-11T19:29:00+00:00", "site": "mindhacks", "title": "\u673a\u5668\u5b66\u4e60\u4e0e\u4eba\u5de5\u667a\u80fd\u5b66\u4e60\u8d44\u6e90\u5bfc\u5f15"} 40 | {"url": "http://mindhacks.cn/2008/09/17/learning-habits-part3/", "created": "2008-09-17T21:18:00+00:00", "site": "mindhacks", "title": "\u4e00\u76f4\u4ee5\u6765\u4f34\u968f\u6211\u7684\u4e00\u4e9b\u5b66\u4e60\u4e60\u60ef(\u4e09)\uff1a\u9605\u8bfb\u65b9\u6cd5"} 41 | {"url": "http://mindhacks.cn/2008/09/21/the-magical-bayesian-method/", "created": "2008-09-21T19:34:00+00:00", "site": "mindhacks", "title": "\u6570\u5b66\u4e4b\u7f8e\u756a\u5916\u7bc7\uff1a\u5e73\u51e1\u800c\u53c8\u795e\u5947\u7684\u8d1d\u53f6\u65af\u65b9\u6cd5"} 42 | {"url": "http://mindhacks.cn/2008/10/29/methodology-for-programmers/", "created": "2008-10-29T21:09:00+00:00", "site": "mindhacks", "title": "\u65b9\u6cd5\u8bba\u3001\u65b9\u6cd5\u8bba\u2014\u2014\u7a0b\u5e8f\u5458\u7684\u963f\u5580\u7409\u65af\u4e4b\u8e35"} 43 | {"url": "http://mindhacks.cn/2008/12/05/learning-habits-part4/", "created": "2008-12-05T21:21:00+00:00", "site": "mindhacks", "title": "\u4e00\u76f4\u4ee5\u6765\u4f34\u968f\u6211\u7684\u4e00\u4e9b\u5b66\u4e60\u4e60\u60ef(\u56db)\uff1a\u77e5\u8bc6\u7ed3\u6784"} 44 | {"url": "http://mindhacks.cn/2008/12/18/how-to-think-straight/", "created": "2008-12-18T20:10:00+00:00", "site": "mindhacks", "title": "\u5982\u4f55\u6e05\u6670\u5730\u601d\u8003\uff08\u8fd1\u4e00\u5e74\u6765\u4e1a\u4f59\u9605\u8bfb\u7684\u5173\u4e8e\u601d\u7ef4\u65b9\u9762\u7684\u77e5\u8bc6\u7ed3\u6784\u6574\u7406\uff09"} 45 | {"url": "http://mindhacks.cn/2009/01/14/make-yourself-irreplacable/", "created": "2009-01-14T21:28:00+00:00", "site": "mindhacks", "title": "\u4ec0\u4e48\u624d\u662f\u4f60\u7684\u4e0d\u53ef\u66ff\u4ee3\u6027\u548c\u6838\u5fc3\u7ade\u4e89\u529b"} 46 | {"url": "http://mindhacks.cn/2006/10/15/cantor-godel-turing-an-eternal-golden-diagonal/", "created": "2006-10-15T19:16:00+00:00", "site": "mindhacks", "title": 
"\u5eb7\u6258\u5c14\u3001\u54e5\u5fb7\u5c14\u3001\u56fe\u7075\u2014\u2014\u6c38\u6052\u7684\u91d1\u8272\u5bf9\u89d2\u7ebf(rev#2)"} 47 | {"url": "http://mindhacks.cn/2007/05/24/learn-to-focus/", "created": "2007-05-24T20:30:00+00:00", "site": "mindhacks", "title": "\u5b66\u4e60\u5bc6\u5ea6\u4e0e\u4e13\u6ce8\u529b"} 48 | {"url": "http://mindhacks.cn/2007/12/02/probability-theory-in-evolution/", "created": "2007-12-02T18:55:00+00:00", "site": "mindhacks", "title": "\u6570\u5b66\u4e4b\u7f8e\u756a\u5916\u7bc7\uff1a\u8fdb\u5316\u8bba\u4e2d\u7684\u6982\u7387\u8bba"} 49 | {"url": "http://mindhacks.cn/2008/03/03/failing-to-see-the-big-picture/", "created": "2008-03-03T15:42:00+00:00", "site": "mindhacks", "title": "Failing To See the Big Picture \u2013 Mistakes we make when learning programming"} 50 | {"url": "http://mindhacks.cn/2008/04/08/reading-method/", "created": "2008-04-08T21:00:00+00:00", "site": "mindhacks", "title": "\u9605\u8bfb\u4e0e\u601d\u8003"} 51 | {"url": "http://mindhacks.cn/2008/04/18/learning-from-polya/", "created": "2008-04-18T21:37:00+00:00", "site": "mindhacks", "title": "\u8ddf\u6ce2\u5229\u4e9a\u5b66\u89e3\u9898(rev#3)"} 52 | {"url": "http://mindhacks.cn/2008/06/05/how-memory-works/", "created": "2008-06-05T21:40:00+00:00", "site": "mindhacks", "title": "\u5b66\u4e60\u4e0e\u8bb0\u5fc6"} 53 | {"url": "http://mindhacks.cn/2008/06/13/why-is-quicksort-so-quick/", "created": "2008-06-13T19:53:00+00:00", "site": "mindhacks", "title": "\u6570\u5b66\u4e4b\u7f8e\u756a\u5916\u7bc7\uff1a\u5feb\u6392\u4e3a\u4ec0\u4e48\u90a3\u6837\u5feb"} 54 | -------------------------------------------------------------------------------- /newsspider/.scrapy/scrapyd/logs/default/all/b29488fe87d411e2adba7e97b6ad9650.log: -------------------------------------------------------------------------------- 1 | 2013-03-08 17:43:58+0800 [scrapy] INFO: Scrapy 0.17.0 started (bot: Baiduspider) 2 | 2013-03-08 17:43:58+0800 [scrapy] DEBUG: Enabled extensions: FeedExporter, LogStats, TelnetConsole, CloseSpider, WebService, CoreStats, SpiderState 3 | 2013-03-08 17:43:58+0800 [scrapy] DEBUG: Enabled downloader middlewares: HttpAuthMiddleware, DownloadTimeoutMiddleware, UserAgentMiddleware, RetryMiddleware, DefaultHeadersMiddleware, MetaRefreshMiddleware, HttpCompressionMiddleware, RedirectMiddleware, CookiesMiddleware, ChunkedTransferMiddleware, DownloaderStats 4 | 2013-03-08 17:43:58+0800 [scrapy] DEBUG: Enabled spider middlewares: HttpErrorMiddleware, OffsiteMiddleware, RefererMiddleware, UrlLengthMiddleware, DepthMiddleware 5 | 2013-03-08 17:43:58+0800 [scrapy] DEBUG: Enabled item pipelines: NewsspiderPipeline 6 | -------------------------------------------------------------------------------- /newsspider/.scrapy/scrapyd/logs/default/mindhacks/ff9b53c687d411e2adba7e97b6ad9650.log: -------------------------------------------------------------------------------- 1 | 2013-03-08 17:46:08+0800 [scrapy] INFO: Scrapy 0.17.0 started (bot: Baiduspider) 2 | 2013-03-08 17:46:08+0800 [scrapy] DEBUG: Enabled extensions: FeedExporter, LogStats, TelnetConsole, CloseSpider, WebService, CoreStats, SpiderState 3 | 2013-03-08 17:46:08+0800 [scrapy] DEBUG: Enabled downloader middlewares: HttpAuthMiddleware, DownloadTimeoutMiddleware, UserAgentMiddleware, RetryMiddleware, DefaultHeadersMiddleware, MetaRefreshMiddleware, HttpCompressionMiddleware, RedirectMiddleware, CookiesMiddleware, ChunkedTransferMiddleware, DownloaderStats 4 | 2013-03-08 17:46:08+0800 [scrapy] DEBUG: Enabled spider middlewares: HttpErrorMiddleware, 
OffsiteMiddleware, RefererMiddleware, UrlLengthMiddleware, DepthMiddleware 5 | 2013-03-08 17:46:08+0800 [scrapy] DEBUG: Enabled item pipelines: NewsspiderPipeline 6 | 2013-03-08 17:46:08+0800 [mindhacks] INFO: Spider opened 7 | 2013-03-08 17:46:08+0800 [mindhacks] INFO: Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min) 8 | 2013-03-08 17:46:08+0800 [scrapy] DEBUG: Telnet console listening on 0.0.0.0:6023 9 | 2013-03-08 17:46:08+0800 [scrapy] DEBUG: Web service listening on 0.0.0.0:6080 10 | 2013-03-08 17:46:08+0800 [mindhacks] DEBUG: Redirecting (301) to from 11 | 2013-03-08 17:46:08+0800 [mindhacks] DEBUG: Crawled (200) (referer: None) 12 | 2013-03-08 17:46:09+0800 [mindhacks] DEBUG: Crawled (200) (referer: http://mindhacks.cn/) 13 | 2013-03-08 17:46:09+0800 [mindhacks] DEBUG: Scraped from <200 http://mindhacks.cn/2011/11/04/how-to-interview-a-person-for-two-years/> 14 | {'created': u'2011-11-04T01:18:00+00:00', 15 | 'site': 'mindhacks', 16 | 'title': u'\u600e\u6837\u82b1\u4e24\u5e74\u65f6\u95f4\u53bb\u9762\u8bd5\u4e00\u4e2a\u4eba', 17 | 'url': u'http://mindhacks.cn/2011/11/04/how-to-interview-a-person-for-two-years/'} 18 | 2013-03-08 17:46:09+0800 [mindhacks] DEBUG: Crawled (200) (referer: http://mindhacks.cn/) 19 | 2013-03-08 17:46:09+0800 [mindhacks] DEBUG: Scraped from <200 http://mindhacks.cn/2012/06/04/escape-from-your-shawshank-part5-the-invisible-cage/> 20 | {'created': u'2012-06-04T23:10:49+00:00', 21 | 'site': 'mindhacks', 22 | 'title': u'\u9003\u51fa\u4f60\u7684\u8096\u7533\u514b\uff08\u4e94\uff09\uff1a\u770b\u4e0d\u89c1\u7684\u7262\u7b3c\uff08\u4e0a\uff09', 23 | 'url': u'http://mindhacks.cn/2012/06/04/escape-from-your-shawshank-part5-the-invisible-cage/'} 24 | 2013-03-08 17:46:09+0800 [mindhacks] DEBUG: Crawled (200) (referer: http://mindhacks.cn/) 25 | 2013-03-08 17:46:10+0800 [mindhacks] DEBUG: Scraped from <200 http://mindhacks.cn/2012/08/27/modern-cpp-practices/> 26 | {'created': u'2012-08-27T14:09:47+00:00', 27 | 'site': 'mindhacks', 28 | 'title': u'C++11\uff08\u53ca\u73b0\u4ee3C++\u98ce\u683c\uff09\u548c\u5feb\u901f\u8fed\u4ee3\u5f0f\u5f00\u53d1', 29 | 'url': u'http://mindhacks.cn/2012/08/27/modern-cpp-practices/'} 30 | 2013-03-08 17:46:10+0800 [mindhacks] DEBUG: Crawled (200) (referer: http://mindhacks.cn/) 31 | 2013-03-08 17:46:10+0800 [mindhacks] DEBUG: Scraped from <200 http://mindhacks.cn/2009/07/06/why-you-should-do-it-yourself/> 32 | {'created': u'2009-07-06T21:48:07+00:00', 33 | 'site': 'mindhacks', 34 | 'title': u'[BetterExplained]\u9047\u5230\u95ee\u9898\u4e3a\u4ec0\u4e48\u5e94\u8be5\u81ea\u5df1\u52a8\u624b', 35 | 'url': u'http://mindhacks.cn/2009/07/06/why-you-should-do-it-yourself/'} 36 | 2013-03-08 17:46:10+0800 [mindhacks] DEBUG: Crawled (200) (referer: http://mindhacks.cn/) 37 | 2013-03-08 17:46:10+0800 [mindhacks] DEBUG: Scraped from <200 http://mindhacks.cn/2009/10/05/im-a-tiny-bird-book-review/> 38 | {'created': u'2009-10-05T01:18:30+00:00', 39 | 'site': 'mindhacks', 40 | 'title': u'\u4e0d\u662f\u4e66\u8bc4 \uff1a\u300a\u6211\u662f\u4e00\u53eaIT\u5c0f\u5c0f\u9e1f\u300b', 41 | 'url': u'http://mindhacks.cn/2009/10/05/im-a-tiny-bird-book-review/'} 42 | 2013-03-08 17:46:10+0800 [mindhacks] DEBUG: Crawled (200) (referer: http://mindhacks.cn/) 43 | 2013-03-08 17:46:11+0800 [mindhacks] DEBUG: Scraped from <200 http://mindhacks.cn/2009/12/20/dark-time/> 44 | {'created': u'2009-12-20T13:39:00+00:00', 45 | 'site': 'mindhacks', 46 | 'title': u'\u6697\u65f6\u95f4', 47 | 'url': u'http://mindhacks.cn/2009/12/20/dark-time/'} 48 | 2013-03-08 
17:46:11+0800 [mindhacks] DEBUG: Crawled (200) (referer: http://mindhacks.cn/) 49 | 2013-03-08 17:46:11+0800 [mindhacks] DEBUG: Scraped from <200 http://mindhacks.cn/2010/03/18/escape-from-your-shawshank-part3/> 50 | {'created': u'2010-03-18T00:28:14+00:00', 51 | 'site': 'mindhacks', 52 | 'title': u'\u9003\u51fa\u4f60\u7684\u8096\u7533\u514b\uff08\u4e09\uff09\uff1a\u9047\u89c120\u4e07\u5e74\u524d\u7684\u81ea\u5df1', 53 | 'url': u'http://mindhacks.cn/2010/03/18/escape-from-your-shawshank-part3/'} 54 | 2013-03-08 17:46:11+0800 [mindhacks] DEBUG: Crawled (200) (referer: http://mindhacks.cn/) 55 | 2013-03-08 17:46:11+0800 [mindhacks] DEBUG: Scraped from <200 http://mindhacks.cn/2010/11/14/the-importance-of-knowing-why-part2/> 56 | {'created': u'2010-11-14T17:41:00+00:00', 57 | 'site': 'mindhacks', 58 | 'title': u'\u77e5\u5176\u6240\u4ee5\u7136\uff08\u7eed\uff09', 59 | 'url': u'http://mindhacks.cn/2010/11/14/the-importance-of-knowing-why-part2/'} 60 | 2013-03-08 17:46:12+0800 [mindhacks] DEBUG: Crawled (200) (referer: http://mindhacks.cn/) 61 | 2013-03-08 17:46:12+0800 [mindhacks] DEBUG: Scraped from <200 http://mindhacks.cn/2011/01/23/escape-from-your-shawshank-4/> 62 | {'created': u'2011-01-23T20:28:34+00:00', 63 | 'site': 'mindhacks', 64 | 'title': u'\u9003\u51fa\u4f60\u7684\u8096\u7533\u514b\uff08\u56db\uff09\uff1a\u7406\u667a\u4e0e\u60c5\u611f', 65 | 'url': u'http://mindhacks.cn/2011/01/23/escape-from-your-shawshank-4/'} 66 | 2013-03-08 17:46:12+0800 [mindhacks] DEBUG: Crawled (200) (referer: http://mindhacks.cn/) 67 | 2013-03-08 17:46:12+0800 [mindhacks] DEBUG: Scraped from <200 http://mindhacks.cn/2011/07/10/the-importance-of-knowing-why-part3/> 68 | {'created': u'2011-07-10T00:24:32+00:00', 69 | 'site': 'mindhacks', 70 | 'title': u'\u77e5\u5176\u6240\u4ee5\u7136\uff08\u4e09\uff09\uff1a\u4e3a\u4ec0\u4e48\u7b97\u6cd5\u8fd9\u4e48\u96be\uff1f', 71 | 'url': u'http://mindhacks.cn/2011/07/10/the-importance-of-knowing-why-part3/'} 72 | 2013-03-08 17:46:12+0800 [mindhacks] DEBUG: Crawled (200) (referer: http://mindhacks.cn/) 73 | 2013-03-08 17:46:12+0800 [mindhacks] DEBUG: Scraped from <200 http://mindhacks.cn/2011/11/04/how-to-interview-a-person-for-two-years/> 74 | {'created': u'2011-11-04T01:18:00+00:00', 75 | 'site': 'mindhacks', 76 | 'title': u'\u600e\u6837\u82b1\u4e24\u5e74\u65f6\u95f4\u53bb\u9762\u8bd5\u4e00\u4e2a\u4eba', 77 | 'url': u'http://mindhacks.cn/2011/11/04/how-to-interview-a-person-for-two-years/'} 78 | 2013-03-08 17:46:12+0800 [mindhacks] DEBUG: Crawled (200) (referer: http://mindhacks.cn/) 79 | 2013-03-08 17:46:12+0800 [mindhacks] DEBUG: Scraped from <200 http://mindhacks.cn/2012/06/04/escape-from-your-shawshank-part5-the-invisible-cage/> 80 | {'created': u'2012-06-04T23:10:49+00:00', 81 | 'site': 'mindhacks', 82 | 'title': u'\u9003\u51fa\u4f60\u7684\u8096\u7533\u514b\uff08\u4e94\uff09\uff1a\u770b\u4e0d\u89c1\u7684\u7262\u7b3c\uff08\u4e0a\uff09', 83 | 'url': u'http://mindhacks.cn/2012/06/04/escape-from-your-shawshank-part5-the-invisible-cage/'} 84 | 2013-03-08 17:46:13+0800 [mindhacks] DEBUG: Crawled (200) (referer: http://mindhacks.cn/) 85 | 2013-03-08 17:46:13+0800 [mindhacks] DEBUG: Scraped from <200 http://mindhacks.cn/2012/08/27/modern-cpp-practices/> 86 | {'created': u'2012-08-27T14:09:47+00:00', 87 | 'site': 'mindhacks', 88 | 'title': u'C++11\uff08\u53ca\u73b0\u4ee3C++\u98ce\u683c\uff09\u548c\u5feb\u901f\u8fed\u4ee3\u5f0f\u5f00\u53d1', 89 | 'url': u'http://mindhacks.cn/2012/08/27/modern-cpp-practices/'} 90 | 2013-03-08 17:46:13+0800 [mindhacks] DEBUG: Crawled (200) 
(referer: http://mindhacks.cn/) 91 | 2013-03-08 17:46:14+0800 [mindhacks] DEBUG: Crawled (200) (referer: http://mindhacks.cn/) 92 | 2013-03-08 17:46:14+0800 [mindhacks] DEBUG: Scraped from <200 http://mindhacks.cn/2009/02/15/why-you-should-start-blogging-now/> 93 | {'created': u'2009-02-15T19:57:26+00:00', 94 | 'site': 'mindhacks', 95 | 'title': u'[BetterExplained]\u4e3a\u4ec0\u4e48\u4f60\u5e94\u8be5\uff08\u4ece\u73b0\u5728\u5f00\u59cb\u5c31\uff09\u5199\u535a\u5ba2', 96 | 'url': u'http://mindhacks.cn/2009/02/15/why-you-should-start-blogging-now/'} 97 | 2013-03-08 17:46:14+0800 [mindhacks] DEBUG: Crawled (200) (referer: http://mindhacks.cn/) 98 | 2013-03-08 17:46:14+0800 [mindhacks] DEBUG: Scraped from <200 http://mindhacks.cn/2009/03/09/first-principles-of-programming/> 99 | {'created': u'2009-03-09T15:12:00+00:00', 100 | 'site': 'mindhacks', 101 | 'title': u'\u7f16\u7a0b\u7684\u9996\u8981\u539f\u5219(s)\u662f\u4ec0\u4e48\uff1f', 102 | 'url': u'http://mindhacks.cn/2009/03/09/first-principles-of-programming/'} 103 | 2013-03-08 17:46:14+0800 [mindhacks] DEBUG: Crawled (200) (referer: http://mindhacks.cn/) 104 | 2013-03-08 17:46:14+0800 [mindhacks] DEBUG: Scraped from <200 http://mindhacks.cn/2009/03/15/preconception-explained/> 105 | {'created': u'2009-03-15T18:49:27+00:00', 106 | 'site': 'mindhacks', 107 | 'title': u'\u9003\u51fa\u4f60\u7684\u8096\u7533\u514b\uff08\u4e8c\uff09\uff1a\u4ec1\u8005\u89c1\u4ec1\u667a\u8005\u89c1\u667a\uff1f\u4ece\u89c6\u89c9\u9519\u89c9\u5230\u504f\u89c1', 108 | 'url': u'http://mindhacks.cn/2009/03/15/preconception-explained/'} 109 | 2013-03-08 17:46:14+0800 [mindhacks] DEBUG: Crawled (200) (referer: http://mindhacks.cn/) 110 | 2013-03-08 17:46:14+0800 [mindhacks] DEBUG: Scraped from <200 http://mindhacks.cn/2009/03/28/effective-learning-and-memorization/> 111 | {'created': u'2009-03-28T19:23:02+00:00', 112 | 'site': 'mindhacks', 113 | 'title': u'[BetterExplained]\u5982\u4f55\u6709\u6548\u5730\u8bb0\u5fc6\u4e0e\u5b66\u4e60', 114 | 'url': u'http://mindhacks.cn/2009/03/28/effective-learning-and-memorization/'} 115 | 2013-03-08 17:46:15+0800 [mindhacks] DEBUG: Crawled (200) (referer: http://mindhacks.cn/) 116 | 2013-03-08 17:46:15+0800 [mindhacks] DEBUG: Scraped from <200 http://mindhacks.cn/2009/05/17/seven-years-in-nju/> 117 | {'created': u'2009-05-17T23:57:30+00:00', 118 | 'site': 'mindhacks', 119 | 'title': u'\u6211\u5728\u5357\u5927\u7684\u4e03\u5e74', 120 | 'url': u'http://mindhacks.cn/2009/05/17/seven-years-in-nju/'} 121 | 2013-03-08 17:46:15+0800 [mindhacks] DEBUG: Crawled (200) (referer: http://mindhacks.cn/) 122 | 2013-03-08 17:46:15+0800 [mindhacks] DEBUG: Scraped from <200 http://mindhacks.cn/2009/07/06/why-you-should-do-it-yourself/> 123 | {'created': u'2009-07-06T21:48:07+00:00', 124 | 'site': 'mindhacks', 125 | 'title': u'[BetterExplained]\u9047\u5230\u95ee\u9898\u4e3a\u4ec0\u4e48\u5e94\u8be5\u81ea\u5df1\u52a8\u624b', 126 | 'url': u'http://mindhacks.cn/2009/07/06/why-you-should-do-it-yourself/'} 127 | 2013-03-08 17:46:15+0800 [mindhacks] DEBUG: Crawled (200) (referer: http://mindhacks.cn/) 128 | 2013-03-08 17:46:15+0800 [mindhacks] DEBUG: Scraped from <200 http://mindhacks.cn/2009/10/05/im-a-tiny-bird-book-review/> 129 | {'created': u'2009-10-05T01:18:30+00:00', 130 | 'site': 'mindhacks', 131 | 'title': u'\u4e0d\u662f\u4e66\u8bc4 \uff1a\u300a\u6211\u662f\u4e00\u53eaIT\u5c0f\u5c0f\u9e1f\u300b', 132 | 'url': u'http://mindhacks.cn/2009/10/05/im-a-tiny-bird-book-review/'} 133 | 2013-03-08 17:46:16+0800 [mindhacks] DEBUG: Crawled (200) (referer: 
http://mindhacks.cn/) 134 | 2013-03-08 17:46:16+0800 [mindhacks] DEBUG: Scraped from <200 http://mindhacks.cn/2009/12/20/dark-time/> 135 | {'created': u'2009-12-20T13:39:00+00:00', 136 | 'site': 'mindhacks', 137 | 'title': u'\u6697\u65f6\u95f4', 138 | 'url': u'http://mindhacks.cn/2009/12/20/dark-time/'} 139 | 2013-03-08 17:46:16+0800 [mindhacks] DEBUG: Crawled (200) (referer: http://mindhacks.cn/) 140 | 2013-03-08 17:46:16+0800 [mindhacks] DEBUG: Scraped from <200 http://mindhacks.cn/2010/03/18/escape-from-your-shawshank-part3/> 141 | {'created': u'2010-03-18T00:28:14+00:00', 142 | 'site': 'mindhacks', 143 | 'title': u'\u9003\u51fa\u4f60\u7684\u8096\u7533\u514b\uff08\u4e09\uff09\uff1a\u9047\u89c120\u4e07\u5e74\u524d\u7684\u81ea\u5df1', 144 | 'url': u'http://mindhacks.cn/2010/03/18/escape-from-your-shawshank-part3/'} 145 | 2013-03-08 17:46:16+0800 [mindhacks] DEBUG: Crawled (200) (referer: http://mindhacks.cn/) 146 | 2013-03-08 17:46:16+0800 [mindhacks] DEBUG: Scraped from <200 http://mindhacks.cn/2010/11/14/the-importance-of-knowing-why-part2/> 147 | {'created': u'2010-11-14T17:41:00+00:00', 148 | 'site': 'mindhacks', 149 | 'title': u'\u77e5\u5176\u6240\u4ee5\u7136\uff08\u7eed\uff09', 150 | 'url': u'http://mindhacks.cn/2010/11/14/the-importance-of-knowing-why-part2/'} 151 | 2013-03-08 17:46:17+0800 [mindhacks] DEBUG: Crawled (200) (referer: http://mindhacks.cn/) 152 | 2013-03-08 17:46:17+0800 [mindhacks] DEBUG: Scraped from <200 http://mindhacks.cn/2011/01/23/escape-from-your-shawshank-4/> 153 | {'created': u'2011-01-23T20:28:34+00:00', 154 | 'site': 'mindhacks', 155 | 'title': u'\u9003\u51fa\u4f60\u7684\u8096\u7533\u514b\uff08\u56db\uff09\uff1a\u7406\u667a\u4e0e\u60c5\u611f', 156 | 'url': u'http://mindhacks.cn/2011/01/23/escape-from-your-shawshank-4/'} 157 | 2013-03-08 17:46:17+0800 [mindhacks] DEBUG: Crawled (200) (referer: http://mindhacks.cn/) 158 | 2013-03-08 17:46:17+0800 [mindhacks] DEBUG: Crawled (200) (referer: http://mindhacks.cn/page/2/) 159 | 2013-03-08 17:46:17+0800 [mindhacks] DEBUG: Scraped from <200 http://mindhacks.cn/2011/07/10/the-importance-of-knowing-why-part3/> 160 | {'created': u'2011-07-10T00:24:32+00:00', 161 | 'site': 'mindhacks', 162 | 'title': u'\u77e5\u5176\u6240\u4ee5\u7136\uff08\u4e09\uff09\uff1a\u4e3a\u4ec0\u4e48\u7b97\u6cd5\u8fd9\u4e48\u96be\uff1f', 163 | 'url': u'http://mindhacks.cn/2011/07/10/the-importance-of-knowing-why-part3/'} 164 | 2013-03-08 17:46:17+0800 [mindhacks] DEBUG: Crawled (200) (referer: http://mindhacks.cn/page/2/) 165 | 2013-03-08 17:46:17+0800 [mindhacks] DEBUG: Scraped from <200 http://mindhacks.cn/2009/01/16/hammers-and-nails/> 166 | {'created': u'2009-01-16T21:25:00+00:00', 167 | 'site': 'mindhacks', 168 | 'title': u'\u9524\u5b50\u548c\u9489\u5b50', 169 | 'url': u'http://mindhacks.cn/2009/01/16/hammers-and-nails/'} 170 | 2013-03-08 17:46:18+0800 [mindhacks] DEBUG: Crawled (200) (referer: http://mindhacks.cn/page/2/) 171 | 2013-03-08 17:46:18+0800 [mindhacks] DEBUG: Scraped from <200 http://mindhacks.cn/2009/01/18/escape-from-your-shawshank-part1/> 172 | {'created': u'2009-01-18T21:32:00+00:00', 173 | 'site': 'mindhacks', 174 | 'title': u'\u9003\u51fa\u4f60\u7684\u8096\u7533\u514b\uff08\u4e00\uff09\uff1a\u4e3a\u4ec0\u4e48\u4e00\u5b9a\u8981\u4eb2\u8eab\u7ecf\u5386\u4e86\u4e4b\u540e\u624d\u80fd\u660e\u767d\uff1f', 175 | 'url': u'http://mindhacks.cn/2009/01/18/escape-from-your-shawshank-part1/'} 176 | 2013-03-08 17:46:18+0800 [mindhacks] DEBUG: Crawled (200) (referer: http://mindhacks.cn/page/2/) 177 | 2013-03-08 17:46:18+0800 
[mindhacks] DEBUG: Scraped from <200 http://mindhacks.cn/2009/02/07/independence-day/> 178 | {'created': u'2009-02-07T12:57:34+00:00', 179 | 'site': 'mindhacks', 180 | 'title': u'\u72ec\u7acb\u65e5', 181 | 'url': u'http://mindhacks.cn/2009/02/07/independence-day/'} 182 | 2013-03-08 17:46:18+0800 [mindhacks] DEBUG: Crawled (200) (referer: http://mindhacks.cn/page/2/) 183 | 2013-03-08 17:46:18+0800 [mindhacks] DEBUG: Scraped from <200 http://mindhacks.cn/2009/02/07/better-explained-conflicts-in-intimate-relationship/> 184 | {'created': u'2009-02-07T20:35:17+00:00', 185 | 'site': 'mindhacks', 186 | 'title': u'[BetterExplained]\u4eb2\u5bc6\u5173\u7cfb\u4e2d\u7684\u51b2\u7a81\u89e3\u51b3', 187 | 'url': u'http://mindhacks.cn/2009/02/07/better-explained-conflicts-in-intimate-relationship/'} 188 | 2013-03-08 17:46:19+0800 [mindhacks] DEBUG: Crawled (200) (referer: http://mindhacks.cn/page/2/) 189 | 2013-03-08 17:46:19+0800 [mindhacks] DEBUG: Scraped from <200 http://mindhacks.cn/2009/02/09/writing-is-better-thinking/> 190 | {'created': u'2009-02-09T22:24:00+00:00', 191 | 'site': 'mindhacks', 192 | 'title': u'[BetterExplained]\u4e66\u5199\u662f\u4e3a\u4e86\u66f4\u597d\u7684\u601d\u8003', 193 | 'url': u'http://mindhacks.cn/2009/02/09/writing-is-better-thinking/'} 194 | 2013-03-08 17:46:19+0800 [mindhacks] DEBUG: Crawled (200) (referer: http://mindhacks.cn/page/2/) 195 | 2013-03-08 17:46:19+0800 [mindhacks] DEBUG: Scraped from <200 http://mindhacks.cn/2009/02/15/why-you-should-start-blogging-now/> 196 | {'created': u'2009-02-15T19:57:26+00:00', 197 | 'site': 'mindhacks', 198 | 'title': u'[BetterExplained]\u4e3a\u4ec0\u4e48\u4f60\u5e94\u8be5\uff08\u4ece\u73b0\u5728\u5f00\u59cb\u5c31\uff09\u5199\u535a\u5ba2', 199 | 'url': u'http://mindhacks.cn/2009/02/15/why-you-should-start-blogging-now/'} 200 | 2013-03-08 17:46:19+0800 [mindhacks] DEBUG: Crawled (200) (referer: http://mindhacks.cn/page/2/) 201 | 2013-03-08 17:46:19+0800 [mindhacks] DEBUG: Scraped from <200 http://mindhacks.cn/2009/03/09/first-principles-of-programming/> 202 | {'created': u'2009-03-09T15:12:00+00:00', 203 | 'site': 'mindhacks', 204 | 'title': u'\u7f16\u7a0b\u7684\u9996\u8981\u539f\u5219(s)\u662f\u4ec0\u4e48\uff1f', 205 | 'url': u'http://mindhacks.cn/2009/03/09/first-principles-of-programming/'} 206 | 2013-03-08 17:46:20+0800 [mindhacks] DEBUG: Crawled (200) (referer: http://mindhacks.cn/page/2/) 207 | 2013-03-08 17:46:20+0800 [mindhacks] DEBUG: Scraped from <200 http://mindhacks.cn/2009/03/15/preconception-explained/> 208 | {'created': u'2009-03-15T18:49:27+00:00', 209 | 'site': 'mindhacks', 210 | 'title': u'\u9003\u51fa\u4f60\u7684\u8096\u7533\u514b\uff08\u4e8c\uff09\uff1a\u4ec1\u8005\u89c1\u4ec1\u667a\u8005\u89c1\u667a\uff1f\u4ece\u89c6\u89c9\u9519\u89c9\u5230\u504f\u89c1', 211 | 'url': u'http://mindhacks.cn/2009/03/15/preconception-explained/'} 212 | 2013-03-08 17:46:20+0800 [mindhacks] DEBUG: Crawled (200) (referer: http://mindhacks.cn/page/2/) 213 | 2013-03-08 17:46:20+0800 [mindhacks] DEBUG: Scraped from <200 http://mindhacks.cn/2009/03/28/effective-learning-and-memorization/> 214 | {'created': u'2009-03-28T19:23:02+00:00', 215 | 'site': 'mindhacks', 216 | 'title': u'[BetterExplained]\u5982\u4f55\u6709\u6548\u5730\u8bb0\u5fc6\u4e0e\u5b66\u4e60', 217 | 'url': u'http://mindhacks.cn/2009/03/28/effective-learning-and-memorization/'} 218 | 2013-03-08 17:46:20+0800 [mindhacks] DEBUG: Crawled (200) (referer: http://mindhacks.cn/page/2/) 219 | 2013-03-08 17:46:20+0800 [mindhacks] DEBUG: Scraped from <200 
http://mindhacks.cn/2009/05/17/seven-years-in-nju/> 220 | {'created': u'2009-05-17T23:57:30+00:00', 221 | 'site': 'mindhacks', 222 | 'title': u'\u6211\u5728\u5357\u5927\u7684\u4e03\u5e74', 223 | 'url': u'http://mindhacks.cn/2009/05/17/seven-years-in-nju/'} 224 | 2013-03-08 17:46:20+0800 [mindhacks] DEBUG: Crawled (200) (referer: http://mindhacks.cn/page/3/) 225 | 2013-03-08 17:46:21+0800 [mindhacks] DEBUG: Crawled (200) (referer: http://mindhacks.cn/page/3/) 226 | 2013-03-08 17:46:21+0800 [mindhacks] DEBUG: Scraped from <200 http://mindhacks.cn/2008/07/07/the-importance-of-knowing-why/> 227 | {'created': u'2008-07-07T21:05:00+00:00', 228 | 'site': 'mindhacks', 229 | 'title': u'\u77e5\u5176\u6240\u4ee5\u7136\uff08\u4ee5\u7b97\u6cd5\u5b66\u4e60\u4e3a\u4f8b\uff09', 230 | 'url': u'http://mindhacks.cn/2008/07/07/the-importance-of-knowing-why/'} 231 | 2013-03-08 17:46:21+0800 [mindhacks] DEBUG: Crawled (200) (referer: http://mindhacks.cn/page/3/) 232 | 2013-03-08 17:46:21+0800 [mindhacks] DEBUG: Scraped from <200 http://mindhacks.cn/2008/07/08/learning-habits-part1/> 233 | {'created': u'2008-07-08T21:13:00+00:00', 234 | 'site': 'mindhacks', 235 | 'title': u'\u4e00\u76f4\u4ee5\u6765\u4f34\u968f\u6211\u7684\u4e00\u4e9b\u5b66\u4e60\u4e60\u60ef(\u4e00)\uff1a\u5b66\u4e60\u4e0e\u601d\u8003', 236 | 'url': u'http://mindhacks.cn/2008/07/08/learning-habits-part1/'} 237 | 2013-03-08 17:46:21+0800 [mindhacks] DEBUG: Crawled (200) (referer: http://mindhacks.cn/page/3/) 238 | 2013-03-08 17:46:21+0800 [mindhacks] DEBUG: Scraped from <200 http://mindhacks.cn/2008/07/20/learning-habits-part2/> 239 | {'created': u'2008-07-20T21:16:00+00:00', 240 | 'site': 'mindhacks', 241 | 'title': u'\u4e00\u76f4\u4ee5\u6765\u4f34\u968f\u6211\u7684\u4e00\u4e9b\u5b66\u4e60\u4e60\u60ef(\u4e8c)\uff1a\u65f6\u95f4\u7ba1\u7406', 242 | 'url': u'http://mindhacks.cn/2008/07/20/learning-habits-part2/'} 243 | 2013-03-08 17:46:21+0800 [mindhacks] DEBUG: Crawled (200) (referer: http://mindhacks.cn/page/3/) 244 | 2013-03-08 17:46:22+0800 [mindhacks] DEBUG: Scraped from <200 http://mindhacks.cn/2008/09/11/machine-learning-and-ai-resources/> 245 | {'created': u'2008-09-11T19:29:00+00:00', 246 | 'site': 'mindhacks', 247 | 'title': u'\u673a\u5668\u5b66\u4e60\u4e0e\u4eba\u5de5\u667a\u80fd\u5b66\u4e60\u8d44\u6e90\u5bfc\u5f15', 248 | 'url': u'http://mindhacks.cn/2008/09/11/machine-learning-and-ai-resources/'} 249 | 2013-03-08 17:46:22+0800 [mindhacks] DEBUG: Crawled (200) (referer: http://mindhacks.cn/page/3/) 250 | 2013-03-08 17:46:22+0800 [mindhacks] DEBUG: Scraped from <200 http://mindhacks.cn/2008/09/17/learning-habits-part3/> 251 | {'created': u'2008-09-17T21:18:00+00:00', 252 | 'site': 'mindhacks', 253 | 'title': u'\u4e00\u76f4\u4ee5\u6765\u4f34\u968f\u6211\u7684\u4e00\u4e9b\u5b66\u4e60\u4e60\u60ef(\u4e09)\uff1a\u9605\u8bfb\u65b9\u6cd5', 254 | 'url': u'http://mindhacks.cn/2008/09/17/learning-habits-part3/'} 255 | 2013-03-08 17:46:22+0800 [mindhacks] DEBUG: Crawled (200) (referer: http://mindhacks.cn/page/3/) 256 | 2013-03-08 17:46:22+0800 [mindhacks] DEBUG: Scraped from <200 http://mindhacks.cn/2008/09/21/the-magical-bayesian-method/> 257 | {'created': u'2008-09-21T19:34:00+00:00', 258 | 'site': 'mindhacks', 259 | 'title': u'\u6570\u5b66\u4e4b\u7f8e\u756a\u5916\u7bc7\uff1a\u5e73\u51e1\u800c\u53c8\u795e\u5947\u7684\u8d1d\u53f6\u65af\u65b9\u6cd5', 260 | 'url': u'http://mindhacks.cn/2008/09/21/the-magical-bayesian-method/'} 261 | 2013-03-08 17:46:22+0800 [mindhacks] DEBUG: Crawled (200) (referer: http://mindhacks.cn/page/3/) 262 | 2013-03-08 
17:46:22+0800 [mindhacks] DEBUG: Scraped from <200 http://mindhacks.cn/2008/10/29/methodology-for-programmers/> 263 | {'created': u'2008-10-29T21:09:00+00:00', 264 | 'site': 'mindhacks', 265 | 'title': u'\u65b9\u6cd5\u8bba\u3001\u65b9\u6cd5\u8bba\u2014\u2014\u7a0b\u5e8f\u5458\u7684\u963f\u5580\u7409\u65af\u4e4b\u8e35', 266 | 'url': u'http://mindhacks.cn/2008/10/29/methodology-for-programmers/'} 267 | 2013-03-08 17:46:23+0800 [mindhacks] DEBUG: Crawled (200) (referer: http://mindhacks.cn/page/3/) 268 | 2013-03-08 17:46:23+0800 [mindhacks] DEBUG: Scraped from <200 http://mindhacks.cn/2008/12/05/learning-habits-part4/> 269 | {'created': u'2008-12-05T21:21:00+00:00', 270 | 'site': 'mindhacks', 271 | 'title': u'\u4e00\u76f4\u4ee5\u6765\u4f34\u968f\u6211\u7684\u4e00\u4e9b\u5b66\u4e60\u4e60\u60ef(\u56db)\uff1a\u77e5\u8bc6\u7ed3\u6784', 272 | 'url': u'http://mindhacks.cn/2008/12/05/learning-habits-part4/'} 273 | 2013-03-08 17:46:23+0800 [mindhacks] DEBUG: Crawled (200) (referer: http://mindhacks.cn/page/3/) 274 | 2013-03-08 17:46:23+0800 [mindhacks] DEBUG: Scraped from <200 http://mindhacks.cn/2008/12/18/how-to-think-straight/> 275 | {'created': u'2008-12-18T20:10:00+00:00', 276 | 'site': 'mindhacks', 277 | 'title': u'\u5982\u4f55\u6e05\u6670\u5730\u601d\u8003\uff08\u8fd1\u4e00\u5e74\u6765\u4e1a\u4f59\u9605\u8bfb\u7684\u5173\u4e8e\u601d\u7ef4\u65b9\u9762\u7684\u77e5\u8bc6\u7ed3\u6784\u6574\u7406\uff09', 278 | 'url': u'http://mindhacks.cn/2008/12/18/how-to-think-straight/'} 279 | 2013-03-08 17:46:23+0800 [mindhacks] DEBUG: Crawled (200) (referer: http://mindhacks.cn/page/3/) 280 | 2013-03-08 17:46:23+0800 [mindhacks] DEBUG: Scraped from <200 http://mindhacks.cn/2009/01/14/make-yourself-irreplacable/> 281 | {'created': u'2009-01-14T21:28:00+00:00', 282 | 'site': 'mindhacks', 283 | 'title': u'\u4ec0\u4e48\u624d\u662f\u4f60\u7684\u4e0d\u53ef\u66ff\u4ee3\u6027\u548c\u6838\u5fc3\u7ade\u4e89\u529b', 284 | 'url': u'http://mindhacks.cn/2009/01/14/make-yourself-irreplacable/'} 285 | 2013-03-08 17:46:24+0800 [mindhacks] DEBUG: Crawled (200) (referer: http://mindhacks.cn/page/4/) 286 | 2013-03-08 17:46:24+0800 [mindhacks] DEBUG: Scraped from <200 http://mindhacks.cn/2006/10/15/cantor-godel-turing-an-eternal-golden-diagonal/> 287 | {'created': u'2006-10-15T19:16:00+00:00', 288 | 'site': 'mindhacks', 289 | 'title': u'\u5eb7\u6258\u5c14\u3001\u54e5\u5fb7\u5c14\u3001\u56fe\u7075\u2014\u2014\u6c38\u6052\u7684\u91d1\u8272\u5bf9\u89d2\u7ebf(rev#2)', 290 | 'url': u'http://mindhacks.cn/2006/10/15/cantor-godel-turing-an-eternal-golden-diagonal/'} 291 | 2013-03-08 17:46:24+0800 [mindhacks] DEBUG: Crawled (200) (referer: http://mindhacks.cn/page/4/) 292 | 2013-03-08 17:46:24+0800 [mindhacks] DEBUG: Scraped from <200 http://mindhacks.cn/2007/05/24/learn-to-focus/> 293 | {'created': u'2007-05-24T20:30:00+00:00', 294 | 'site': 'mindhacks', 295 | 'title': u'\u5b66\u4e60\u5bc6\u5ea6\u4e0e\u4e13\u6ce8\u529b', 296 | 'url': u'http://mindhacks.cn/2007/05/24/learn-to-focus/'} 297 | 2013-03-08 17:46:24+0800 [mindhacks] DEBUG: Crawled (200) (referer: http://mindhacks.cn/page/4/) 298 | 2013-03-08 17:46:24+0800 [mindhacks] DEBUG: Scraped from <200 http://mindhacks.cn/2007/12/02/probability-theory-in-evolution/> 299 | {'created': u'2007-12-02T18:55:00+00:00', 300 | 'site': 'mindhacks', 301 | 'title': u'\u6570\u5b66\u4e4b\u7f8e\u756a\u5916\u7bc7\uff1a\u8fdb\u5316\u8bba\u4e2d\u7684\u6982\u7387\u8bba', 302 | 'url': u'http://mindhacks.cn/2007/12/02/probability-theory-in-evolution/'} 303 | 2013-03-08 17:46:24+0800 [mindhacks] DEBUG: Crawled 
(200) (referer: http://mindhacks.cn/page/4/) 304 | 2013-03-08 17:46:25+0800 [mindhacks] DEBUG: Scraped from <200 http://mindhacks.cn/2008/03/03/failing-to-see-the-big-picture/> 305 | {'created': u'2008-03-03T15:42:00+00:00', 306 | 'site': 'mindhacks', 307 | 'title': u'Failing To See the Big Picture \u2013 Mistakes we make when learning programming', 308 | 'url': u'http://mindhacks.cn/2008/03/03/failing-to-see-the-big-picture/'} 309 | 2013-03-08 17:46:25+0800 [mindhacks] DEBUG: Crawled (200) (referer: http://mindhacks.cn/page/4/) 310 | 2013-03-08 17:46:25+0800 [mindhacks] DEBUG: Scraped from <200 http://mindhacks.cn/2008/04/08/reading-method/> 311 | {'created': u'2008-04-08T21:00:00+00:00', 312 | 'site': 'mindhacks', 313 | 'title': u'\u9605\u8bfb\u4e0e\u601d\u8003', 314 | 'url': u'http://mindhacks.cn/2008/04/08/reading-method/'} 315 | 2013-03-08 17:46:25+0800 [mindhacks] DEBUG: Crawled (200) (referer: http://mindhacks.cn/page/4/) 316 | 2013-03-08 17:46:25+0800 [mindhacks] DEBUG: Scraped from <200 http://mindhacks.cn/2008/04/18/learning-from-polya/> 317 | {'created': u'2008-04-18T21:37:00+00:00', 318 | 'site': 'mindhacks', 319 | 'title': u'\u8ddf\u6ce2\u5229\u4e9a\u5b66\u89e3\u9898(rev#3)', 320 | 'url': u'http://mindhacks.cn/2008/04/18/learning-from-polya/'} 321 | 2013-03-08 17:46:25+0800 [mindhacks] DEBUG: Crawled (200) (referer: http://mindhacks.cn/page/4/) 322 | 2013-03-08 17:46:25+0800 [mindhacks] DEBUG: Scraped from <200 http://mindhacks.cn/2008/06/05/how-memory-works/> 323 | {'created': u'2008-06-05T21:40:00+00:00', 324 | 'site': 'mindhacks', 325 | 'title': u'\u5b66\u4e60\u4e0e\u8bb0\u5fc6', 326 | 'url': u'http://mindhacks.cn/2008/06/05/how-memory-works/'} 327 | 2013-03-08 17:46:26+0800 [mindhacks] DEBUG: Crawled (200) (referer: http://mindhacks.cn/page/4/) 328 | 2013-03-08 17:46:26+0800 [mindhacks] DEBUG: Scraped from <200 http://mindhacks.cn/2008/06/13/why-is-quicksort-so-quick/> 329 | {'created': u'2008-06-13T19:53:00+00:00', 330 | 'site': 'mindhacks', 331 | 'title': u'\u6570\u5b66\u4e4b\u7f8e\u756a\u5916\u7bc7\uff1a\u5feb\u6392\u4e3a\u4ec0\u4e48\u90a3\u6837\u5feb', 332 | 'url': u'http://mindhacks.cn/2008/06/13/why-is-quicksort-so-quick/'} 333 | 2013-03-08 17:46:26+0800 [mindhacks] INFO: Closing spider (finished) 334 | 2013-03-08 17:46:26+0800 [mindhacks] INFO: Stored jsonlines feed (53 items) in: /root/newsspider/.scrapy/scrapyd/items/default/mindhacks/ff9b53c687d411e2adba7e97b6ad9650.jl 335 | 2013-03-08 17:46:26+0800 [mindhacks] INFO: Dumping Scrapy stats: 336 | {'downloader/request_bytes': 17825, 337 | 'downloader/request_count': 58, 338 | 'downloader/request_method_count/GET': 58, 339 | 'downloader/response_bytes': 1501575, 340 | 'downloader/response_count': 58, 341 | 'downloader/response_status_count/200': 57, 342 | 'downloader/response_status_count/301': 1, 343 | 'finish_reason': 'finished', 344 | 'finish_time': datetime.datetime(2013, 3, 8, 9, 46, 26, 149294), 345 | 'item_scraped_count': 53, 346 | 'log_count/DEBUG': 117, 347 | 'log_count/INFO': 5, 348 | 'request_depth_max': 4, 349 | 'response_received_count': 57, 350 | 'scheduler/dequeued': 58, 351 | 'scheduler/dequeued/memory': 58, 352 | 'scheduler/enqueued': 58, 353 | 'scheduler/enqueued/memory': 58, 354 | 'start_time': datetime.datetime(2013, 3, 8, 9, 46, 8, 299627)} 355 | 2013-03-08 17:46:26+0800 [mindhacks] INFO: Spider closed (finished) 356 | -------------------------------------------------------------------------------- /newsspider/.scrapy/scrapyd/logs/default/somespider/28e7b04a8a0e11e2adba7e97b6ad9650.log: 
--------------------------------------------------------------------------------
2013-03-11 13:40:22+0800 [scrapy] INFO: Scrapy 0.17.0 started (bot: Baiduspider)
2013-03-11 13:40:23+0800 [scrapy] DEBUG: Enabled extensions: FeedExporter, LogStats, TelnetConsole, CloseSpider, WebService, CoreStats, SpiderState
2013-03-11 13:40:23+0800 [scrapy] DEBUG: Enabled downloader middlewares: HttpAuthMiddleware, DownloadTimeoutMiddleware, UserAgentMiddleware, RetryMiddleware, DefaultHeadersMiddleware, MetaRefreshMiddleware, HttpCompressionMiddleware, RedirectMiddleware, CookiesMiddleware, ChunkedTransferMiddleware, DownloaderStats
2013-03-11 13:40:23+0800 [scrapy] DEBUG: Enabled spider middlewares: HttpErrorMiddleware, OffsiteMiddleware, RefererMiddleware, UrlLengthMiddleware, DepthMiddleware
2013-03-11 13:40:23+0800 [scrapy] DEBUG: Enabled item pipelines: NewsspiderPipeline

--------------------------------------------------------------------------------
/newsspider/README.md:
--------------------------------------------------------------------------------
Tech news spider

TODOs

* Support more tech sites
* Extract abstracts for links
* Improve the front end for viewing crawled results

--------------------------------------------------------------------------------
/newsspider/dbs/default.db:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/feiskyer/scrapy-examples/3f72fe87d9272859f7bce31d0394ca078a6ab4a3/newsspider/dbs/default.db

--------------------------------------------------------------------------------
/newsspider/newsspider/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/feiskyer/scrapy-examples/3f72fe87d9272859f7bce31d0394ca078a6ab4a3/newsspider/newsspider/__init__.py

--------------------------------------------------------------------------------
/newsspider/newsspider/commands/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/feiskyer/scrapy-examples/3f72fe87d9272859f7bce31d0394ca078a6ab4a3/newsspider/newsspider/commands/__init__.py

--------------------------------------------------------------------------------
/newsspider/newsspider/commands/allcrawl.py:
--------------------------------------------------------------------------------
from scrapy.command import ScrapyCommand
import urllib
import urllib2
from scrapy import log

class AllCrawlCommand(ScrapyCommand):
    requires_project = True
    default_settings = {'LOG_ENABLED': False}

    def short_desc(self):
        return "Schedule a run for all available spiders (run scrapy server first)"

    def run(self, args, opts):
        # POST every known spider to the local scrapyd schedule endpoint
        url = 'http://localhost:6800/schedule.json'
        for s in self.crawler.spiders.list():
            values = {'project': 'default', 'spider': s}
            data = urllib.urlencode(values)
            req = urllib2.Request(url, data)
            response = urllib2.urlopen(req)
            log.msg(response)

--------------------------------------------------------------------------------
/newsspider/newsspider/items.py:
--------------------------------------------------------------------------------
# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html

from scrapy.item import Item, Field

class NewsspiderItem(Item):
    # define the fields for your item here
    title = Field()
    url = Field()
    site = Field()
    abstract = Field()
    created = Field()

--------------------------------------------------------------------------------
/newsspider/newsspider/middlewares.py:
--------------------------------------------------------------------------------
# base64 is needed only when the proxy requires authentication
import base64
import random

class ProxyMiddleware(object):
    # override process_request
    def process_request(self, request, spider):
        # pick a random proxy (one "host port" pair per line) from proxies.txt
        data = file('proxies.txt', 'r').readlines()
        length = len(data)
        index = random.randint(0, length - 1)
        item = data[index]
        arr = item.split()
        request.meta['proxy'] = "http://%s:%s" % (arr[0], arr[1])

        # Use the following lines if your proxy requires authentication:
        # proxy_user_pass = "USERNAME:PASSWORD"
        # setup basic authentication for the proxy
        # encoded_user_pass = base64.encodestring(proxy_user_pass)
        # request.headers['Proxy-Authorization'] = 'Basic ' + encoded_user_pass
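ProxyMiddleware splits each line of proxies.txt on whitespace and uses the
first two fields as host and port, so the file is expected to look roughly
like this (placeholder addresses; the repository's actual proxies.txt is not
shown here):

    10.0.0.1 8080
    10.0.0.2 3128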
for your item here like: 10 | title = Field() 11 | url = Field() 12 | site = Field() 13 | abstract = Field() 14 | created = Field() 15 | -------------------------------------------------------------------------------- /newsspider/newsspider/middlewares.py: -------------------------------------------------------------------------------- 1 | # base64 is imported because we'll need it only if the proxy 2 | # we are going to use requires authentication 3 | import base64 4 | import random 5 | 6 | class ProxyMiddleware(object): 7 | # override process_request 8 | def process_request(self, request, spider): 9 | data = open('proxies.txt', 'r').readlines() 10 | length = len(data) 11 | index = random.randint(0, length - 1) 12 | item = data[index] 13 | arr = item.split() 14 | request.meta['proxy'] = "http://%s:%s" % (arr[0], arr[1]) 15 | 16 | # Use the following lines if your proxy requires authentication 17 | # proxy_user_pass = "USERNAME:PASSWORD" 18 | # setup basic authentication for the proxy 19 | # encoded_user_pass = base64.encodestring(proxy_user_pass) 20 | # request.headers['Proxy-Authorization'] = 'Basic ' + encoded_user_pass 21 | 22 | -------------------------------------------------------------------------------- /newsspider/newsspider/pipelines.py: -------------------------------------------------------------------------------- 1 | # Define your item pipelines here 2 | # 3 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 4 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html 5 | import sys 6 | import MySQLdb 7 | import hashlib 8 | from scrapy.exceptions import DropItem 9 | from scrapy import log 10 | import time 11 | import os 12 | import uuid 13 | 14 | class NewsspiderPipeline(object): 15 | 16 | def __init__(self): 17 | self.conn = MySQLdb.connect(user='root', 18 | passwd='feisky', db='news', host='localhost', 19 | charset="utf8", 20 | use_unicode=True) 21 | self.cursor = self.conn.cursor() 22 | self.downloadPreview=False 23 | self.pngPath='/var/scrapy' 24 | self.filename='' 25 | 26 | def process_item(self, item, spider): 27 | settings = spider.settings 28 | if settings['DOWNLOAD_PREVIEW']: 29 | self.downloadPreview=True 30 | if settings['PNG_PATH']: 31 | self.pngPath=settings['PNG_PATH'] 32 | 33 | url = item.get('url', '') 34 | if len(url)==0: 35 | return item 36 | 37 | try: 38 | if self.downloadPreview: 39 | self.filename = "%s/%s.png" % (self.pngPath, str(uuid.uuid1())) 40 | cmd = os.popen(u'''/usr/bin/webkit2png -x 1366 768 -F javascript "%s" -o "%s"''' % 41 | (url, self.filename)) 42 | result = cmd.read() 43 | if 'Failed' in result: 44 | os.unlink(self.filename) 45 | cmd.close() 46 | 47 | if self.cursor.execute('select * from news where url=%s', 48 | (url,)) == 0: 49 | self.cursor.execute( 50 | """INSERT INTO news(title,url,site,abstract, created, file) 51 | VALUES (%s, %s, %s, %s, %s, %s)""", 52 | ( item.get('title','').encode('utf-8'), 53 | url, 54 | item.get('site', ''), 55 | item.get('abstract', '').encode('utf-8'), 56 | item.get('created', time.strftime('%Y-%m-%d %H:%M:%S')), 57 | self.filename.split('/')[-1]) ) 58 | self.conn.commit() 59 | else: 60 | log.msg('%s already exists' % url, level=log.WARNING) 61 | except MySQLdb.Error, e: 62 | log.msg("Error %d: %s" % (e.args[0], e.args[1]), level=log.ERROR) 63 | return item 64 | 65 | def finalize(self): 66 | if self.conn is not None: 67 | self.conn.commit() 68 | self.conn.close() 69 | self.conn=None 70 | 71 |
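Note: NewsspiderPipeline assumes a 'news' table already exists in the 'news' database; the repository ships no schema for it. A minimal one-off sketch for creating a compatible table follows -- the column names come from the INSERT statement above, but the column types are assumptions, not taken from the repo:

    import MySQLdb

    def create_news_table():
        # Credentials match the ones hard-coded in NewsspiderPipeline.__init__.
        conn = MySQLdb.connect(user='root', passwd='feisky', db='news',
                               host='localhost', charset='utf8')
        # Column types below are guesses sized for typical feed data.
        conn.cursor().execute("""
            CREATE TABLE IF NOT EXISTS news (
                title    VARCHAR(255),
                url      VARCHAR(512),
                site     VARCHAR(64),
                abstract TEXT,
                created  VARCHAR(64),
                file     VARCHAR(255)
            ) DEFAULT CHARSET=utf8""")
        conn.commit()
        conn.close()

    if __name__ == '__main__':
        create_news_table()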
-------------------------------------------------------------------------------- /newsspider/newsspider/settings.py: -------------------------------------------------------------------------------- 1 | # Scrapy settings for newsspider project 2 | # 3 | # For simplicity, this file contains only the most important settings by 4 | # default. All the other settings are documented here: 5 | # 6 | # http://doc.scrapy.org/en/latest/topics/settings.html 7 | # 8 | 9 | BOT_NAME = 'Newsspider' 10 | USER_AGENT = 'Newsspider+(+http://www.www.com/)' 11 | # Baiduspider+(+http://www.baidu.com/search/spider.htm") 12 | # Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html) 13 | # Googlebot/2.1 (+http://www.googlebot.com/bot.html) 14 | # Googlebot/2.1 (+http://www.google.com/bot.html) 15 | # Mozilla/5.0 (compatible; Yahoo! Slurp China; http://misc.yahoo.com.cn/help.html") 16 | # Mozilla/5.0 (compatible; Yahoo! Slurp; http://help.yahoo.com/help/us/ysearch/slurp") 17 | # iaskspider/2.0(+http://iask.com/help/help_index.html") 18 | # Mozilla/5.0 (compatible; iaskspider/1.0; MSIE 6.0) 19 | # Sogou web spider/3.0(+http://www.sogou.com/docs/help/webmasters.htm#07") 20 | # Sogou Push Spider/3.0(+http://www.sogou.com/docs/help/webmasters.htm#07") 21 | # Mozilla/5.0 (compatible; YodaoBot/1.0; http://www.yodao.com/help/webmaster/spider/"; ) 22 | # msnbot/1.0 (+http://search.msn.com/msnbot.htm") 23 | 24 | SPIDER_MODULES = ['newsspider.spiders'] 25 | NEWSPIDER_MODULE = 'newsspider.spiders' 26 | COMMANDS_MODULE = 'newsspider.commands' 27 | DUPEFILTER_CLASS = 'scrapy.dupefilter.RFPDupeFilter' 28 | ITEM_PIPELINES = ['newsspider.pipelines.NewsspiderPipeline'] 29 | SCHEDULER = 'scrapy.core.scheduler.Scheduler' 30 | 31 | DOWNLOADER_MIDDLEWARES = { 32 | # 'newsspider.middlewares.ProxyMiddleware': 100, 33 | 'scrapy.contrib.downloadermiddleware.robotstxt.RobotsTxtMiddleware': 100, 34 | # 'scrapy.contrib.downloadermiddleware.httpproxy.HttpProxyMiddleware': 110, # duplicate key; the entry at 750 (its default position) below is the one that takes effect 35 | 'scrapy.contrib.downloadermiddleware.httpauth.HttpAuthMiddleware': 300, 36 | 'scrapy.contrib.downloadermiddleware.downloadtimeout.DownloadTimeoutMiddleware': 350, 37 | 'scrapy.contrib.downloadermiddleware.useragent.UserAgentMiddleware': 400, 38 | 'scrapy.contrib.downloadermiddleware.retry.RetryMiddleware': 500, 39 | 'scrapy.contrib.downloadermiddleware.defaultheaders.DefaultHeadersMiddleware': 550, 40 | 'scrapy.contrib.downloadermiddleware.redirect.RedirectMiddleware': 600, 41 | 'scrapy.contrib.downloadermiddleware.cookies.CookiesMiddleware': 700, 42 | 'scrapy.contrib.downloadermiddleware.httpproxy.HttpProxyMiddleware': 750, 43 | 'scrapy.contrib.downloadermiddleware.httpcompression.HttpCompressionMiddleware': 800, 44 | 'scrapy.contrib.downloadermiddleware.chunked.ChunkedTransferMiddleware': 830, 45 | 'scrapy.contrib.downloadermiddleware.stats.DownloaderStats': 850, 46 | 'scrapy.contrib.downloadermiddleware.httpcache.HttpCacheMiddleware': 900, 47 | } 48 | 49 | CONCURRENT_ITEMS = 100 50 | CONCURRENT_REQUESTS = 16 51 | CONCURRENT_REQUESTS_PER_DOMAIN = 8 52 | # CONCURRENT_REQUESTS_PER_IP = 8 53 | ROBOTSTXT_OBEY = False 54 | 55 | DEPTH_LIMIT = 6 56 | DOWNLOAD_DELAY = 0.25 57 | RANDOMIZE_DOWNLOAD_DELAY = True 58 | DOWNLOAD_TIMEOUT = 30 59 | DNSCACHE_ENABLED = True 60 | 61 | 62 | #LOG_FILE = '' 63 | LOG_LEVEL = 'DEBUG' 64 | 65 | PNG_PATH = '/var/scrapy' 66 | DOWNLOAD_PREVIEW = False 67 | -------------------------------------------------------------------------------- /newsspider/newsspider/spiders/__init__.py:
-------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /newsspider/newsspider/spiders/cnblogs.py: -------------------------------------------------------------------------------- 1 | from scrapy.contrib.spiders import XMLFeedSpider 2 | from newsspider.items import NewsspiderItem 3 | from scrapy import log 4 | 5 | class FeedSpider(XMLFeedSpider): 6 | name = 'cnblogs' 7 | allowed_domains = ['cnblogs.com'] 8 | start_urls = ['http://feed.cnblogs.com/blog/u/53116/rss'] 9 | iterator = 'iternodes' # you can change this; see the docs 10 | itertag = 'item' # change it accordingly 11 | #namespaces = [ ('content', 'http://purl.org/rss/1.0/modules/content'), 12 | # ('dc', 'http://purl.org/dc/elements/1.1/') ] 13 | 14 | def parse_node(self, response, selector): 15 | #for prefix, uri in self.namespaces: 16 | # selector.register_namespace (prefix, uri) 17 | 18 | item = NewsspiderItem() 19 | item['url'] = selector.select('id/text()').extract()[0] 20 | item['title'] = selector.select('title/text()').extract()[0] 21 | item['created'] = selector.select('published/text()').extract()[0] 22 | item['abstract'] = selector.select('summary/text()').extract()[0] 23 | item['site'] = FeedSpider.name 24 | return item 25 | -------------------------------------------------------------------------------- /newsspider/newsspider/spiders/dbanotes.py: -------------------------------------------------------------------------------- 1 | #from scrapy.selector import HtmlXPathSelector 2 | #from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor 3 | from scrapy.contrib.spiders import CrawlSpider, Rule 4 | from newsspider.items import NewsspiderItem 5 | from BeautifulSoup import BeautifulSoup 6 | import time 7 | 8 | class DbanotesSpider(CrawlSpider): 9 | name = 'dbanotes' 10 | allowed_domains = ['news.dbanotes.net'] 11 | start_urls = ['http://news.dbanotes.net/'] 12 | 13 | #rules = ( 14 | # Rule(SgmlLinkExtractor(allow=r'Items/'), callback='parse_item', follow=True), 15 | #) 16 | 17 | def parse(self, response): 18 | #hxs = HtmlXPathSelector(response) 19 | #i = NewsspiderItem() 20 | #i['domain_id'] = hxs.select('//input[@id="sid"]/@value').extract() 21 | #i['name'] = hxs.select('//div[@id="name"]').extract() 22 | #i['description'] = hxs.select('//div[@id="description"]').extract() 23 | soup = BeautifulSoup(response.body) 24 | links = soup.findAll('td', {'class':'title'}) 25 | for link in links: 26 | linkinfo = link.findChild() 27 | if not linkinfo: continue 28 | title = linkinfo.text 29 | url = linkinfo.get('href', '') 30 | if not url.startswith('http'): 31 | if url.startswith('/'): 32 | url = 'http://news.dbanotes.net' + url 33 | else: 34 | url = 'http://news.dbanotes.net/' + url 35 | 36 | # deal with next page 37 | if title == 'More' or title == 'next': # next page 38 | yield self.make_requests_from_url(url).replace(callback=self.parse) 39 | # get a new news item 40 | elif len(title)>0 and len(url)>0: 41 | item = NewsspiderItem() 42 | item['title'] = title 43 | item['url'] = url 44 | item['site'] = 'dbanotes' 45 | yield item 46 | 47 | -------------------------------------------------------------------------------- /newsspider/newsspider/spiders/hackernews.py: -------------------------------------------------------------------------------- 1 | #from
scrapy.selector import HtmlXPathSelector 2 | #from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor 3 | from scrapy.contrib.spiders import CrawlSpider, Rule 4 | from newsspider.items import NewsspiderItem 5 | from BeautifulSoup import BeautifulSoup 6 | import time 7 | 8 | class HackernewsSpider(CrawlSpider): 9 | name = 'hackernews' 10 | allowed_domains = ['news.ycombinator.com'] 11 | start_urls = ['http://news.ycombinator.com/'] 12 | 13 | #rules = ( 14 | # Rule(SgmlLinkExtractor(allow=r'Items/'), callback='parse_item', follow=True), 15 | #) 16 | 17 | def parse(self, response): 18 | #hxs = HtmlXPathSelector(response) 19 | #i = NewsspiderItem() 20 | #i['domain_id'] = hxs.select('//input[@id="sid"]/@value').extract() 21 | #i['name'] = hxs.select('//div[@id="name"]').extract() 22 | #i['description'] = hxs.select('//div[@id="description"]').extract() 23 | soup=BeautifulSoup(response.body) 24 | links=soup.findAll('td', {'class':'title'}) 25 | for link in links: 26 | linkinfo = link.findChild() 27 | if not linkinfo: continue 28 | title = linkinfo.text 29 | url = linkinfo.get('href', '') 30 | if not url.startswith('http'): 31 | if url.startswith('/'): 32 | url = 'http://news.ycombinator.com' + url 33 | else: 34 | url = 'http://news.ycombinator.com/' + url 35 | 36 | # deal with next page 37 | if title == 'More' or title == 'next': # next page 38 | yield self.make_requests_from_url(url).replace(callback=self.parse) 39 | # get a new news item 40 | elif len(title)>0 and len(url)>0: 41 | item = NewsspiderItem() 42 | item['title']=title 43 | item['url']=url 44 | item['site']='hackernews' 45 | yield item 46 | 47 | -------------------------------------------------------------------------------- /newsspider/newsspider/spiders/jobbole.py: -------------------------------------------------------------------------------- 1 | from scrapy.contrib.spiders import XMLFeedSpider 2 | from newsspider.items import NewsspiderItem 3 | from scrapy import log 4 | 5 | class FeedSpider(XMLFeedSpider): 6 | name = 'jobbole' 7 | allowed_domains = ['blog.jobbole.com'] 8 | start_urls = ['http://blog.jobbole.com/feed/'] 9 | iterator = 'iternodes' # you can change this; see the docs 10 | itertag = 'item' # change it accordingly 11 | namespaces = [ ('content', 'http://purl.org/rss/1.0/modules/content'), 12 | ('dc', 'http://purl.org/dc/elements/1.1/') ] 13 | 14 | def parse_node(self, response, selector): 15 | #for prefix, uri in self.namespaces: 16 | # selector.register_namespace (prefix, uri) 17 | selector.remove_namespaces() 18 | item = NewsspiderItem() 19 | item['url'] = selector.select('link/text()').extract()[0] 20 | item['title'] = selector.select('title/text()').extract()[0] 21 | item['created'] = selector.select('pubDate/text()').extract()[0] 22 | item['abstract'] = selector.select('description/text()').extract()[0] 23 | item['site']= FeedSpider.name 24 | return item 25 | -------------------------------------------------------------------------------- /newsspider/newsspider/spiders/mindhacks.py: -------------------------------------------------------------------------------- 1 | from scrapy.selector import HtmlXPathSelector 2 | from scrapy.contrib.spiders import CrawlSpider, Rule 3 | from newsspider.items import NewsspiderItem 4 | 5 | class MindhacksSpider(CrawlSpider): 6 | name = 'mindhacks' 7 | allowed_domains = ['mindhacks.cn'] 8 | start_urls = ['http://www.mindhacks.cn/'] 9 | 10 | def parse(self, response): 11 | hxs = HtmlXPathSelector(response) 12 | sites = hxs.select('//h3/a/@href') 13 | 14 | for url in 
sites: 15 | yield self.make_requests_from_url(url.extract()).replace(callback=self.parse_post) 16 | 17 | # process next page 18 | page_links=hxs.select('//div[@class="wp-pagenavi"]/a[not(@title)]') 19 | for link in page_links: 20 | if link.select('text()').extract()[0] == u'\xbb': 21 | url = link.select('@href').extract()[0] 22 | yield self.make_requests_from_url(url) 23 | 24 | def parse_post(self, response): 25 | hxs = HtmlXPathSelector(response) 26 | title = hxs.select('//h1/a/text()').extract()[0] 27 | url = hxs.select('//h1/a/@href').extract()[0] 28 | created = hxs.select('//*[@class="published"]/@title').extract()[0] 29 | 30 | if len(title) >0 and len(url) > 0: 31 | item = NewsspiderItem() 32 | item['url'] = url 33 | item['title'] = title 34 | item['created'] = created 35 | item['site']= MindhacksSpider.name 36 | yield item 37 | -------------------------------------------------------------------------------- /newsspider/newsspider/spiders/reddit.py: -------------------------------------------------------------------------------- 1 | from scrapy.selector import HtmlXPathSelector 2 | #from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor 3 | from scrapy.contrib.spiders import CrawlSpider, Rule 4 | from newsspider.items import NewsspiderItem 5 | import time 6 | 7 | class RedditSpider(CrawlSpider): 8 | name = 'reddit' 9 | allowed_domains = ['reddit.com'] 10 | start_urls = ['http://www.reddit.com/r/programming/','http://www.reddit.com/'] 11 | 12 | def parse(self, response): 13 | hxs = HtmlXPathSelector(response) 14 | 15 | # deal with next page 16 | nextlink = hxs.select('//p[@class="nextprev"]//a').select('@href').extract()[0] 17 | if len(nextlink)>0: 18 | yield self.make_requests_from_url(nextlink) 19 | 20 | links = hxs.select('//*[@id="siteTable"]//div//p[1]/a') 21 | for link in links: 22 | url = link.select('@href').extract()[0] 23 | title = link.select('text()').extract()[0] 24 | if len(url)>0 and len(title)>0: 25 | item = NewsspiderItem() 26 | item['title']=title 27 | item['url']=url 28 | item['site']=RedditSpider.name 29 | yield item 30 | 31 | 32 | -------------------------------------------------------------------------------- /newsspider/proxies.txt: -------------------------------------------------------------------------------- 1 | 5.199.132.164 443 HTTP 2 | 64.208.21.16 80 HTTP 3 | 110.153.9.250 80 HTTP 4 | 110.139.206.93 8080 HTTP 5 | 72.247.48.10 80 HTTP 6 | 180.250.130.186 80 HTTP 7 | 114.80.149.183 80 HTTP 8 | 123.108.14.39 8080 HTTP 9 | 211.167.112.14 80 HTTP 10 | 89.218.100.178 9090 HTTP 11 | 202.149.78.234 8080 HTTP 12 | 101.255.33.250 80 HTTP 13 | 80.79.179.10 8181 HTTP 14 | 180.250.165.197 8080 HTTP 15 | 119.110.71.109 8080 HTTP 16 | 123.129.242.131 8081 HTTP 17 | 211.167.112.14 82 HTTP 18 | 89.218.100.218 9090 HTTP 19 | 202.201.1.119 8001 HTTP 20 | 80.90.12.36 8080 HTTP 21 | 186.47.84.139 8080 HTTP 22 | 125.141.206.36 8080 HTTP 23 | 211.239.84.130 443 HTTP 24 | 89.218.120.114 9090 HTTP 25 | 203.92.47.202 8082 HTTP 26 | 114.247.21.244 3131 HTTP 27 | 190.92.87.98 8080 HTTP 28 | 87.110.149.88 8080 HTTP 29 | 186.208.71.70 8080 HTTP 30 | 119.235.21.11 80 HTTP 31 | 148.236.5.91 8080 HTTP 32 | 218.28.112.114 809 HTTP 33 | 119.252.168.34 8080 HTTP 34 | 202.52.244.110 8080 HTTP 35 | 218.108.85.59 82 HTTP 36 | 14.31.11.70 9009 HTTP 37 | 186.225.212.245 8080 HTTP 38 | 120.85.132.234 80 HTTP 39 | 175.25.243.27 80 HTTP 40 | 218.61.8.124 88 HTTP 41 | 89.251.103.130 8080 HTTP 42 | 195.202.159.123 8080 HTTP 43 | 212.233.147.48 8080 HTTP 44 | 218.206.204.254 80 HTTP 45 | 
14.31.11.78 9009 HTTP 46 | 110.4.12.170 83 HTTP 47 | 186.226.98.254 8080 HTTP 48 | 124.227.191.68 9000 HTTP 49 | 190.29.22.247 8080 HTTP 50 | 218.89.165.131 6060 HTTP 51 | 91.218.84.195 80 HTTP 52 | 210.14.143.53 7020 HTTP 53 | 178.18.17.208 8080 HTTP 54 | 202.43.65.130 8080 HTTP 55 | 37.229.231.253 8080 HTTP 56 | 218.206.204.254 443 HTTP 57 | 58.210.247.18 1337 HTTP 58 | 112.213.118.48 80 HTTP 59 | 187.53.150.62 8080 HTTP 60 | 125.210.188.35 80 HTTP 61 | 195.69.191.203 80 HTTP 62 | 218.102.39.154 8080 HTTP 63 | 91.228.53.28 8080 HTTP 64 | 210.14.143.122 80 HTTP 65 | 202.43.188.5 8080 HTTP 66 | 202.46.146.22 8080 HTTP 67 | 92.126.217.47 80 HTTP 68 | 218.249.83.87 8080 HTTP 69 | 58.211.114.107 443 HTTP 70 | 113.160.50.51 80 HTTP 71 | 178.18.17.250 8080 HTTP 72 | 219.153.5.3 8181 HTTP 73 | 93.186.97.236 8080 HTTP 74 | 210.177.139.89 8080 HTTP 75 | 222.168.65.130 80 HTTP 76 | 103.28.227.78 8080 HTTP 77 | 219.150.254.158 8080 HTTP 78 | 58.221.129.158 1337 HTTP 79 | 119.82.253.88 8080 HTTP 80 | 187.85.89.167 8080 HTTP 81 | 182.93.206.92 8080 HTTP 82 | 202.106.179.141 10160 HTTP 83 | 219.223.252.150 56142 HTTP 84 | 95.77.97.146 8080 HTTP 85 | 211.99.28.21 808 HTTP 86 | 211.142.236.137 8080 HTTP 87 | 41.75.201.146 8080 HTTP 88 | 109.207.63.89 8090 HTTP 89 | 219.159.105.180 8080 HTTP 90 | 59.34.57.88 8080 HTTP 91 | 187.110.169.186 8080 HTTP 92 | 190.40.80.144 8080 HTTP 93 | 210.212.98.228 80 HTTP 94 | 220.246.4.74 8080 HTTP 95 | 211.142.236.133 8080 HTTP 96 | 218.22.71.122 8080 HTTP 97 | 58.210.212.107 80 HTTP 98 | 112.25.15.18 9098 HTTP 99 | 59.37.168.16 8081 HTTP 100 | 185.8.2.50 8080 HTTP 101 | 190.102.17.121 80 HTTP 102 | 218.106.99.22 888 HTTP 103 | 221.3.153.74 80 HTTP 104 | 106.3.98.79 80 HTTP 105 | 211.142.236.137 80 HTTP 106 | 2.133.93.170 9090 HTTP 107 | 78.9.164.162 8080 HTTP 108 | 119.6.73.235 80 HTTP 109 | 221.130.17.48 80 HTTP 110 | 59.57.15.71 80 HTTP 111 | 186.5.65.164 8080 HTTP 112 | 190.0.17.202 8080 HTTP 113 | 197.254.11.30 8080 HTTP 114 | 5.8.242.12 8080 HTTP 115 | 221.130.18.218 80 HTTP 116 | 110.74.220.50 8080 HTTP 117 | 211.144.72.153 80 HTTP 118 | 2.135.237.154 9090 HTTP 119 | 82.200.253.202 9090 HTTP 120 | 119.110.69.70 80 HTTP 121 | 221.130.17.139 80 HTTP 122 | 59.172.208.189 8080 HTTP 123 | 190.14.255.169 8080 HTTP 124 | 190.0.46.66 8080 HTTP 125 | 200.93.115.248 8080 HTTP 126 | 5.10.224.62 80 HTTP 127 | 221.130.18.253 80 HTTP 128 | 110.93.211.11 80 HTTP 129 | 82.209.195.5 8080 HTTP 130 | 125.39.68.195 80 HTTP 131 | 221.130.23.4 80 HTTP 132 | 59.172.208.190 8080 HTTP 133 | 198.15.119.111 8080 HTTP 134 | 190.0.61.194 8080 HTTP 135 | 200.107.32.127 8080 HTTP 136 | 5.10.224.62 8080 HTTP 137 | 221.130.23.4 80 HTTP 138 | 110.93.211.11 8080 HTTP 139 | 218.206.204.254 80 HTTP 140 | 2.135.238.26 9090 HTTP 141 | 180.250.192.222 8080 HTTP 142 | 221.130.23.5 80 HTTP 143 | 200.137.133.171 80 HTTP 144 | 31.170.178.2 8080 HTTP 145 | 221.130.23.6 80 HTTP 146 | 112.5.254.30 80 HTTP 147 | 218.206.204.254 443 HTTP 148 | 2.135.238.108 9090 HTTP 149 | 88.249.127.222 8080 HTTP 150 | 186.46.122.250 8080 HTTP 151 | 221.130.23.6 80 HTTP 152 | 61.166.55.153 11808 HTTP 153 | 201.218.63.4 8080 HTTP 154 | 190.211.97.71 8080 HTTP 155 | 200.213.4.4 8080 HTTP 156 | 41.78.26.45 8080 HTTP 157 | 221.130.23.29 80 HTTP 158 | 112.175.248.22 8080 HTTP 159 | 2.135.242.42 9090 HTTP 160 | 110.138.160.170 8080 HTTP 161 | 187.85.225.185 80 HTTP 162 | 221.130.23.8 80 HTTP 163 | 106.3.98.82 80 HTTP 164 | 201.63.184.5 8080 HTTP 165 | 46.249.66.50 8080 HTTP 166 | 222.169.11.34 8080 HTTP 167 | 114.113.221.72 54321 
HTTP 168 | 221.10.40.232 80 HTTP 169 | 2.135.243.42 9090 HTTP 170 | 110.138.163.58 8080 HTTP 171 | 221.130.23.78 80 HTTP 172 | 106.3.98.82 82 HTTP 173 | 202.171.253.98 80 HTTP 174 | 202.108.77.153 80 HTTP 175 | 77.236.209.236 8080 HTTP 176 | 116.68.171.70 8080 HTTP 177 | 221.10.40.232 82 HTTP 178 | 27.50.11.165 80 HTTP 179 | 118.96.137.140 8080 HTTP 180 | 5.10.224.58 80 HTTP 181 | 106.3.98.82 83 HTTP 182 | 202.171.253.103 80 HTTP 183 | 198.154.114.100 8080 HTTP 184 | 202.162.198.178 8080 HTTP 185 | 91.202.164.185 8080 HTTP 186 | 223.4.205.37 808 HTTP 187 | 117.34.72.51 808 HTTP 188 | 221.10.40.232 83 HTTP 189 | 36.73.40.189 8080 HTTP 190 | 200.208.251.218 8080 HTTP 191 | 72.64.146.136 43 HTTP 192 | 221.130.23.80 80 HTTP 193 | 112.5.254.19 80 HTTP 194 | 202.171.253.103 85 HTTP 195 | 200.27.114.228 8080 HTTP 196 | 202.182.49.41 8080 HTTP 197 | 103.10.22.226 8080 HTTP 198 | 58.252.56.148 8080 HTTP 199 | 118.97.206.28 8080 HTTP 200 | 221.130.18.76 80 HTTP 201 | 58.67.147.204 8080 HTTP 202 | 201.64.247.3 8080 HTTP 203 | 81.169.154.244 8080 HTTP 204 | 221.130.23.81 80 HTTP 205 | 112.5.254.20 80 HTTP 206 | 202.171.253.108 80 HTTP 207 | 200.27.114.233 8080 HTTP 208 | 110.138.208.50 8080 HTTP 209 | 122.72.15.231 80 HTTP 210 | 118.97.212.162 8080 HTTP 211 | 221.130.199.19 80 HTTP 212 | 77.89.233.54 8080 HTTP 213 | 202.46.85.107 8080 HTTP 214 | 125.69.132.100 8080 HTTP 215 | 221.130.23.82 80 HTTP 216 | 117.41.182.188 8080 HTTP 217 | 202.171.253.108 83 HTTP 218 | 200.54.92.187 80 HTTP 219 | 37.77.50.133 80 HTTP 220 | 111.161.30.228 80 HTTP 221 | 124.81.208.34 8080 HTTP 222 | 119.4.250.105 80 HTTP 223 | 221.130.199.98 80 HTTP 224 | 85.172.4.154 80 HTTP 225 | 202.93.136.98 8080 HTTP 226 | 221.130.23.91 80 HTTP 227 | 118.145.0.76 10086 HTTP 228 | 203.124.12.71 8080 HTTP 229 | 200.61.31.69 8080 HTTP 230 | 61.55.141.11 80 HTTP 231 | 114.113.221.77 54321 HTTP 232 | 180.243.92.86 8080 HTTP 233 | 119.7.221.135 81 HTTP 234 | 221.178.174.171 888 HTTP 235 | 87.236.233.92 8080 HTTP 236 | 203.91.43.43 9988 HTTP 237 | 190.3.108.211 8080 HTTP 238 | 221.181.192.91 80 HTTP 239 | 206.130.99.82 8080 HTTP 240 | 200.71.86.50 8080 HTTP 241 | 61.135.223.4 7000 HTTP 242 | 119.110.69.70 8080 HTTP 243 | 202.74.241.196 8080 HTTP 244 | 119.7.221.135 82 HTTP 245 | 222.124.35.117 8080 HTTP 246 | 110.139.60.228 8080 HTTP 247 | 190.72.150.144 8080 HTTP 248 | 221.215.155.38 8090 HTTP 249 | 120.203.214.162 80 HTTP 250 | 211.232.93.13 808 HTTP 251 | 200.75.51.151 8080 HTTP 252 | 78.133.155.54 8080 HTTP 253 | 180.87.197.91 8080 HTTP 254 | 1.63.18.22 8080 HTTP 255 | 119.7.221.137 82 HTTP 256 | 222.187.222.118 8080 HTTP 257 | 111.13.87.150 80 HTTP 258 | 219.76.104.17 8080 HTTP 259 | 200.208.251.220 8080 HTTP 260 | 222.89.55.123 8080 HTTP 261 | 120.203.214.176 80 HTTP 262 | 218.102.39.153 8080 HTTP 263 | 200.109.228.67 8080 HTTP 264 | 109.207.61.189 8090 HTTP 265 | 180.249.119.252 8080 HTTP 266 | 2.133.92.106 9090 HTTP 267 | 119.59.193.175 8080 HTTP 268 | 62.84.67.170 8080 HTTP 269 | 111.161.30.236 80 HTTP 270 | 49.212.167.222 80 HTTP 271 | 58.20.230.131 8080 HTTP 272 | 222.217.99.72 9000 HTTP 273 | 120.203.214.187 80 HTTP 274 | 200.195.176.77 8080 HTTP 275 | 113.53.254.124 8080 HTTP 276 | 2.133.92.122 9090 HTTP 277 | 119.145.2.18 80 HTTP 278 | 171.101.144.18 8080 HTTP 279 | 112.5.254.172 80 HTTP 280 | 103.247.16.241 8080 HTTP 281 | 61.235.69.243 8080 HTTP 282 | 222.217.99.177 9000 HTTP 283 | 2.135.243.84 9090 HTTP 284 | 200.202.240.174 80 HTTP 285 | 123.235.12.118 8080 HTTP 286 | 200.169.162.132 80 HTTP 287 | 2.133.92.157 80 HTTP 288 | 
119.233.255.51 80 HTTP 289 | 213.131.41.6 8080 HTTP 290 | 178.48.2.237 8080 HTTP 291 | 109.236.220.98 8080 HTTP 292 | 222.240.224.131 80 HTTP 293 | 123.134.95.142 80 HTTP 294 | 5.135.242.225 8080 HTTP 295 | 200.204.161.246 8080 HTTP 296 | 218.22.71.124 8080 HTTP 297 | 2.133.92.158 80 HTTP 298 | 119.233.255.60 80 HTTP 299 | 1.234.45.130 80 HTTP 300 | 180.247.120.217 8080 HTTP 301 | 116.255.234.73 3288 HTTP 302 | 61.177.248.202 1080 SOCKS4 303 | 201.56.208.233 8080 HTTP 304 | 177.83.122.189 8080 HTTP 305 | 218.22.71.126 8080 HTTP 306 | 2.133.93.82 9090 HTTP 307 | 119.235.21.10 8080 HTTP 308 | 27.116.21.163 8080 HTTP 309 | 124.207.170.230 8080 HTTP 310 | 121.204.0.2 80 HTTP 311 | 183.60.44.136 88 HTTP 312 | 49.0.96.1 8000 HTTP 313 | 201.86.70.162 80 HTTP 314 | 177.182.252.197 8080 HTTP 315 | 221.210.40.150 8080 HTTP 316 | 119.252.172.131 80 HTTP 317 | 49.0.110.1 8000 HTTP 318 | 124.240.187.81 82 HTTP 319 | 200.54.78.66 8080 HTTP 320 | 125.165.51.4 8080 HTTP 321 | 183.61.246.78 80 HTTP 322 | 62.201.207.14 8080 HTTP 323 | 201.249.192.74 8080 HTTP 324 | 190.116.87.4 8080 HTTP 325 | 2.135.238.92 9090 HTTP 326 | 120.194.100.46 8001 HTTP 327 | 58.215.88.12 80 HTTP 328 | 164.77.196.78 80 HTTP 329 | 202.154.225.229 8080 HTTP 330 | 186.5.102.162 8080 HTTP 331 | 114.80.136.112 7780 HTTP 332 | 183.129.249.82 80 HTTP 333 | 62.201.210.190 8080 HTTP 334 | 202.152.22.38 8080 HTTP 335 | 31.135.196.229 8080 HTTP 336 | 41.216.171.154 8080 HTTP 337 | 59.49.79.121 9527 HTTP 338 | 177.11.17.46 8080 HTTP 339 | 71.189.47.2 8081 HTTP 340 | 190.78.2.84 8080 HTTP 341 | 115.100.60.198 8000 HTTP 342 | 183.129.249.83 80 HTTP 343 | 63.141.216.176 80 HTTP 344 | 203.172.245.34 8080 HTTP 345 | 195.140.190.146 8080 HTTP 346 | 81.201.61.138 8080 HTTP 347 | 58.53.192.218 8123 HTTP 348 | 121.12.118.241 999 HTTP 349 | 59.57.15.71 80 HTTP 350 | 180.242.88.43 5311 HTTP 351 | 202.29.211.122 8080 HTTP 352 | 115.236.19.48 8080 HTTP 353 | 211.100.47.131 8990 HTTP 354 | 66.35.68.146 8080 HTTP 355 | 212.175.88.3 8080 HTTP 356 | 197.251.194.164 8080 HTTP 357 | 89.171.46.225 8080 HTTP 358 | 59.59.51.74 8001 HTTP 359 | 122.11.38.182 9090 HTTP 360 | 59.172.208.186 8080 HTTP 361 | 183.110.231.124 80 HTTP 362 | 202.202.1.189 80 HTTP 363 | 116.112.66.102 808 HTTP 364 | 211.100.47.244 8990 HTTP 365 | 74.221.211.117 8080 HTTP 366 | 213.110.196.11 80 HTTP 367 | 202.108.50.72 80 HTTP 368 | 94.137.239.19 81 HTTP 369 | 60.165.173.36 8003 HTTP 370 | 122.72.0.6 80 HTTP 371 | 61.156.217.166 8000 HTTP 372 | 187.20.25.42 8080 HTTP 373 | 203.93.104.20 80 HTTP 374 | 119.97.146.152 80 HTTP 375 | 211.100.52.42 8990 HTTP 376 | 77.65.22.245 8080 HTTP 377 | 217.117.14.247 80 HTTP 378 | 202.145.3.130 8080 HTTP 379 | 110.139.58.31 8080 HTTP 380 | 60.191.142.233 8360 HTTP 381 | 122.144.1.213 9999 HTTP 382 | 78.188.3.171 8080 HTTP 383 | 190.29.30.114 8080 HTTP 384 | 119.252.168.34 80 HTTP 385 | 120.194.100.42 8001 HTTP 386 | 211.142.236.133 80 HTTP 387 | 77.78.104.129 8080 HTTP 388 | 218.100.84.123 8080 HTTP 389 | 202.146.237.79 808 HTTP 390 | 114.113.221.70 54321 HTTP 391 | 61.136.93.38 8080 HTTP 392 | 122.252.181.20 8080 HTTP 393 | 78.188.47.21 8080 HTTP 394 | 190.128.170.18 8080 HTTP 395 | 178.233.149.172 8080 HTTP 396 | 120.203.214.182 80 HTTP 397 | 219.83.100.195 8080 HTTP 398 | 208.163.36.221 8080 HTTP 399 | 61.152.108.187 80 HTTP 400 | 123.30.174.61 8080 HTTP 401 | 83.17.80.124 8080 HTTP 402 | 200.60.11.20 8080 HTTP 403 | 177.70.17.154 8080 HTTP 404 | 187.5.122.231 8080 HTTP 405 | 122.72.2.180 80 HTTP 406 | 211.142.236.137 80 HTTP 407 | 77.238.209.194 8080 
HTTP 408 | 222.124.19.210 8080 HTTP 409 | 221.179.173.170 8080 HTTP 410 | 118.97.58.166 8080 HTTP 411 | 61.155.140.154 55808 HTTP 412 | 92.39.54.161 80 HTTP 413 | 200.137.133.169 80 HTTP 414 | 177.85.233.190 8080 HTTP 415 | 122.72.120.63 80 HTTP 416 | 211.142.236.137 8080 HTTP 417 | 78.159.235.3 8080 HTTP 418 | 222.124.147.105 8080 HTTP 419 | 36.73.42.103 8080 HTTP 420 | 101.255.33.254 80 HTTP 421 | 201.41.66.212 8080 HTTP 422 | 178.219.103.205 8080 HTTP 423 | 200.24.17.46 80 HTTP 424 | 122.72.124.2 80 HTTP 425 | 218.22.71.125 8080 HTTP 426 | 80.90.27.60 8080 HTTP 427 | 222.124.207.29 8080 HTTP 428 | 60.166.13.182 80 HTTP 429 | 122.224.5.210 443 HTTP 430 | 85.207.17.146 8080 HTTP 431 | 123.164.148.134 80 HTTP 432 | 103.28.113.134 8080 HTTP 433 | 202.29.60.220 8080 HTTP 434 | 180.250.130.186 8080 HTTP 435 | 202.181.176.3 80 HTTP 436 | 122.225.22.22 8080 HTTP 437 | 218.22.71.210 8080 HTTP 438 | 222.124.218.164 8080 HTTP 439 | 60.216.7.28 3079 HTTP 440 | 89.218.100.90 9090 HTTP 441 | 123.164.148.134 82 HTTP 442 | 103.247.37.86 8080 HTTP 443 | 202.152.40.202 8080 HTTP 444 | 195.191.250.229 80 HTTP 445 | 213.24.60.52 8080 HTTP 446 | 202.97.159.227 8080 HTTP 447 | 218.104.193.102 80 HTTP 448 | 81.213.157.71 80 HTTP 449 | 223.25.195.68 8080 HTTP 450 | 78.38.80.142 8080 HTTP 451 | 186.192.17.138 8080 HTTP 452 | 89.237.134.10 8080 HTTP 453 | 124.81.113.183 8080 HTTP 454 | 109.74.236.165 8080 HTTP 455 | 217.29.117.162 8080 HTTP 456 | 203.110.169.76 9128 HTTP 457 | 218.201.21.175 80 HTTP 458 | 82.200.236.58 9090 HTTP 459 | 72.64.146.136 8080 HTTP 460 | 81.90.224.209 8080 HTTP 461 | 189.29.118.245 8080 HTTP 462 | 103.23.139.97 8080 HTTP 463 | 125.39.238.242 8080 HTTP 464 | 109.224.5.194 80 HTTP 465 | 60.214.67.86 8080 HTTP 466 | 203.110.169.83 9128 HTTP 467 | 218.201.21.176 80 HTTP 468 | 110.74.222.117 8080 HTTP 469 | 95.129.199.70 8080 HTTP 470 | 190.79.44.28 8080 HTTP 471 | 211.151.171.207 80 HTTP 472 | 218.108.242.100 48814 HTTP 473 | 93.189.28.106 8080 HTTP 474 | 211.144.76.58 9000 HTTP 475 | 218.201.21.177 80 HTTP 476 | 82.200.254.114 9090 HTTP 477 | 122.72.76.122 80 HTTP 478 | 103.5.49.37 8080 HTTP 479 | 201.12.116.18 8080 HTTP 480 | 109.207.61.182 8090 HTTP 481 | 150.165.75.129 8080 HTTP 482 | 111.161.30.237 80 HTTP 483 | 212.76.180.50 8080 HTTP 484 | 72.64.146.135 8080 HTTP 485 | 110.139.151.124 8080 HTTP 486 | 211.154.151.218 88 HTTP 487 | 218.201.21.178 80 HTTP 488 | 82.200.254.146 9090 HTTP 489 | 122.72.76.130 80 HTTP 490 | 109.207.61.167 8090 HTTP 491 | 202.43.188.9 8080 HTTP 492 | 110.136.245.31 8080 HTTP 493 | 151.236.194.2 8080 HTTP 494 | 113.108.92.104 80 HTTP 495 | 218.56.161.14 8118 HTTP 496 | 116.77.35.118 80 HTTP 497 | 211.167.112.14 80 HTTP 498 | 218.204.39.164 80 HTTP 499 | 89.188.224.70 8080 HTTP 500 | 190.121.154.246 8080 HTTP 501 | 124.240.187.79 81 HTTP 502 | 202.43.188.15 8080 HTTP 503 | 112.175.18.180 80 HTTP 504 | 164.77.196.75 80 HTTP 505 | 114.32.95.96 8080 HTTP 506 | 219.76.104.1 80 HTTP 507 | 111.161.30.233 80 HTTP 508 | 211.167.112.14 82 HTTP 509 | 221.130.17.37 80 HTTP 510 | 89.218.68.13 80 HTTP 511 | 200.55.206.210 8080 HTTP 512 | 124.240.187.79 82 HTTP 513 | 113.142.8.205 8080 HTTP 514 | 177.125.167.253 8080 HTTP 515 | 219.76.104.1 8080 HTTP 516 | 118.97.255.107 8080 HTTP 517 | 211.167.112.15 80 HTTP 518 | 221.130.18.45 80 HTTP 519 | 89.218.68.34 9090 HTTP 520 | 219.154.46.138 8080 HTTP 521 | 186.16.203.50 8080 HTTP 522 | 202.102.48.205 8080 HTTP 523 | 113.195.134.231 8080 HTTP 524 | 178.169.97.35 54321 HTTP 525 | 116.112.66.102 808 HTTP 526 | 219.231.164.40 45238 
HTTP 527 | 176.33.138.156 8080 HTTP 528 | 211.167.112.15 82 HTTP 529 | 221.130.18.52 80 HTTP 530 | 89.218.68.130 9090 HTTP 531 | 197.251.194.126 8080 HTTP 532 | 203.93.28.166 8080 HTTP 533 | 183.110.231.240 80 HTTP 534 | 118.96.66.107 80 HTTP 535 | 219.242.50.50 8080 HTTP 536 | 211.167.112.16 80 HTTP 537 | 221.130.18.189 80 HTTP 538 | 89.218.68.132 80 HTTP 539 | 2.133.92.18 9090 HTTP 540 | 202.102.58.208 80 HTTP 541 | 218.108.242.105 41884 HTTP 542 | 116.226.46.19 8080 HTTP 543 | 183.221.250.137 80 HTTP 544 | 118.97.91.129 8080 HTTP 545 | 221.7.145.42 8080 HTTP 546 | 211.167.112.17 80 HTTP 547 | 221.130.18.253 80 HTTP 548 | 89.218.100.210 9090 HTTP 549 | 2.133.92.26 9090 HTTP 550 | 202.102.58.209 80 HTTP 551 | 27.116.21.162 8080 HTTP 552 | 118.26.231.104 5060 HTTP 553 | 183.221.250.141 80 HTTP 554 | 222.74.98.234 8080 HTTP 555 | 222.89.154.6 9000 HTTP 556 | 202.28.110.17 8080 HTTP 557 | 211.167.112.17 82 HTTP 558 | 221.176.14.72 80 HTTP 559 | 89.218.100.250 9090 HTTP 560 | 2.133.92.162 9090 HTTP 561 | 118.194.164.90 54321 HTTP 562 | 186.101.41.25 80 HTTP 563 | 118.195.65.243 80 HTTP 564 | 5.8.242.10 8080 HTTP 565 | 202.107.195.231 80 HTTP 566 | 218.28.254.77 880 HTTP 567 | 221.176.169.194 8001 HTTP 568 | 89.218.101.74 9090 HTTP 569 | 46.249.66.50 80 HTTP 570 | 119.7.221.137 81 HTTP 571 | 119.167.231.183 80 HTTP 572 | 41.78.25.69 8080 HTTP 573 | 210.212.152.5 80 HTTP 574 | 221.6.15.156 82 HTTP 575 | 221.181.192.25 80 HTTP 576 | 2.133.93.154 9090 HTTP 577 | 210.101.131.232 8080 HTTP 578 | 93.90.235.178 8080 HTTP 579 | 119.7.221.137 83 HTTP 580 | 186.101.65.115 80 HTTP 581 | 50.22.206.179 8080 HTTP 582 | 221.6.15.157 82 HTTP 583 | 93.94.180.15 8080 HTTP 584 | 2.133.94.42 9090 HTTP 585 | 218.94.1.166 82 HTTP 586 | 110.139.206.93 80 HTTP 587 | 119.62.128.172 80 HTTP 588 | 61.8.72.99 8080 HTTP 589 | 141.105.87.77 80 HTTP 590 | 41.215.3.98 80 HTTP 591 | 222.88.94.245 80 HTTP 592 | 101.255.36.30 808 HTTP 593 | 218.249.114.42 8088 HTTP 594 | 111.161.30.227 80 HTTP 595 | 121.14.9.76 80 HTTP 596 | 187.4.63.148 80 HTTP 597 | 68.71.76.242 8082 HTTP 598 | 197.251.194.121 8080 HTTP 599 | 119.187.148.102 8000 HTTP 600 | 222.92.141.155 8090 HTTP 601 | 101.255.60.158 8080 HTTP 602 | 2.135.237.194 9090 HTTP 603 | 111.161.30.232 80 HTTP 604 | 121.17.144.132 8080 HTTP 605 | 187.4.63.149 80 HTTP 606 | 121.52.144.245 80 HTTP 607 | 74.221.215.254 8080 HTTP 608 | 122.115.62.108 8081 HTTP 609 | 221.130.23.29 80 HTTP 610 | 222.187.222.118 8080 HTTP 611 | 103.11.99.162 8080 HTTP 612 | 2.135.237.250 9090 HTTP 613 | 2.133.94.26 9090 HTTP 614 | 112.125.120.145 10080 HTTP 615 | 122.4.78.26 34808 HTTP 616 | 122.72.0.145 80 HTTP 617 | 78.80.36.194 8080 HTTP 618 | 219.76.104.18 8080 HTTP 619 | 221.179.173.170 8080 HTTP 620 | 222.188.10.1 1080 SOCKS4 621 | 103.246.145.184 8080 HTTP 622 | 2.135.242.162 9090 HTTP 623 | 2.135.238.146 9090 HTTP 624 | 122.72.0.28 80 HTTP 625 | 187.33.208.250 8080 HTTP 626 | 122.72.2.180 80 HTTP 627 | 78.131.55.82 8080 HTTP 628 | 58.67.147.196 8080 HTTP 629 | 202.116.1.149 8128 HTTP 630 | 221.195.42.195 8080 HTTP 631 | 222.217.99.156 9000 HTTP 632 | 2.135.242.170 9090 HTTP 633 | 24.158.199.54 8082 HTTP 634 | 117.121.238.17 8080 HTTP 635 | 123.125.74.212 80 HTTP 636 | 187.115.65.187 80 HTTP 637 | 122.72.2.180 8080 HTTP 638 | 82.79.92.226 8080 HTTP 639 | 202.137.22.182 8080 HTTP 640 | 223.4.173.109 808 HTTP 641 | 2.135.242.186 9090 HTTP 642 | 118.96.192.84 8080 HTTP 643 | 124.240.187.79 80 HTTP 644 | 91.202.164.29 8080 HTTP 645 | 103.10.22.231 8080 HTTP 646 | 27.54.218.248 80 HTTP 647 | 
222.74.98.234 8080 HTTP 648 | 27.50.132.145 88 HTTP 649 | 2.135.242.226 9090 HTTP 650 | 190.67.169.194 8080 HTTP 651 | 124.240.187.79 83 HTTP 652 | 123.164.148.132 80 HTTP 653 | 101.255.33.249 80 HTTP 654 | 69.29.105.153 8080 HTTP 655 | 58.67.147.205 8080 HTTP 656 | 61.167.49.188 8080 HTTP 657 | 124.240.187.80 80 HTTP 658 | 190.66.22.53 8080 HTTP 659 | 109.207.61.170 8090 HTTP 660 | 180.248.156.56 8080 HTTP 661 | 114.113.221.69 54321 HTTP 662 | 124.95.142.94 80 HTTP 663 | 58.248.254.38 80 HTTP 664 | -------------------------------------------------------------------------------- /newsspider/query_db.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | #coding: utf-8 3 | 4 | def query(): 5 | import MySQLdb 6 | conn = MySQLdb.connect(host='localhost', user='root',passwd='feisky', 7 | db='news' , charset="utf8") 8 | try: 9 | cursor = conn.cursor() 10 | cursor.execute("select * from news order by created desc") 11 | data = cursor.fetchall() 12 | finally: 13 | conn.close() 14 | 15 | for d in data: 16 | print '%-60s %-30s' % (d[1],d[2]) 17 | 18 | if __name__ == '__main__': 19 | query() 20 | -------------------------------------------------------------------------------- /newsspider/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # http://doc.scrapy.org/en/latest/topics/scrapyd.html 5 | 6 | [settings] 7 | default = newsspider.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = newsspider 12 | -------------------------------------------------------------------------------- /newsspider/webkit2png: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # 3 | # webkit2png.py 4 | # 5 | # Creates screenshots of webpages using by QtWebkit. 6 | # 7 | # Copyright (c) 2008 Roland Tapken 8 | # 9 | # This program is free software; you can redistribute it and/or 10 | # modify it under the terms of the GNU General Public License 11 | # as published by the Free Software Foundation; either version 2 12 | # of the License, or (at your option) any later version. 13 | # 14 | # This program is distributed in the hope that it will be useful, 15 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 16 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 17 | # GNU General Public License for more details. 18 | # 19 | # You should have received a copy of the GNU General Public License 20 | # along with this program; if not, write to the Free Software 21 | # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA 22 | # 23 | # Nice ideas "todo": 24 | # - Add QTcpSocket support to create a "screenshot daemon" that 25 | # can handle multiple requests at the same time. 
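#
# Example invocation (the URL is a placeholder; the flags mirror the exact
# command NewsspiderPipeline shells out to when grabbing page previews):
#
#   /usr/bin/webkit2png -x 1366 768 -F javascript "http://example.com/" -o preview.png
#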
26 | 27 | from webkit2png import WebkitRenderer 28 | 29 | import sys 30 | import signal 31 | import os 32 | import urlparse 33 | import logging 34 | from optparse import OptionParser 35 | 36 | from PyQt4.QtCore import * 37 | from PyQt4.QtGui import * 38 | from PyQt4.QtWebKit import * 39 | from PyQt4.QtNetwork import * 40 | 41 | VERSION="20091224" 42 | LOG_FILENAME = 'webkit2png.log' 43 | logger = logging.getLogger('webkit2png'); 44 | 45 | def init_qtgui(display=None, style=None, qtargs=None): 46 | """Initiates the QApplication environment using the given args.""" 47 | if QApplication.instance(): 48 | logger.debug("QApplication has already been instantiated. \ 49 | Ignoring given arguments and returning existing QApplication.") 50 | return QApplication.instance() 51 | 52 | qtargs2 = [sys.argv[0]] 53 | 54 | if display: 55 | qtargs2.append('-display') 56 | qtargs2.append(display) 57 | # Also export DISPLAY var as this may be used 58 | # by flash plugin 59 | os.environ["DISPLAY"] = display 60 | 61 | if style: 62 | qtargs2.append('-style') 63 | qtargs2.append(style) 64 | 65 | qtargs2.extend(qtargs or []) 66 | 67 | return QApplication(qtargs2) 68 | 69 | 70 | if __name__ == '__main__': 71 | # This code will be executed if this module is run 'as-is'. 72 | 73 | # Enable HTTP proxy 74 | if 'http_proxy' in os.environ: 75 | proxy_url = urlparse.urlparse(os.environ.get('http_proxy')) 76 | proxy = QNetworkProxy(QNetworkProxy.HttpProxy, proxy_url.hostname, proxy_url.port) 77 | QNetworkProxy.setApplicationProxy(proxy) 78 | 79 | # Parse command line arguments. 80 | # Syntax: 81 | # $0 [--xvfb|--display=DISPLAY] [--debug] [--output=FILENAME] 82 | 83 | description = "Creates a screenshot of a website using QtWebkit." \ 84 | + "This program comes with ABSOLUTELY NO WARRANTY. " \ 85 | + "This is free software, and you are welcome to redistribute " \ 86 | + "it under the terms of the GNU General Public License v2." 
87 | 88 | parser = OptionParser(usage="usage: %prog [options] ", 89 | version="%prog " + VERSION + ", Copyright (c) Roland Tapken", 90 | description=description, add_help_option=True) 91 | parser.add_option("-x", "--xvfb", nargs=2, type="int", dest="xvfb", 92 | help="Start an 'xvfb' instance with the given desktop size.", metavar="WIDTH HEIGHT") 93 | parser.add_option("-g", "--geometry", dest="geometry", nargs=2, default=(0, 0), type="int", 94 | help="Geometry of the virtual browser window (0 means 'autodetect') [default: %default].", metavar="WIDTH HEIGHT") 95 | parser.add_option("-o", "--output", dest="output", 96 | help="Write output to FILE instead of STDOUT.", metavar="FILE") 97 | parser.add_option("-f", "--format", dest="format", default="png", 98 | help="Output image format [default: %default]", metavar="FORMAT") 99 | parser.add_option("--scale", dest="scale", nargs=2, type="int", 100 | help="Scale the image to this size", metavar="WIDTH HEIGHT") 101 | parser.add_option("--aspect-ratio", dest="ratio", type="choice", choices=["ignore", "keep", "expand", "crop"], 102 | help="One of 'ignore', 'keep', 'crop' or 'expand' [default: %default]") 103 | parser.add_option("-F", "--feature", dest="features", action="append", type="choice", 104 | choices=["javascript", "plugins"], 105 | help="Enable additional Webkit features ('javascript', 'plugins')", metavar="FEATURE") 106 | parser.add_option("-w", "--wait", dest="wait", default=0, type="int", 107 | help="Time to wait after loading before the screenshot is taken [default: %default]", metavar="SECONDS") 108 | parser.add_option("-t", "--timeout", dest="timeout", default=0, type="int", 109 | help="Time before the request will be canceled [default: %default]", metavar="SECONDS") 110 | parser.add_option("-W", "--window", dest="window", action="store_true", 111 | help="Grab whole window instead of frame (may be required for plugins)", default=False) 112 | parser.add_option("-T", "--transparent", dest="transparent", action="store_true", 113 | help="Render output on a transparent background (Be sure to have a transparent background defined in the html)", default=False) 114 | parser.add_option("", "--style", dest="style", 115 | help="Change the Qt look and feel to STYLE (e.G. 
'windows').", metavar="STYLE") 116 | parser.add_option("", "--encoded-url", dest="encoded_url", action="store_true", 117 | help="Treat URL as url-encoded", metavar="ENCODED_URL", default=False) 118 | parser.add_option("-d", "--display", dest="display", 119 | help="Connect to X server at DISPLAY.", metavar="DISPLAY") 120 | parser.add_option("--debug", action="store_true", dest="debug", 121 | help="Show debugging information.", default=False) 122 | parser.add_option("--log", action="store", dest="logfile", default=LOG_FILENAME, 123 | help="Select the log output file",) 124 | 125 | # Parse command line arguments and validate them (as far as we can) 126 | (options,args) = parser.parse_args() 127 | if len(args) != 1: 128 | parser.error("incorrect number of arguments") 129 | if options.display and options.xvfb: 130 | parser.error("options -x and -d are mutually exclusive") 131 | options.url = args[0] 132 | 133 | logging.basicConfig(filename=options.logfile,level=logging.WARN,) 134 | 135 | # Enable output of debugging information 136 | if options.debug: 137 | logger.setLevel(logging.DEBUG) 138 | 139 | if options.xvfb: 140 | # Start 'xvfb' instance by replacing the current process 141 | server_num = int(os.getpid() + 1e6) 142 | newArgs = ["xvfb-run", "--auto-servernum", "--server-num", str(server_num), "--server-args=-screen 0, %dx%dx24" % options.xvfb, sys.argv[0]] 143 | skipArgs = 0 144 | for i in range(1, len(sys.argv)): 145 | if skipArgs > 0: 146 | skipArgs -= 1 147 | elif sys.argv[i] in ["-x", "--xvfb"]: 148 | skipArgs = 2 # following: width and height 149 | else: 150 | newArgs.append(sys.argv[i]) 151 | logger.debug("Executing %s" % " ".join(newArgs)) 152 | try: 153 | os.execvp(newArgs[0],newArgs[1:]) 154 | except OSError: 155 | logger.error("Unable to find '%s'" % newArgs[0]) 156 | print >> sys.stderr, "Error - Unable to find '%s' for -x/--xvfb option" % newArgs[0] 157 | sys.exit(1) 158 | 159 | # Prepare output ("1" means STDOUT) 160 | if options.output is None: 161 | options.output = sys.stdout 162 | else: 163 | options.output = open(options.output, "w") 164 | 165 | logger.debug("Version %s, Python %s, Qt %s", VERSION, sys.version, qVersion()); 166 | 167 | # Technically, this is a QtGui application, because QWebPage requires it 168 | # to be. But because we will have no user interaction, and rendering can 169 | # not start before 'app.exec_()' is called, we have to trigger our "main" 170 | # by a timer event. 171 | def __main_qt(): 172 | # Render the page. 
173 | # If this method times out or loading failed, a 174 | # RuntimeException is thrown 175 | try: 176 | # Initialize WebkitRenderer object 177 | renderer = WebkitRenderer() 178 | renderer.logger = logger 179 | renderer.width = options.geometry[0] 180 | renderer.height = options.geometry[1] 181 | renderer.timeout = options.timeout 182 | renderer.wait = options.wait 183 | renderer.format = options.format 184 | renderer.grabWholeWindow = options.window 185 | renderer.renderTransparentBackground = options.transparent 186 | renderer.encodedUrl = options.encoded_url 187 | 188 | if options.scale: 189 | renderer.scaleRatio = options.ratio 190 | renderer.scaleToWidth = options.scale[0] 191 | renderer.scaleToHeight = options.scale[1] 192 | 193 | if options.features: 194 | if "javascript" in options.features: 195 | renderer.qWebSettings[QWebSettings.JavascriptEnabled] = True 196 | if "plugins" in options.features: 197 | renderer.qWebSettings[QWebSettings.PluginsEnabled] = True 198 | 199 | renderer.render_to_file(url=options.url, file_object=options.output) 200 | options.output.close() 201 | QApplication.exit(0) 202 | except RuntimeError, e: 203 | logger.error("main: %s" % e) 204 | print >> sys.stderr, e 205 | QApplication.exit(1) 206 | 207 | # Initialize Qt-Application, but make this script 208 | # abortable via CTRL-C 209 | app = init_qtgui(display = options.display, style=options.style) 210 | signal.signal(signal.SIGINT, signal.SIG_DFL) 211 | 212 | QTimer.singleShot(0, __main_qt) 213 | sys.exit(app.exec_()) -------------------------------------------------------------------------------- /newsspider/webkit2png.log: -------------------------------------------------------------------------------- 1 | WARNING:webkit2png:Failed to load http://www.nytimes.com/2013/03/10/opinion/sunday/living-with-less-a-lot-less.html?pagewanted=2 2 | WARNING:webkit2png:Failed to load http://aws.amazon.com/about-aws/whats-new/2013/03/11/announcing-aws-elastic-beanstalk-for-node-js/ 3 | WARNING:webkit2png:SSL: The host name did not match any of the valid hosts for this certificate 4 | WARNING:webkit2png:SSL: The host name did not match any of the valid hosts for this certificate 5 | WARNING:webkit2png:SSL: The host name did not match any of the valid hosts for this certificate 6 | -------------------------------------------------------------------------------- /newsspider/webkit2png.py: -------------------------------------------------------------------------------- 1 | # 2 | # webkit2png.py 3 | # 4 | # Creates screenshots of webpages using by QtWebkit. 5 | # 6 | # Copyright (c) 2008 Roland Tapken 7 | # 8 | # This program is free software; you can redistribute it and/or 9 | # modify it under the terms of the GNU General Public License 10 | # as published by the Free Software Foundation; either version 2 11 | # of the License, or (at your option) any later version. 12 | # 13 | # This program is distributed in the hope that it will be useful, 14 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 15 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 16 | # GNU General Public License for more details. 17 | # 18 | # You should have received a copy of the GNU General Public License 19 | # along with this program; if not, write to the Free Software 20 | # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA 21 | # 22 | # Nice ideas "todo": 23 | # - Add QTcpSocket support to create a "screenshot daemon" that 24 | # can handle multiple requests at the same time. 
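#
# Minimal programmatic sketch (an illustration, not part of the original
# module; the URL and output name are placeholders). WebkitRenderer needs a
# running QApplication and must render from Qt's GUI thread, so the call is
# deferred with a timer, as in the webkit2png wrapper script above:
#
#   import sys
#   from PyQt4.QtCore import QTimer
#   from PyQt4.QtGui import QApplication
#
#   app = QApplication([sys.argv[0]])
#   renderer = WebkitRenderer(width=1366, height=768, format='png')
#
#   def main():
#       with open('out.png', 'wb') as f:
#           renderer.render_to_file(url='http://example.com/', file_object=f)
#       QApplication.exit(0)
#
#   QTimer.singleShot(0, main)
#   sys.exit(app.exec_())
#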
25 | 26 | import time 27 | 28 | from PyQt4.QtCore import * 29 | from PyQt4.QtGui import * 30 | from PyQt4.QtWebKit import * 31 | from PyQt4.QtNetwork import * 32 | 33 | # Class for Website-Rendering. Uses QWebPage, which 34 | # requires a running QtGui to work. 35 | class WebkitRenderer(QObject): 36 | """A class that helps to create 'screenshots' of webpages using 37 | Qt's QWebkit. Requires PyQt4 library. 38 | 39 | Use "render()" to get a 'QImage' object, render_to_bytes() to get the 40 | resulting image as 'str' object or render_to_file() to write the image 41 | directly into a 'file' resource. 42 | 43 | These methods have to be called from within Qt's main (GUI) thread. 44 | An example on how to use this is the __qt_main() method at the end 45 | of the libraries source file. More generic examples: 46 | 47 | def qt_main(): 48 | while go_on(): 49 | do_something_meaningful() 50 | while QApplication.hasPendingEvents(): 51 | QApplication.processEvents() 52 | QApplication.quit() 53 | 54 | app = init_qtgui() 55 | QTimer.singleShot(0, qt_main) 56 | sys.exit(app.exec_()) 57 | 58 | Or let Qt handle event processing using a QTimer instance: 59 | 60 | def qt_main_loop(): 61 | if not go_on(): 62 | QApplication.quit() 63 | return 64 | do_something_meaningful() 65 | 66 | app = init_qtgui() 67 | main_timer = QTimer() 68 | QObject.connect(main_timer, QtCore.SIGNAL("timeout()"), qt_main_loop) 69 | sys.exit(app.exec_()) 70 | 71 | Avaible properties: 72 | width -- The width of the "browser" window. 0 means autodetect (default). 73 | height -- The height of the window. 0 means autodetect (default). 74 | timeout -- Seconds after that the request is aborted (default: 0) 75 | wait -- Seconds to wait after loading has been finished (default: 0) 76 | scaleToWidth -- The resulting image is scaled to this width. 77 | scaleToHeight -- The resulting image is scaled to this height. 78 | scaleRatio -- The image is scaled using this method. Possible values are: 79 | keep 80 | expand 81 | crop 82 | ignore 83 | grabWhileWindow -- If this is True a screenshot of the whole window is taken. Otherwise only the current frame is rendered. This is required for plugins to be visible, but it is possible that another window overlays the current one while the screenshot is taken. To reduce this possibility, the window is activated just before it is rendered if this property is set to True (default: False). 84 | qWebSettings -- Settings that should be assigned to the created QWebPage instance. See http://doc.trolltech.com/4.6/qwebsettings.html for possible keys. Defaults: 85 | JavascriptEnabled: False 86 | PluginsEnabled: False 87 | PrivateBrowsingEnabled: True 88 | JavascriptCanOpenWindows: False 89 | """ 90 | 91 | def __init__(self,**kwargs): 92 | """Sets default values for the properties.""" 93 | 94 | if not QApplication.instance(): 95 | raise RuntimeError(self.__class__.__name__ + " requires a running QApplication instance") 96 | QObject.__init__(self) 97 | 98 | # Initialize default properties 99 | self.width = kwargs.get('width', 0) 100 | self.height = kwargs.get('height', 0) 101 | self.timeout = kwargs.get('timeout', 0) 102 | self.wait = kwargs.get('wait', 0) 103 | self.scaleToWidth = kwargs.get('scaleToWidth', 0) 104 | self.scaleToHeight = kwargs.get('scaleToHeight', 0) 105 | self.scaleRatio = kwargs.get('scaleRatio', 'keep') 106 | self.format = kwargs.get('format', 'png') 107 | self.logger = kwargs.get('logger', None) 108 | # Set this to true if you want to capture flash. 
109 | # Not that your desktop must be large enough for 110 | # fitting the whole window. 111 | self.grabWholeWindow = kwargs.get('grabWholeWindow', False) 112 | self.renderTransparentBackground = kwargs.get('renderTransparentBackground', False) 113 | self.ignoreAlert = kwargs.get('ignoreAlert', True) 114 | self.ignoreConfirm = kwargs.get('ignoreConfirm', True) 115 | self.ignorePrompt = kwargs.get('ignorePrompt', True) 116 | self.interruptJavaScript = kwargs.get('interruptJavaScript', True) 117 | self.encodedUrl = kwargs.get('encodedUrl', False) 118 | 119 | # Set some default options for QWebPage 120 | self.qWebSettings = { 121 | QWebSettings.JavascriptEnabled : False, 122 | QWebSettings.PluginsEnabled : False, 123 | QWebSettings.PrivateBrowsingEnabled : True, 124 | QWebSettings.JavascriptCanOpenWindows : False 125 | } 126 | 127 | 128 | def render(self, url): 129 | """Renders the given URL into a QImage object""" 130 | # We have to use this helper object because 131 | # QApplication.processEvents may be called, causing 132 | # this method to get called while it has not returned yet. 133 | helper = _WebkitRendererHelper(self) 134 | helper._window.resize( self.width, self.height ) 135 | image = helper.render(url) 136 | 137 | # Bind helper instance to this image to prevent the 138 | # object from being cleaned up (and with it the QWebPage, etc) 139 | # before the data has been used. 140 | image.helper = helper 141 | 142 | return image 143 | 144 | def render_to_file(self, url, file_object): 145 | """Renders the image into a File resource. 146 | Returns the size of the data that has been written. 147 | """ 148 | format = self.format # this may not be constant due to processEvents() 149 | image = self.render(url) 150 | qBuffer = QBuffer() 151 | image.save(qBuffer, format) 152 | file_object.write(qBuffer.buffer().data()) 153 | return qBuffer.size() 154 | 155 | def render_to_bytes(self, url): 156 | """Renders the image into an object of type 'str'""" 157 | format = self.format # this may not be constant due to processEvents() 158 | image = self.render(url) 159 | qBuffer = QBuffer() 160 | image.save(qBuffer, format) 161 | return qBuffer.buffer().data() 162 | 163 | class _WebkitRendererHelper(QObject): 164 | """This helper class is doing the real work. It is required to 165 | allow WebkitRenderer.render() to be called "asynchronously" 166 | (but always from Qt's GUI thread). 167 | """ 168 | 169 | def __init__(self, parent): 170 | """Copies the properties from the parent (WebkitRenderer) object, 171 | creates the required instances of QWebPage, QWebView and QMainWindow 172 | and registers some Slots. 
173 | """ 174 | QObject.__init__(self) 175 | 176 | # Copy properties from parent 177 | for key,value in parent.__dict__.items(): 178 | setattr(self,key,value) 179 | 180 | # Create and connect required PyQt4 objects 181 | self._page = CustomWebPage(logger=self.logger, ignore_alert=self.ignoreAlert, 182 | ignore_confirm=self.ignoreConfirm, ignore_prompt=self.ignorePrompt, 183 | interrupt_js=self.interruptJavaScript) 184 | self._view = QWebView() 185 | self._view.setPage(self._page) 186 | self._window = QMainWindow() 187 | self._window.setCentralWidget(self._view) 188 | 189 | # Import QWebSettings 190 | for key, value in self.qWebSettings.iteritems(): 191 | self._page.settings().setAttribute(key, value) 192 | 193 | # Connect required event listeners 194 | self.connect(self._page, SIGNAL("loadFinished(bool)"), self._on_load_finished) 195 | self.connect(self._page, SIGNAL("loadStarted()"), self._on_load_started) 196 | self.connect(self._page.networkAccessManager(), SIGNAL("sslErrors(QNetworkReply *,const QList&)"), self._on_ssl_errors) 197 | self.connect(self._page.networkAccessManager(), SIGNAL("finished(QNetworkReply *)"), self._on_each_reply) 198 | 199 | # The way we will use this, it seems to be unesseccary to have Scrollbars enabled 200 | self._page.mainFrame().setScrollBarPolicy(Qt.Horizontal, Qt.ScrollBarAlwaysOff) 201 | self._page.mainFrame().setScrollBarPolicy(Qt.Vertical, Qt.ScrollBarAlwaysOff) 202 | self._page.settings().setUserStyleSheetUrl(QUrl("data:text/css,html,body{overflow-y:hidden !important;}")) 203 | 204 | # Show this widget 205 | self._window.show() 206 | 207 | def __del__(self): 208 | """Clean up Qt4 objects. """ 209 | self._window.close() 210 | del self._window 211 | del self._view 212 | del self._page 213 | 214 | def render(self, url): 215 | """The real worker. Loads the page (_load_page) and awaits 216 | the end of the given 'delay'. While it is waiting outstanding 217 | QApplication events are processed. 218 | After the given delay, the Window or Widget (depends 219 | on the value of 'grabWholeWindow' is drawn into a QPixmap 220 | and postprocessed (_post_process_image). 221 | """ 222 | self._load_page(url, self.width, self.height, self.timeout) 223 | # Wait for end of timer. In this time, process 224 | # other outstanding Qt events. 225 | if self.wait > 0: 226 | if self.logger: self.logger.debug("Waiting %d seconds " % self.wait) 227 | waitToTime = time.time() + self.wait 228 | while time.time() < waitToTime: 229 | if QApplication.hasPendingEvents(): 230 | QApplication.processEvents() 231 | 232 | if self.renderTransparentBackground: 233 | # Another possible drawing solution 234 | image = QImage(self._page.viewportSize(), QImage.Format_ARGB32) 235 | image.fill(QColor(255,0,0,0).rgba()) 236 | 237 | # http://ariya.blogspot.com/2009/04/transparent-qwebview-and-qwebpage.html 238 | palette = self._view.palette() 239 | palette.setBrush(QPalette.Base, Qt.transparent) 240 | self._page.setPalette(palette) 241 | self._view.setAttribute(Qt.WA_OpaquePaintEvent, False) 242 | 243 | painter = QPainter(image) 244 | painter.setBackgroundMode(Qt.TransparentMode) 245 | self._page.mainFrame().render(painter) 246 | painter.end() 247 | else: 248 | if self.grabWholeWindow: 249 | # Note that this does not fully ensure that the 250 | # window still has the focus when the screen is 251 | # grabbed. This might result in a race condition. 
252 | self._view.activateWindow() 253 | image = QPixmap.grabWindow(self._window.winId()) 254 | else: 255 | image = QPixmap.grabWidget(self._window) 256 | 257 | return self._post_process_image(image) 258 | 259 | def _load_page(self, url, width, height, timeout): 260 | """ 261 | This method implements the logic for retrieving and displaying 262 | the requested page. 263 | """ 264 | 265 | # This is an event-based application. So we have to wait until 266 | # "loadFinished(bool)" is raised. 267 | cancelAt = time.time() + timeout 268 | self.__loading = True 269 | self.__loading_result = False # Default 270 | if self.encodedUrl: 271 | self._page.mainFrame().load(QUrl.fromEncoded(url)) 272 | else: 273 | self._page.mainFrame().load(QUrl(url)) 274 | while self.__loading: 275 | if timeout > 0 and time.time() >= cancelAt: 276 | raise RuntimeError("Request timed out on %s" % url) 277 | while QApplication.hasPendingEvents() and self.__loading: 278 | QCoreApplication.processEvents() 279 | 280 | if self.logger: self.logger.debug("Processing result") 281 | 282 | if self.__loading_result == False: 283 | if self.logger: self.logger.warning("Failed to load %s" % url) 284 | 285 | # Set initial viewport (the size of the "window") 286 | size = self._page.mainFrame().contentsSize() 287 | if self.logger: self.logger.debug("contentsSize: %s", size) 288 | if width > 0: 289 | size.setWidth(width) 290 | if height > 0: 291 | size.setHeight(height) 292 | 293 | self._window.resize(size) 294 | 295 | def _post_process_image(self, qImage): 296 | """If 'scaleToWidth' or 'scaleToHeight' are set to a value 297 | greater than zero this method will scale the image 298 | using the method defined in 'scaleRatio'. 299 | """ 300 | if self.scaleToWidth > 0 or self.scaleToHeight > 0: 301 | # Scale this image 302 | if self.scaleRatio == 'keep': 303 | ratio = Qt.KeepAspectRatio 304 | elif self.scaleRatio in ['expand', 'crop']: 305 | ratio = Qt.KeepAspectRatioByExpanding 306 | else: # 'ignore' 307 | ratio = Qt.IgnoreAspectRatio 308 | qImage = qImage.scaled(self.scaleToWidth, self.scaleToHeight, ratio) 309 | if self.scaleRatio == 'crop': 310 | qImage = qImage.copy(0, 0, self.scaleToWidth, self.scaleToHeight) 311 | return qImage 312 | 313 | def _on_each_reply(self,reply): 314 | """Logs each requested uri""" 315 | if self.logger: self.logger.debug("Received %s" % (reply.url().toString())) 316 | 317 | # Eventhandler for "loadStarted()" signal 318 | def _on_load_started(self): 319 | """Slot that sets the '__loading' property to true.""" 320 | if self.logger: self.logger.debug("loading started") 321 | self.__loading = True 322 | 323 | # Eventhandler for "loadFinished(bool)" signal 324 | def _on_load_finished(self, result): 325 | """Slot that sets the '__loading' property to false and stores 326 | the result code in '__loading_result'.
327 | """ 328 | if self.logger: self.logger.debug("loading finished with result %s", result) 329 | self.__loading = False 330 | self.__loading_result = result 331 | 332 | # Eventhandler for "sslErrors(QNetworkReply *,const QList&)" signal 333 | def _on_ssl_errors(self, reply, errors): 334 | """Slot that writes SSL warnings into the log but ignores them.""" 335 | for e in errors: 336 | if self.logger: self.logger.warn("SSL: " + e.errorString()) 337 | reply.ignoreSslErrors() 338 | 339 | 340 | class CustomWebPage(QWebPage): 341 | def __init__(self, **kwargs): 342 | super(CustomWebPage, self).__init__() 343 | self.logger = kwargs.get('logger', None) 344 | self.ignore_alert = kwargs.get('ignore_alert', True) 345 | self.ignore_confirm = kwargs.get('ignore_confirm', True) 346 | self.ignore_prompt = kwargs.get('ignore_prompt', True) 347 | self.interrupt_js = kwargs.get('interrupt_js', True) 348 | 349 | def javaScriptAlert(self, frame, message): 350 | if self.logger: self.logger.debug('Alert: %s', message) 351 | if not self.ignore_alert: 352 | return super(CustomWebPage, self).javaScriptAlert(frame, message) 353 | 354 | def javaScriptConfirm(self, frame, message): 355 | if self.logger: self.logger.debug('Confirm: %s', message) 356 | if not self.ignore_confirm: 357 | return super(CustomWebPage, self).javaScriptConfirm(frame, message) 358 | else: 359 | return False 360 | 361 | def javaScriptPrompt(self, frame, message, default, result): 362 | """This function is called whenever a JavaScript program running inside frame tries to prompt 363 | the user for input. The program may provide an optional message, msg, as well as a default value 364 | for the input in defaultValue. 365 | 366 | If the prompt was cancelled by the user the implementation should return false; 367 | otherwise the result should be written to result and true should be returned. 368 | If the prompt was not cancelled by the user, the implementation should return true and 369 | the result string must not be null. 370 | """ 371 | if self.logger: self.logger.debug('Prompt: %s (%s)' % (message, default)) 372 | if not self.ignore_prompt: 373 | return super(CustomWebPage, self).javaScriptPrompt(frame, message, default, result) 374 | else: 375 | return False 376 | 377 | def shouldInterruptJavaScript(self): 378 | """This function is called when a JavaScript program is running for a long period of time. 379 | If the user wanted to stop the JavaScript the implementation should return true; otherwise false. 
380 | """ 381 | if self.logger: self.logger.debug("WebKit ask to interrupt JavaScript") 382 | return self.interrupt_js 383 | -------------------------------------------------------------------------------- /proxycrawler/proxies.txt: -------------------------------------------------------------------------------- 1 | 5.199.132.164 443 HTTP 2 | 64.208.21.16 80 HTTP 3 | 110.153.9.250 80 HTTP 4 | 110.139.206.93 8080 HTTP 5 | 72.247.48.10 80 HTTP 6 | 180.250.130.186 80 HTTP 7 | 114.80.149.183 80 HTTP 8 | 123.108.14.39 8080 HTTP 9 | 211.167.112.14 80 HTTP 10 | 89.218.100.178 9090 HTTP 11 | 202.149.78.234 8080 HTTP 12 | 101.255.33.250 80 HTTP 13 | 80.79.179.10 8181 HTTP 14 | 180.250.165.197 8080 HTTP 15 | 119.110.71.109 8080 HTTP 16 | 123.129.242.131 8081 HTTP 17 | 211.167.112.14 82 HTTP 18 | 89.218.100.218 9090 HTTP 19 | 202.201.1.119 8001 HTTP 20 | 80.90.12.36 8080 HTTP 21 | 186.47.84.139 8080 HTTP 22 | 125.141.206.36 8080 HTTP 23 | 211.239.84.130 443 HTTP 24 | 89.218.120.114 9090 HTTP 25 | 203.92.47.202 8082 HTTP 26 | 114.247.21.244 3131 HTTP 27 | 190.92.87.98 8080 HTTP 28 | 87.110.149.88 8080 HTTP 29 | 186.208.71.70 8080 HTTP 30 | 119.235.21.11 80 HTTP 31 | 148.236.5.91 8080 HTTP 32 | 218.28.112.114 809 HTTP 33 | 119.252.168.34 8080 HTTP 34 | 202.52.244.110 8080 HTTP 35 | 218.108.85.59 82 HTTP 36 | 14.31.11.70 9009 HTTP 37 | 186.225.212.245 8080 HTTP 38 | 120.85.132.234 80 HTTP 39 | 175.25.243.27 80 HTTP 40 | 218.61.8.124 88 HTTP 41 | 89.251.103.130 8080 HTTP 42 | 195.202.159.123 8080 HTTP 43 | 212.233.147.48 8080 HTTP 44 | 218.206.204.254 80 HTTP 45 | 14.31.11.78 9009 HTTP 46 | 110.4.12.170 83 HTTP 47 | 186.226.98.254 8080 HTTP 48 | 124.227.191.68 9000 HTTP 49 | 190.29.22.247 8080 HTTP 50 | 218.89.165.131 6060 HTTP 51 | 91.218.84.195 80 HTTP 52 | 210.14.143.53 7020 HTTP 53 | 178.18.17.208 8080 HTTP 54 | 202.43.65.130 8080 HTTP 55 | 37.229.231.253 8080 HTTP 56 | 218.206.204.254 443 HTTP 57 | 58.210.247.18 1337 HTTP 58 | 112.213.118.48 80 HTTP 59 | 187.53.150.62 8080 HTTP 60 | 125.210.188.35 80 HTTP 61 | 195.69.191.203 80 HTTP 62 | 218.102.39.154 8080 HTTP 63 | 91.228.53.28 8080 HTTP 64 | 210.14.143.122 80 HTTP 65 | 202.43.188.5 8080 HTTP 66 | 202.46.146.22 8080 HTTP 67 | 92.126.217.47 80 HTTP 68 | 218.249.83.87 8080 HTTP 69 | 58.211.114.107 443 HTTP 70 | 113.160.50.51 80 HTTP 71 | 178.18.17.250 8080 HTTP 72 | 219.153.5.3 8181 HTTP 73 | 93.186.97.236 8080 HTTP 74 | 210.177.139.89 8080 HTTP 75 | 222.168.65.130 80 HTTP 76 | 103.28.227.78 8080 HTTP 77 | 219.150.254.158 8080 HTTP 78 | 58.221.129.158 1337 HTTP 79 | 119.82.253.88 8080 HTTP 80 | 187.85.89.167 8080 HTTP 81 | 182.93.206.92 8080 HTTP 82 | 202.106.179.141 10160 HTTP 83 | 219.223.252.150 56142 HTTP 84 | 95.77.97.146 8080 HTTP 85 | 211.99.28.21 808 HTTP 86 | 211.142.236.137 8080 HTTP 87 | 41.75.201.146 8080 HTTP 88 | 109.207.63.89 8090 HTTP 89 | 219.159.105.180 8080 HTTP 90 | 59.34.57.88 8080 HTTP 91 | 187.110.169.186 8080 HTTP 92 | 190.40.80.144 8080 HTTP 93 | 210.212.98.228 80 HTTP 94 | 220.246.4.74 8080 HTTP 95 | 211.142.236.133 8080 HTTP 96 | 218.22.71.122 8080 HTTP 97 | 58.210.212.107 80 HTTP 98 | 112.25.15.18 9098 HTTP 99 | 59.37.168.16 8081 HTTP 100 | 185.8.2.50 8080 HTTP 101 | 190.102.17.121 80 HTTP 102 | 218.106.99.22 888 HTTP 103 | 221.3.153.74 80 HTTP 104 | 106.3.98.79 80 HTTP 105 | 211.142.236.137 80 HTTP 106 | 2.133.93.170 9090 HTTP 107 | 78.9.164.162 8080 HTTP 108 | 119.6.73.235 80 HTTP 109 | 221.130.17.48 80 HTTP 110 | 59.57.15.71 80 HTTP 111 | 186.5.65.164 8080 HTTP 112 | 190.0.17.202 8080 HTTP 113 | 197.254.11.30 8080 HTTP 
114 | 5.8.242.12 8080 HTTP 115 | 221.130.18.218 80 HTTP 116 | 110.74.220.50 8080 HTTP 117 | 211.144.72.153 80 HTTP 118 | 2.135.237.154 9090 HTTP 119 | 82.200.253.202 9090 HTTP 120 | 119.110.69.70 80 HTTP 121 | 221.130.17.139 80 HTTP 122 | 59.172.208.189 8080 HTTP 123 | 190.14.255.169 8080 HTTP 124 | 190.0.46.66 8080 HTTP 125 | 200.93.115.248 8080 HTTP 126 | 5.10.224.62 80 HTTP 127 | 221.130.18.253 80 HTTP 128 | 110.93.211.11 80 HTTP 129 | 82.209.195.5 8080 HTTP 130 | 125.39.68.195 80 HTTP 131 | 221.130.23.4 80 HTTP 132 | 59.172.208.190 8080 HTTP 133 | 198.15.119.111 8080 HTTP 134 | 190.0.61.194 8080 HTTP 135 | 200.107.32.127 8080 HTTP 136 | 5.10.224.62 8080 HTTP 137 | 221.130.23.4 80 HTTP 138 | 110.93.211.11 8080 HTTP 139 | 218.206.204.254 80 HTTP 140 | 2.135.238.26 9090 HTTP 141 | 180.250.192.222 8080 HTTP 142 | 221.130.23.5 80 HTTP 143 | 200.137.133.171 80 HTTP 144 | 31.170.178.2 8080 HTTP 145 | 221.130.23.6 80 HTTP 146 | 112.5.254.30 80 HTTP 147 | 218.206.204.254 443 HTTP 148 | 2.135.238.108 9090 HTTP 149 | 88.249.127.222 8080 HTTP 150 | 186.46.122.250 8080 HTTP 151 | 221.130.23.6 80 HTTP 152 | 61.166.55.153 11808 HTTP 153 | 201.218.63.4 8080 HTTP 154 | 190.211.97.71 8080 HTTP 155 | 200.213.4.4 8080 HTTP 156 | 41.78.26.45 8080 HTTP 157 | 221.130.23.29 80 HTTP 158 | 112.175.248.22 8080 HTTP 159 | 2.135.242.42 9090 HTTP 160 | 110.138.160.170 8080 HTTP 161 | 187.85.225.185 80 HTTP 162 | 221.130.23.8 80 HTTP 163 | 106.3.98.82 80 HTTP 164 | 201.63.184.5 8080 HTTP 165 | 46.249.66.50 8080 HTTP 166 | 222.169.11.34 8080 HTTP 167 | 114.113.221.72 54321 HTTP 168 | 221.10.40.232 80 HTTP 169 | 2.135.243.42 9090 HTTP 170 | 110.138.163.58 8080 HTTP 171 | 221.130.23.78 80 HTTP 172 | 106.3.98.82 82 HTTP 173 | 202.171.253.98 80 HTTP 174 | 202.108.77.153 80 HTTP 175 | 77.236.209.236 8080 HTTP 176 | 116.68.171.70 8080 HTTP 177 | 221.10.40.232 82 HTTP 178 | 27.50.11.165 80 HTTP 179 | 118.96.137.140 8080 HTTP 180 | 5.10.224.58 80 HTTP 181 | 106.3.98.82 83 HTTP 182 | 202.171.253.103 80 HTTP 183 | 198.154.114.100 8080 HTTP 184 | 202.162.198.178 8080 HTTP 185 | 91.202.164.185 8080 HTTP 186 | 223.4.205.37 808 HTTP 187 | 117.34.72.51 808 HTTP 188 | 221.10.40.232 83 HTTP 189 | 36.73.40.189 8080 HTTP 190 | 200.208.251.218 8080 HTTP 191 | 72.64.146.136 43 HTTP 192 | 221.130.23.80 80 HTTP 193 | 112.5.254.19 80 HTTP 194 | 202.171.253.103 85 HTTP 195 | 200.27.114.228 8080 HTTP 196 | 202.182.49.41 8080 HTTP 197 | 103.10.22.226 8080 HTTP 198 | 58.252.56.148 8080 HTTP 199 | 118.97.206.28 8080 HTTP 200 | 221.130.18.76 80 HTTP 201 | 58.67.147.204 8080 HTTP 202 | 201.64.247.3 8080 HTTP 203 | 81.169.154.244 8080 HTTP 204 | 221.130.23.81 80 HTTP 205 | 112.5.254.20 80 HTTP 206 | 202.171.253.108 80 HTTP 207 | 200.27.114.233 8080 HTTP 208 | 110.138.208.50 8080 HTTP 209 | 122.72.15.231 80 HTTP 210 | 118.97.212.162 8080 HTTP 211 | 221.130.199.19 80 HTTP 212 | 77.89.233.54 8080 HTTP 213 | 202.46.85.107 8080 HTTP 214 | 125.69.132.100 8080 HTTP 215 | 221.130.23.82 80 HTTP 216 | 117.41.182.188 8080 HTTP 217 | 202.171.253.108 83 HTTP 218 | 200.54.92.187 80 HTTP 219 | 37.77.50.133 80 HTTP 220 | 111.161.30.228 80 HTTP 221 | 124.81.208.34 8080 HTTP 222 | 119.4.250.105 80 HTTP 223 | 221.130.199.98 80 HTTP 224 | 85.172.4.154 80 HTTP 225 | 202.93.136.98 8080 HTTP 226 | 221.130.23.91 80 HTTP 227 | 118.145.0.76 10086 HTTP 228 | 203.124.12.71 8080 HTTP 229 | 200.61.31.69 8080 HTTP 230 | 61.55.141.11 80 HTTP 231 | 114.113.221.77 54321 HTTP 232 | 180.243.92.86 8080 HTTP 233 | 119.7.221.135 81 HTTP 234 | 221.178.174.171 888 HTTP 235 | 87.236.233.92 
8080 HTTP 236 | 203.91.43.43 9988 HTTP 237 | 190.3.108.211 8080 HTTP 238 | 221.181.192.91 80 HTTP 239 | 206.130.99.82 8080 HTTP 240 | 200.71.86.50 8080 HTTP 241 | 61.135.223.4 7000 HTTP 242 | 119.110.69.70 8080 HTTP 243 | 202.74.241.196 8080 HTTP 244 | 119.7.221.135 82 HTTP 245 | 222.124.35.117 8080 HTTP 246 | 110.139.60.228 8080 HTTP 247 | 190.72.150.144 8080 HTTP 248 | 221.215.155.38 8090 HTTP 249 | 120.203.214.162 80 HTTP 250 | 211.232.93.13 808 HTTP 251 | 200.75.51.151 8080 HTTP 252 | 78.133.155.54 8080 HTTP 253 | 180.87.197.91 8080 HTTP 254 | 1.63.18.22 8080 HTTP 255 | 119.7.221.137 82 HTTP 256 | 222.187.222.118 8080 HTTP 257 | 111.13.87.150 80 HTTP 258 | 219.76.104.17 8080 HTTP 259 | 200.208.251.220 8080 HTTP 260 | 222.89.55.123 8080 HTTP 261 | 120.203.214.176 80 HTTP 262 | 218.102.39.153 8080 HTTP 263 | 200.109.228.67 8080 HTTP 264 | 109.207.61.189 8090 HTTP 265 | 180.249.119.252 8080 HTTP 266 | 2.133.92.106 9090 HTTP 267 | 119.59.193.175 8080 HTTP 268 | 62.84.67.170 8080 HTTP 269 | 111.161.30.236 80 HTTP 270 | 49.212.167.222 80 HTTP 271 | 58.20.230.131 8080 HTTP 272 | 222.217.99.72 9000 HTTP 273 | 120.203.214.187 80 HTTP 274 | 200.195.176.77 8080 HTTP 275 | 113.53.254.124 8080 HTTP 276 | 2.133.92.122 9090 HTTP 277 | 119.145.2.18 80 HTTP 278 | 171.101.144.18 8080 HTTP 279 | 112.5.254.172 80 HTTP 280 | 103.247.16.241 8080 HTTP 281 | 61.235.69.243 8080 HTTP 282 | 222.217.99.177 9000 HTTP 283 | 2.135.243.84 9090 HTTP 284 | 200.202.240.174 80 HTTP 285 | 123.235.12.118 8080 HTTP 286 | 200.169.162.132 80 HTTP 287 | 2.133.92.157 80 HTTP 288 | 119.233.255.51 80 HTTP 289 | 213.131.41.6 8080 HTTP 290 | 178.48.2.237 8080 HTTP 291 | 109.236.220.98 8080 HTTP 292 | 222.240.224.131 80 HTTP 293 | 123.134.95.142 80 HTTP 294 | 5.135.242.225 8080 HTTP 295 | 200.204.161.246 8080 HTTP 296 | 218.22.71.124 8080 HTTP 297 | 2.133.92.158 80 HTTP 298 | 119.233.255.60 80 HTTP 299 | 1.234.45.130 80 HTTP 300 | 180.247.120.217 8080 HTTP 301 | 116.255.234.73 3288 HTTP 302 | 61.177.248.202 1080 SOCKS4 303 | 201.56.208.233 8080 HTTP 304 | 177.83.122.189 8080 HTTP 305 | 218.22.71.126 8080 HTTP 306 | 2.133.93.82 9090 HTTP 307 | 119.235.21.10 8080 HTTP 308 | 27.116.21.163 8080 HTTP 309 | 124.207.170.230 8080 HTTP 310 | 121.204.0.2 80 HTTP 311 | 183.60.44.136 88 HTTP 312 | 49.0.96.1 8000 HTTP 313 | 201.86.70.162 80 HTTP 314 | 177.182.252.197 8080 HTTP 315 | 221.210.40.150 8080 HTTP 316 | 119.252.172.131 80 HTTP 317 | 49.0.110.1 8000 HTTP 318 | 124.240.187.81 82 HTTP 319 | 200.54.78.66 8080 HTTP 320 | 125.165.51.4 8080 HTTP 321 | 183.61.246.78 80 HTTP 322 | 62.201.207.14 8080 HTTP 323 | 201.249.192.74 8080 HTTP 324 | 190.116.87.4 8080 HTTP 325 | 2.135.238.92 9090 HTTP 326 | 120.194.100.46 8001 HTTP 327 | 58.215.88.12 80 HTTP 328 | 164.77.196.78 80 HTTP 329 | 202.154.225.229 8080 HTTP 330 | 186.5.102.162 8080 HTTP 331 | 114.80.136.112 7780 HTTP 332 | 183.129.249.82 80 HTTP 333 | 62.201.210.190 8080 HTTP 334 | 202.152.22.38 8080 HTTP 335 | 31.135.196.229 8080 HTTP 336 | 41.216.171.154 8080 HTTP 337 | 59.49.79.121 9527 HTTP 338 | 177.11.17.46 8080 HTTP 339 | 71.189.47.2 8081 HTTP 340 | 190.78.2.84 8080 HTTP 341 | 115.100.60.198 8000 HTTP 342 | 183.129.249.83 80 HTTP 343 | 63.141.216.176 80 HTTP 344 | 203.172.245.34 8080 HTTP 345 | 195.140.190.146 8080 HTTP 346 | 81.201.61.138 8080 HTTP 347 | 58.53.192.218 8123 HTTP 348 | 121.12.118.241 999 HTTP 349 | 59.57.15.71 80 HTTP 350 | 180.242.88.43 5311 HTTP 351 | 202.29.211.122 8080 HTTP 352 | 115.236.19.48 8080 HTTP 353 | 211.100.47.131 8990 HTTP 354 | 66.35.68.146 8080 HTTP 355 | 
212.175.88.3 8080 HTTP 356 | 197.251.194.164 8080 HTTP 357 | 89.171.46.225 8080 HTTP 358 | 59.59.51.74 8001 HTTP 359 | 122.11.38.182 9090 HTTP 360 | 59.172.208.186 8080 HTTP 361 | 183.110.231.124 80 HTTP 362 | 202.202.1.189 80 HTTP 363 | 116.112.66.102 808 HTTP 364 | 211.100.47.244 8990 HTTP 365 | 74.221.211.117 8080 HTTP 366 | 213.110.196.11 80 HTTP 367 | 202.108.50.72 80 HTTP 368 | 94.137.239.19 81 HTTP 369 | 60.165.173.36 8003 HTTP 370 | 122.72.0.6 80 HTTP 371 | 61.156.217.166 8000 HTTP 372 | 187.20.25.42 8080 HTTP 373 | 203.93.104.20 80 HTTP 374 | 119.97.146.152 80 HTTP 375 | 211.100.52.42 8990 HTTP 376 | 77.65.22.245 8080 HTTP 377 | 217.117.14.247 80 HTTP 378 | 202.145.3.130 8080 HTTP 379 | 110.139.58.31 8080 HTTP 380 | 60.191.142.233 8360 HTTP 381 | 122.144.1.213 9999 HTTP 382 | 78.188.3.171 8080 HTTP 383 | 190.29.30.114 8080 HTTP 384 | 119.252.168.34 80 HTTP 385 | 120.194.100.42 8001 HTTP 386 | 211.142.236.133 80 HTTP 387 | 77.78.104.129 8080 HTTP 388 | 218.100.84.123 8080 HTTP 389 | 202.146.237.79 808 HTTP 390 | 114.113.221.70 54321 HTTP 391 | 61.136.93.38 8080 HTTP 392 | 122.252.181.20 8080 HTTP 393 | 78.188.47.21 8080 HTTP 394 | 190.128.170.18 8080 HTTP 395 | 178.233.149.172 8080 HTTP 396 | 120.203.214.182 80 HTTP 397 | 219.83.100.195 8080 HTTP 398 | 208.163.36.221 8080 HTTP 399 | 61.152.108.187 80 HTTP 400 | 123.30.174.61 8080 HTTP 401 | 83.17.80.124 8080 HTTP 402 | 200.60.11.20 8080 HTTP 403 | 177.70.17.154 8080 HTTP 404 | 187.5.122.231 8080 HTTP 405 | 122.72.2.180 80 HTTP 406 | 211.142.236.137 80 HTTP 407 | 77.238.209.194 8080 HTTP 408 | 222.124.19.210 8080 HTTP 409 | 221.179.173.170 8080 HTTP 410 | 118.97.58.166 8080 HTTP 411 | 61.155.140.154 55808 HTTP 412 | 92.39.54.161 80 HTTP 413 | 200.137.133.169 80 HTTP 414 | 177.85.233.190 8080 HTTP 415 | 122.72.120.63 80 HTTP 416 | 211.142.236.137 8080 HTTP 417 | 78.159.235.3 8080 HTTP 418 | 222.124.147.105 8080 HTTP 419 | 36.73.42.103 8080 HTTP 420 | 101.255.33.254 80 HTTP 421 | 201.41.66.212 8080 HTTP 422 | 178.219.103.205 8080 HTTP 423 | 200.24.17.46 80 HTTP 424 | 122.72.124.2 80 HTTP 425 | 218.22.71.125 8080 HTTP 426 | 80.90.27.60 8080 HTTP 427 | 222.124.207.29 8080 HTTP 428 | 60.166.13.182 80 HTTP 429 | 122.224.5.210 443 HTTP 430 | 85.207.17.146 8080 HTTP 431 | 123.164.148.134 80 HTTP 432 | 103.28.113.134 8080 HTTP 433 | 202.29.60.220 8080 HTTP 434 | 180.250.130.186 8080 HTTP 435 | 202.181.176.3 80 HTTP 436 | 122.225.22.22 8080 HTTP 437 | 218.22.71.210 8080 HTTP 438 | 222.124.218.164 8080 HTTP 439 | 60.216.7.28 3079 HTTP 440 | 89.218.100.90 9090 HTTP 441 | 123.164.148.134 82 HTTP 442 | 103.247.37.86 8080 HTTP 443 | 202.152.40.202 8080 HTTP 444 | 195.191.250.229 80 HTTP 445 | 213.24.60.52 8080 HTTP 446 | 202.97.159.227 8080 HTTP 447 | 218.104.193.102 80 HTTP 448 | 81.213.157.71 80 HTTP 449 | 223.25.195.68 8080 HTTP 450 | 78.38.80.142 8080 HTTP 451 | 186.192.17.138 8080 HTTP 452 | 89.237.134.10 8080 HTTP 453 | 124.81.113.183 8080 HTTP 454 | 109.74.236.165 8080 HTTP 455 | 217.29.117.162 8080 HTTP 456 | 203.110.169.76 9128 HTTP 457 | 218.201.21.175 80 HTTP 458 | 82.200.236.58 9090 HTTP 459 | 72.64.146.136 8080 HTTP 460 | 81.90.224.209 8080 HTTP 461 | 189.29.118.245 8080 HTTP 462 | 103.23.139.97 8080 HTTP 463 | 125.39.238.242 8080 HTTP 464 | 109.224.5.194 80 HTTP 465 | 60.214.67.86 8080 HTTP 466 | 203.110.169.83 9128 HTTP 467 | 218.201.21.176 80 HTTP 468 | 110.74.222.117 8080 HTTP 469 | 95.129.199.70 8080 HTTP 470 | 190.79.44.28 8080 HTTP 471 | 211.151.171.207 80 HTTP 472 | 218.108.242.100 48814 HTTP 473 | 93.189.28.106 8080 HTTP 474 
| 211.144.76.58 9000 HTTP 475 | 218.201.21.177 80 HTTP 476 | 82.200.254.114 9090 HTTP 477 | 122.72.76.122 80 HTTP 478 | 103.5.49.37 8080 HTTP 479 | 201.12.116.18 8080 HTTP 480 | 109.207.61.182 8090 HTTP 481 | 150.165.75.129 8080 HTTP 482 | 111.161.30.237 80 HTTP 483 | 212.76.180.50 8080 HTTP 484 | 72.64.146.135 8080 HTTP 485 | 110.139.151.124 8080 HTTP 486 | 211.154.151.218 88 HTTP 487 | 218.201.21.178 80 HTTP 488 | 82.200.254.146 9090 HTTP 489 | 122.72.76.130 80 HTTP 490 | 109.207.61.167 8090 HTTP 491 | 202.43.188.9 8080 HTTP 492 | 110.136.245.31 8080 HTTP 493 | 151.236.194.2 8080 HTTP 494 | 113.108.92.104 80 HTTP 495 | 218.56.161.14 8118 HTTP 496 | 116.77.35.118 80 HTTP 497 | 211.167.112.14 80 HTTP 498 | 218.204.39.164 80 HTTP 499 | 89.188.224.70 8080 HTTP 500 | 190.121.154.246 8080 HTTP 501 | 124.240.187.79 81 HTTP 502 | 202.43.188.15 8080 HTTP 503 | 112.175.18.180 80 HTTP 504 | 164.77.196.75 80 HTTP 505 | 114.32.95.96 8080 HTTP 506 | 219.76.104.1 80 HTTP 507 | 111.161.30.233 80 HTTP 508 | 211.167.112.14 82 HTTP 509 | 221.130.17.37 80 HTTP 510 | 89.218.68.13 80 HTTP 511 | 200.55.206.210 8080 HTTP 512 | 124.240.187.79 82 HTTP 513 | 113.142.8.205 8080 HTTP 514 | 177.125.167.253 8080 HTTP 515 | 219.76.104.1 8080 HTTP 516 | 118.97.255.107 8080 HTTP 517 | 211.167.112.15 80 HTTP 518 | 221.130.18.45 80 HTTP 519 | 89.218.68.34 9090 HTTP 520 | 219.154.46.138 8080 HTTP 521 | 186.16.203.50 8080 HTTP 522 | 202.102.48.205 8080 HTTP 523 | 113.195.134.231 8080 HTTP 524 | 178.169.97.35 54321 HTTP 525 | 116.112.66.102 808 HTTP 526 | 219.231.164.40 45238 HTTP 527 | 176.33.138.156 8080 HTTP 528 | 211.167.112.15 82 HTTP 529 | 221.130.18.52 80 HTTP 530 | 89.218.68.130 9090 HTTP 531 | 197.251.194.126 8080 HTTP 532 | 203.93.28.166 8080 HTTP 533 | 183.110.231.240 80 HTTP 534 | 118.96.66.107 80 HTTP 535 | 219.242.50.50 8080 HTTP 536 | 211.167.112.16 80 HTTP 537 | 221.130.18.189 80 HTTP 538 | 89.218.68.132 80 HTTP 539 | 2.133.92.18 9090 HTTP 540 | 202.102.58.208 80 HTTP 541 | 218.108.242.105 41884 HTTP 542 | 116.226.46.19 8080 HTTP 543 | 183.221.250.137 80 HTTP 544 | 118.97.91.129 8080 HTTP 545 | 221.7.145.42 8080 HTTP 546 | 211.167.112.17 80 HTTP 547 | 221.130.18.253 80 HTTP 548 | 89.218.100.210 9090 HTTP 549 | 2.133.92.26 9090 HTTP 550 | 202.102.58.209 80 HTTP 551 | 27.116.21.162 8080 HTTP 552 | 118.26.231.104 5060 HTTP 553 | 183.221.250.141 80 HTTP 554 | 222.74.98.234 8080 HTTP 555 | 222.89.154.6 9000 HTTP 556 | 202.28.110.17 8080 HTTP 557 | 211.167.112.17 82 HTTP 558 | 221.176.14.72 80 HTTP 559 | 89.218.100.250 9090 HTTP 560 | 2.133.92.162 9090 HTTP 561 | 118.194.164.90 54321 HTTP 562 | 186.101.41.25 80 HTTP 563 | 118.195.65.243 80 HTTP 564 | 5.8.242.10 8080 HTTP 565 | 202.107.195.231 80 HTTP 566 | 218.28.254.77 880 HTTP 567 | 221.176.169.194 8001 HTTP 568 | 89.218.101.74 9090 HTTP 569 | 46.249.66.50 80 HTTP 570 | 119.7.221.137 81 HTTP 571 | 119.167.231.183 80 HTTP 572 | 41.78.25.69 8080 HTTP 573 | 210.212.152.5 80 HTTP 574 | 221.6.15.156 82 HTTP 575 | 221.181.192.25 80 HTTP 576 | 2.133.93.154 9090 HTTP 577 | 210.101.131.232 8080 HTTP 578 | 93.90.235.178 8080 HTTP 579 | 119.7.221.137 83 HTTP 580 | 186.101.65.115 80 HTTP 581 | 50.22.206.179 8080 HTTP 582 | 221.6.15.157 82 HTTP 583 | 93.94.180.15 8080 HTTP 584 | 2.133.94.42 9090 HTTP 585 | 218.94.1.166 82 HTTP 586 | 110.139.206.93 80 HTTP 587 | 119.62.128.172 80 HTTP 588 | 61.8.72.99 8080 HTTP 589 | 141.105.87.77 80 HTTP 590 | 41.215.3.98 80 HTTP 591 | 222.88.94.245 80 HTTP 592 | 101.255.36.30 808 HTTP 593 | 218.249.114.42 8088 HTTP 594 | 111.161.30.227 80 
HTTP 595 | 121.14.9.76 80 HTTP 596 | 187.4.63.148 80 HTTP 597 | 68.71.76.242 8082 HTTP 598 | 197.251.194.121 8080 HTTP 599 | 119.187.148.102 8000 HTTP 600 | 222.92.141.155 8090 HTTP 601 | 101.255.60.158 8080 HTTP 602 | 2.135.237.194 9090 HTTP 603 | 111.161.30.232 80 HTTP 604 | 121.17.144.132 8080 HTTP 605 | 187.4.63.149 80 HTTP 606 | 121.52.144.245 80 HTTP 607 | 74.221.215.254 8080 HTTP 608 | 122.115.62.108 8081 HTTP 609 | 221.130.23.29 80 HTTP 610 | 222.187.222.118 8080 HTTP 611 | 103.11.99.162 8080 HTTP 612 | 2.135.237.250 9090 HTTP 613 | 2.133.94.26 9090 HTTP 614 | 112.125.120.145 10080 HTTP 615 | 122.4.78.26 34808 HTTP 616 | 122.72.0.145 80 HTTP 617 | 78.80.36.194 8080 HTTP 618 | 219.76.104.18 8080 HTTP 619 | 221.179.173.170 8080 HTTP 620 | 222.188.10.1 1080 SOCKS4 621 | 103.246.145.184 8080 HTTP 622 | 2.135.242.162 9090 HTTP 623 | 2.135.238.146 9090 HTTP 624 | 122.72.0.28 80 HTTP 625 | 187.33.208.250 8080 HTTP 626 | 122.72.2.180 80 HTTP 627 | 78.131.55.82 8080 HTTP 628 | 58.67.147.196 8080 HTTP 629 | 202.116.1.149 8128 HTTP 630 | 221.195.42.195 8080 HTTP 631 | 222.217.99.156 9000 HTTP 632 | 2.135.242.170 9090 HTTP 633 | 24.158.199.54 8082 HTTP 634 | 117.121.238.17 8080 HTTP 635 | 123.125.74.212 80 HTTP 636 | 187.115.65.187 80 HTTP 637 | 122.72.2.180 8080 HTTP 638 | 82.79.92.226 8080 HTTP 639 | 202.137.22.182 8080 HTTP 640 | 223.4.173.109 808 HTTP 641 | 2.135.242.186 9090 HTTP 642 | 118.96.192.84 8080 HTTP 643 | 124.240.187.79 80 HTTP 644 | 91.202.164.29 8080 HTTP 645 | 103.10.22.231 8080 HTTP 646 | 27.54.218.248 80 HTTP 647 | 222.74.98.234 8080 HTTP 648 | 27.50.132.145 88 HTTP 649 | 2.135.242.226 9090 HTTP 650 | 190.67.169.194 8080 HTTP 651 | 124.240.187.79 83 HTTP 652 | 123.164.148.132 80 HTTP 653 | 101.255.33.249 80 HTTP 654 | 69.29.105.153 8080 HTTP 655 | 58.67.147.205 8080 HTTP 656 | 61.167.49.188 8080 HTTP 657 | 124.240.187.80 80 HTTP 658 | 190.66.22.53 8080 HTTP 659 | 109.207.61.170 8090 HTTP 660 | 180.248.156.56 8080 HTTP 661 | 114.113.221.69 54321 HTTP 662 | 124.95.142.94 80 HTTP 663 | 58.248.254.38 80 HTTP 664 | -------------------------------------------------------------------------------- /proxycrawler/proxycrawler/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/feiskyer/scrapy-examples/3f72fe87d9272859f7bce31d0394ca078a6ab4a3/proxycrawler/proxycrawler/__init__.py -------------------------------------------------------------------------------- /proxycrawler/proxycrawler/items.py: -------------------------------------------------------------------------------- 1 | # Define here the models for your scraped items 2 | # 3 | # See documentation in: 4 | # http://doc.scrapy.org/topics/items.html 5 | 6 | from scrapy.item import Item, Field 7 | 8 | class ProxycrawlerItem(Item): 9 | address = Field() 10 | port = Field() 11 | protocol = Field() 12 | location = Field() 13 | 14 | type = Field() # 0: anonymity #1 nonanonymity 15 | delay = Field() # in second 16 | timestamp = Field() 17 | -------------------------------------------------------------------------------- /proxycrawler/proxycrawler/pipelines.py: -------------------------------------------------------------------------------- 1 | # Define your item pipelines here 2 | # 3 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 4 | # See: http://doc.scrapy.org/topics/item-pipeline.html 5 | # 6 | # Reference: http://www.cnblogs.com/igloo1986/archive/2012/08/25/2655597.html 7 | 8 | import urllib 9 | from scrapy.exceptions import DropItem 
10 | import socket 11 | 12 | class ProxycrawlerPipeline(object): 13 | def process_item(self, item, spider): 14 | protocol = item['protocol'] 15 | address = item['address'] 16 | port = item['port'] 17 | proxies = {protocol: '%s:%s' % (address, port)} 18 | 19 | # Check the proxy by fetching a small page through it 20 | try: 21 | socket.setdefaulttimeout(3) 22 | data = urllib.urlopen('http://ifconfig.me/ip', proxies=proxies).read() 23 | except: 24 | raise DropItem("Proxy %s:%s failed the connectivity check" % (address, port)) 25 | 26 | if data: 27 | line = '%s\t%s\t%s\n' % (address, port, protocol) 28 | open('proxies.txt', 'a+').write(line) 29 | return item 30 | else: 31 | raise DropItem("Invalid proxy %s:%s" % (address, port)) 32 | -------------------------------------------------------------------------------- /proxycrawler/proxycrawler/settings.py: -------------------------------------------------------------------------------- 1 | # Scrapy settings for proxycrawler project 2 | # 3 | # For simplicity, this file contains only the most important settings by 4 | # default. All the other settings are documented here: 5 | # 6 | # http://doc.scrapy.org/topics/settings.html 7 | # 8 | 9 | BOT_NAME = 'proxycrawler' 10 | 11 | SPIDER_MODULES = ['proxycrawler.spiders'] 12 | NEWSPIDER_MODULE = 'proxycrawler.spiders' 13 | ITEM_PIPELINES = ['proxycrawler.pipelines.ProxycrawlerPipeline'] 14 | 15 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 16 | # USER_AGENT = 'proxycrawler (+http://www.yourdomain.com)' 17 | -------------------------------------------------------------------------------- /proxycrawler/proxycrawler/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders.
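# As an illustrative sketch only (ExampleSpider and its URL are placeholders, not
# part of this project), a minimal spider module dropped into this package would
# look roughly like:
#
#     from scrapy.spider import BaseSpider
#
#     class ExampleSpider(BaseSpider):
#         name = 'example'
#         start_urls = ['http://www.example.com']
#
#         def parse(self, response):
#             pass  # extract items or follow links here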
5 | -------------------------------------------------------------------------------- /proxycrawler/proxycrawler/spiders/proxy.py: -------------------------------------------------------------------------------- 1 | # See http://www.cnblogs.com/igloo1986/archive/2012/08/25/2655597.html 2 | from scrapy.selector import HtmlXPathSelector 3 | from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor 4 | from scrapy.contrib.spiders import CrawlSpider, Rule 5 | from proxycrawler.items import ProxycrawlerItem 6 | import re 7 | 8 | class ProxySpider(CrawlSpider): 9 | name = 'proxy' 10 | allowed_domains = ['www.cnproxy.com'] 11 | indexes = [1,2,3,4,5,6,7,8,9,10] 12 | start_urls = [] 13 | for i in indexes: 14 | url = 'http://www.cnproxy.com/proxy%s.html' % i 15 | start_urls.append(url) 16 | start_urls.append('http://www.cnproxy.com/proxyedu1.html') 17 | start_urls.append('http://www.cnproxy.com/proxyedu2.html') 18 | 19 | def parse(self, response): 20 | hxs = HtmlXPathSelector(response) 21 | addresses = hxs.select('//tr[position()>1]/td[position()=1]').re('\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}') 22 | protocols = hxs.select('//tr[position()>1]/td[position()=2]').re('(.*)<\/td>') 23 | locations = hxs.select('//tr[position()>1]/td[position()=4]').re('(.*)<\/td>') 24 | ports_re = re.compile('write\(":"(.*)\)') # ports are hidden behind JavaScript like document.write(":"+z+m) 25 | raw_ports = ports_re.findall(response.body) 26 | port_map = {'z':'3','m':'4','k':'2','l':'9','d':'0','b':'5','i':'7','w':'6','r':'8','c':'1','+':''} # cnproxy's letter-to-digit code; '+' is the JS concatenation operator, so it is dropped 27 | ports = [] 28 | for port in raw_ports: 29 | tmp = port 30 | for key in port_map: 31 | tmp = tmp.replace(key, port_map[key]) 32 | ports.append(tmp) # e.g. 'z+m' decodes to '34' 33 | items = [] 34 | for i in range(len(addresses)): 35 | item = ProxycrawlerItem() 36 | item['address'] = addresses[i] 37 | item['protocol'] = protocols[i] 38 | item['location'] = locations[i] 39 | item['port'] = ports[i] 40 | items.append(item) 41 | return items 42 | -------------------------------------------------------------------------------- /proxycrawler/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # http://doc.scrapy.org/topics/scrapyd.html 5 | 6 | [settings] 7 | default = proxycrawler.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = proxycrawler 12 | -------------------------------------------------------------------------------- /scrapy-ws.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | Example script to control a Scrapy server using its JSON-RPC web service. 4 | 5 | It only provides a reduced functionality as its main purpose is to illustrate 6 | how to write a web service client. Feel free to improve or write your own. 7 | 8 | Also, keep in mind that the JSON-RPC API is not stable. The recommended way for 9 | controlling a Scrapy server is through the execution queue (see the "queue" 10 | command).
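Typical invocations, as a sketch: these assume the Scrapy JSON-RPC web service
is enabled and listening on the default localhost:6080 (override with -H/-P),
and that 'somespider' is the name of a currently running spider:

    scrapy-ws.py list-available
    scrapy-ws.py list-running
    scrapy-ws.py get-spider-stats somespider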
11 | 12 | """ 13 | 14 | import sys, optparse, urllib, json 15 | from urlparse import urljoin 16 | 17 | from scrapy.utils.jsonrpc import jsonrpc_client_call, JsonRpcError 18 | 19 | def get_commands(): 20 | return { 21 | 'help': cmd_help, 22 | 'stop': cmd_stop, 23 | 'list-available': cmd_list_available, 24 | 'list-running': cmd_list_running, 25 | 'list-resources': cmd_list_resources, 26 | 'get-global-stats': cmd_get_global_stats, 27 | 'get-spider-stats': cmd_get_spider_stats, 28 | } 29 | 30 | def cmd_help(args, opts): 31 | """help - list available commands""" 32 | print "Available commands:" 33 | for _, func in sorted(get_commands().items()): 34 | print " ", func.__doc__ 35 | 36 | def cmd_stop(args, opts): 37 | """stop - stop a running spider""" 38 | jsonrpc_call(opts, 'crawler/engine', 'close_spider', args[0]) 39 | 40 | def cmd_list_running(args, opts): 41 | """list-running - list running spiders""" 42 | for x in json_get(opts, 'crawler/engine/open_spiders'): 43 | print x 44 | 45 | def cmd_list_available(args, opts): 46 | """list-available - list name of available spiders""" 47 | for x in jsonrpc_call(opts, 'crawler/spiders', 'list'): 48 | print x 49 | 50 | def cmd_list_resources(args, opts): 51 | """list-resources - list available web service resources""" 52 | for x in json_get(opts, '')['resources']: 53 | print x 54 | 55 | def cmd_get_spider_stats(args, opts): 56 | """get-spider-stats - get stats of a running spider""" 57 | stats = jsonrpc_call(opts, 'stats', 'get_stats', args[0]) 58 | for name, value in stats.items(): 59 | print "%-40s %s" % (name, value) 60 | 61 | def cmd_get_global_stats(args, opts): 62 | """get-global-stats - get global stats""" 63 | stats = jsonrpc_call(opts, 'stats', 'get_stats') 64 | for name, value in stats.items(): 65 | print "%-40s %s" % (name, value) 66 | 67 | def get_wsurl(opts, path): 68 | return urljoin("http://%s:%s/"% (opts.host, opts.port), path) 69 | 70 | def jsonrpc_call(opts, path, method, *args, **kwargs): 71 | url = get_wsurl(opts, path) 72 | return jsonrpc_client_call(url, method, *args, **kwargs) 73 | 74 | def json_get(opts, path): 75 | url = get_wsurl(opts, path) 76 | return json.loads(urllib.urlopen(url).read()) 77 | 78 | def parse_opts(): 79 | usage = "%prog [options] [arg] ..." 80 | description = "Scrapy web service control script. Use '%prog help' " \ 81 | "to see the list of available commands." 
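# For reference, 'scrapy-ws.py help' simply prints the docstring of every command
# registered in get_commands() above, one per line, e.g. "help - list available
# commands" and "stop - stop a running spider".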
82 | op = optparse.OptionParser(usage=usage, description=description) 83 | op.add_option("-H", dest="host", default="localhost", \ 84 | help="Scrapy host to connect to") 85 | op.add_option("-P", dest="port", type="int", default=6080, \ 86 | help="Scrapy port to connect to") 87 | opts, args = op.parse_args() 88 | if not args: 89 | op.print_help() 90 | sys.exit(2) 91 | cmdname, cmdargs = args[0], args[1:] 92 | commands = get_commands() 93 | if cmdname not in commands: 94 | sys.stderr.write("Unknown command: %s\n\n" % cmdname) 95 | cmd_help(None, None) 96 | sys.exit(1) 97 | return commands[cmdname], cmdargs, opts 98 | 99 | def main(): 100 | cmd, args, opts = parse_opts() 101 | try: 102 | cmd(args, opts) 103 | except IndexError: 104 | print cmd.__doc__ 105 | except JsonRpcError, e: 106 | print str(e) 107 | if e.data: 108 | print "Server Traceback below:" 109 | print e.data 110 | 111 | 112 | if __name__ == '__main__': 113 | main() 114 | -------------------------------------------------------------------------------- /server/code.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | #coding=utf8 3 | import os, sys 4 | import web 5 | import view 6 | import config 7 | import db 8 | from view import render 9 | 10 | urls = ( 11 | '/', 'index', 12 | '/page/(\d+)', 'index' 13 | ) 14 | 15 | class index: 16 | def GET(self, page=1): 17 | page = int(page) 18 | limit = config.perpage 19 | offset = (page - 1)*limit 20 | counting = db.counting() 21 | pages = counting / limit # Python 2 integer division, e.g. 51 items at 25 per page gives 2 22 | if counting % limit > 0: 23 | pages += 1 # round up for a partial last page (51 items -> 3 pages) 24 | if page > pages: 25 | raise web.seeother('/') 26 | else: 27 | return render.base(view.listing(offset=offset,limit=limit), 28 | pages=pages, curpage=page) 29 | 30 | if __name__ == "__main__": 31 | if len(sys.argv)==1: 32 | port = os.environ.get("PORT", "8081") 33 | sys.argv.append(port) 34 | app = web.application(urls, globals()) 35 | app.internalerror = web.debugerror 36 | app.run() 37 | -------------------------------------------------------------------------------- /server/config.py: -------------------------------------------------------------------------------- 1 | #coding=utf8 2 | import web 3 | DB = web.database(dbn='mysql', db='news', user='root', pw='feisky') 4 | cache = False 5 | perpage = 25 6 | -------------------------------------------------------------------------------- /server/db.py: -------------------------------------------------------------------------------- 1 | #coding=utf8 2 | import config 3 | 4 | def listing(**k): 5 | return config.DB.select('news', order='created desc', **k) 6 | 7 | def counting(): 8 | return config.DB.query('select count(*) as count from news')[0].count 9 | -------------------------------------------------------------------------------- /server/sql/tables.sql: -------------------------------------------------------------------------------- 1 | CREATE TABLE items ( 2 | id serial primary key, 3 | author_id int references users, 4 | body text, 5 | created timestamp default current_timestamp 6 | ); 7 | -------------------------------------------------------------------------------- /server/static/favicon.ico: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/feiskyer/scrapy-examples/3f72fe87d9272859f7bce31d0394ca078a6ab4a3/server/static/favicon.ico -------------------------------------------------------------------------------- /server/templates/base.html:
-------------------------------------------------------------------------------- 1 | $def with (page, title=None, pages=1, curpage=1) 2 | 3 | 4 | $if title: : $title\ 5 | $else: Tech News 6 | 7 | 8 | 9 | 10 | Tech News Home 11 | $:page 12 | 13 | $if curpage!=1: 14 | Prev 15 | 16 | $if curpage+1 in range(1,pages+1): 17 | Next 18 | 19 | 20 | -------------------------------------------------------------------------------- /server/templates/item.html: -------------------------------------------------------------------------------- 1 | $def with (item) 2 | 3 | 4 | $item.title 5 | ($item.site) 6 | $datestr(item.created) 7 | 8 | 9 | 10 | $:item.abstract 11 |
12 | -------------------------------------------------------------------------------- /server/templates/listing.html: -------------------------------------------------------------------------------- 1 | $def with (items) 2 | 3 | $for item in items: 4 | $:render.item(item) -------------------------------------------------------------------------------- /server/view.py: -------------------------------------------------------------------------------- 1 | #coding=utf8 2 | import web 3 | import db 4 | import config 5 | import datetime 6 | 7 | def datestr(x): 8 | """ 9 | MySQL's creation DDL can't easily be set to UTC, so adjust the datestr 10 | function to local time, which we assume matches the database server's timezone. 11 | """ 12 | return web.datestr(x, datetime.datetime.now()) 13 | 14 | t_globals = dict( 15 | datestr=datestr, 16 | ) 17 | render = web.template.render('templates/', cache=config.cache, 18 | globals=t_globals) 19 | render._keywords['globals']['render'] = render 20 | 21 | def listing(**k): 22 | l = db.listing(**k) 23 | return render.listing(l) 24 | -------------------------------------------------------------------------------- /server/webkit2png: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # 3 | # webkit2png.py 4 | # 5 | # Creates screenshots of webpages using QtWebKit. 6 | # 7 | # Copyright (c) 2008 Roland Tapken 8 | # 9 | # This program is free software; you can redistribute it and/or 10 | # modify it under the terms of the GNU General Public License 11 | # as published by the Free Software Foundation; either version 2 12 | # of the License, or (at your option) any later version. 13 | # 14 | # This program is distributed in the hope that it will be useful, 15 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 16 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 17 | # GNU General Public License for more details. 18 | # 19 | # You should have received a copy of the GNU General Public License 20 | # along with this program; if not, write to the Free Software 21 | # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA 22 | # 23 | # Nice ideas "todo": 24 | # - Add QTcpSocket support to create a "screenshot daemon" that 25 | # can handle multiple requests at the same time. 26 | 27 | from webkit2png import WebkitRenderer 28 | 29 | import sys 30 | import signal 31 | import os 32 | import urlparse 33 | import logging 34 | from optparse import OptionParser 35 | 36 | from PyQt4.QtCore import * 37 | from PyQt4.QtGui import * 38 | from PyQt4.QtWebKit import * 39 | from PyQt4.QtNetwork import * 40 | 41 | VERSION = "20091224" 42 | LOG_FILENAME = 'webkit2png.log' 43 | logger = logging.getLogger('webkit2png') 44 | 45 | def init_qtgui(display=None, style=None, qtargs=None): 46 | """Initiates the QApplication environment using the given args.""" 47 | if QApplication.instance(): 48 | logger.debug("QApplication has already been instantiated.
\ 49 | Ignoring given arguments and returning existing QApplication.") 50 | return QApplication.instance() 51 | 52 | qtargs2 = [sys.argv[0]] 53 | 54 | if display: 55 | qtargs2.append('-display') 56 | qtargs2.append(display) 57 | # Also export DISPLAY var as this may be used 58 | # by flash plugin 59 | os.environ["DISPLAY"] = display 60 | 61 | if style: 62 | qtargs2.append('-style') 63 | qtargs2.append(style) 64 | 65 | qtargs2.extend(qtargs or []) 66 | 67 | return QApplication(qtargs2) 68 | 69 | 70 | if __name__ == '__main__': 71 | # This code will be executed if this module is run 'as-is'. 72 | 73 | # Enable HTTP proxy 74 | if 'http_proxy' in os.environ: 75 | proxy_url = urlparse.urlparse(os.environ.get('http_proxy')) 76 | proxy = QNetworkProxy(QNetworkProxy.HttpProxy, proxy_url.hostname, proxy_url.port) 77 | QNetworkProxy.setApplicationProxy(proxy) 78 | 79 | # Parse command line arguments. 80 | # Syntax: 81 | # $0 [--xvfb|--display=DISPLAY] [--debug] [--output=FILENAME] 82 | 83 | description = "Creates a screenshot of a website using QtWebKit. " \ 84 | + "This program comes with ABSOLUTELY NO WARRANTY. " \ 85 | + "This is free software, and you are welcome to redistribute " \ 86 | + "it under the terms of the GNU General Public License v2." 87 | 88 | parser = OptionParser(usage="usage: %prog [options] ", 89 | version="%prog " + VERSION + ", Copyright (c) Roland Tapken", 90 | description=description, add_help_option=True) 91 | parser.add_option("-x", "--xvfb", nargs=2, type="int", dest="xvfb", 92 | help="Start an 'xvfb' instance with the given desktop size.", metavar="WIDTH HEIGHT") 93 | parser.add_option("-g", "--geometry", dest="geometry", nargs=2, default=(0, 0), type="int", 94 | help="Geometry of the virtual browser window (0 means 'autodetect') [default: %default].", metavar="WIDTH HEIGHT") 95 | parser.add_option("-o", "--output", dest="output", 96 | help="Write output to FILE instead of STDOUT.", metavar="FILE") 97 | parser.add_option("-f", "--format", dest="format", default="png", 98 | help="Output image format [default: %default]", metavar="FORMAT") 99 | parser.add_option("--scale", dest="scale", nargs=2, type="int", 100 | help="Scale the image to this size", metavar="WIDTH HEIGHT") 101 | parser.add_option("--aspect-ratio", dest="ratio", type="choice", choices=["ignore", "keep", "expand", "crop"], 102 | help="One of 'ignore', 'keep', 'crop' or 'expand' [default: %default]") 103 | parser.add_option("-F", "--feature", dest="features", action="append", type="choice", 104 | choices=["javascript", "plugins"], 105 | help="Enable additional Webkit features ('javascript', 'plugins')", metavar="FEATURE") 106 | parser.add_option("-w", "--wait", dest="wait", default=0, type="int", 107 | help="Time to wait after loading before the screenshot is taken [default: %default]", metavar="SECONDS") 108 | parser.add_option("-t", "--timeout", dest="timeout", default=0, type="int", 109 | help="Time before the request will be canceled [default: %default]", metavar="SECONDS") 110 | parser.add_option("-W", "--window", dest="window", action="store_true", 111 | help="Grab whole window instead of frame (may be required for plugins)", default=False) 112 | parser.add_option("-T", "--transparent", dest="transparent", action="store_true", 113 | help="Render output on a transparent background (Be sure to have a transparent background defined in the html)", default=False) 114 | parser.add_option("", "--style", dest="style", 115 | help="Change the Qt look and feel to STYLE (e.g.
'windows').", metavar="STYLE") 116 | parser.add_option("", "--encoded-url", dest="encoded_url", action="store_true", 117 | help="Treat URL as url-encoded", metavar="ENCODED_URL", default=False) 118 | parser.add_option("-d", "--display", dest="display", 119 | help="Connect to X server at DISPLAY.", metavar="DISPLAY") 120 | parser.add_option("--debug", action="store_true", dest="debug", 121 | help="Show debugging information.", default=False) 122 | parser.add_option("--log", action="store", dest="logfile", default=LOG_FILENAME, 123 | help="Select the log output file",) 124 | 125 | # Parse command line arguments and validate them (as far as we can) 126 | (options,args) = parser.parse_args() 127 | if len(args) != 1: 128 | parser.error("incorrect number of arguments") 129 | if options.display and options.xvfb: 130 | parser.error("options -x and -d are mutually exclusive") 131 | options.url = args[0] 132 | 133 | logging.basicConfig(filename=options.logfile, level=logging.WARN) 134 | 135 | # Enable output of debugging information 136 | if options.debug: 137 | logger.setLevel(logging.DEBUG) 138 | 139 | if options.xvfb: 140 | # Start 'xvfb' instance by replacing the current process 141 | server_num = int(os.getpid() + 1e6) 142 | newArgs = ["xvfb-run", "--auto-servernum", "--server-num", str(server_num), "--server-args=-screen 0, %dx%dx24" % options.xvfb, sys.argv[0]] 143 | skipArgs = 0 144 | for i in range(1, len(sys.argv)): 145 | if skipArgs > 0: 146 | skipArgs -= 1 147 | elif sys.argv[i] in ["-x", "--xvfb"]: 148 | skipArgs = 2 # following: width and height 149 | else: 150 | newArgs.append(sys.argv[i]) 151 | logger.debug("Executing %s" % " ".join(newArgs)) 152 | try: 153 | os.execvp(newArgs[0], newArgs) # argv[0] must be included in the argument list 154 | except OSError: 155 | logger.error("Unable to find '%s'" % newArgs[0]) 156 | print >> sys.stderr, "Error - Unable to find '%s' for -x/--xvfb option" % newArgs[0] 157 | sys.exit(1) 158 | 159 | # Prepare output (defaults to STDOUT) 160 | if options.output is None: 161 | options.output = sys.stdout 162 | else: 163 | options.output = open(options.output, "wb") # binary mode: the output is image data 164 | 165 | logger.debug("Version %s, Python %s, Qt %s", VERSION, sys.version, qVersion()) 166 | 167 | # Technically, this is a QtGui application, because QWebPage requires it 168 | # to be. But because we will have no user interaction, and rendering can 169 | # not start before 'app.exec_()' is called, we have to trigger our "main" 170 | # by a timer event. 171 | def __main_qt(): 172 | # Render the page.
173 | # If this method times out or loading fails, a 174 | # RuntimeError is raised 175 | try: 176 | # Initialize WebkitRenderer object 177 | renderer = WebkitRenderer() 178 | renderer.logger = logger 179 | renderer.width = options.geometry[0] 180 | renderer.height = options.geometry[1] 181 | renderer.timeout = options.timeout 182 | renderer.wait = options.wait 183 | renderer.format = options.format 184 | renderer.grabWholeWindow = options.window 185 | renderer.renderTransparentBackground = options.transparent 186 | renderer.encodedUrl = options.encoded_url 187 | 188 | if options.scale: 189 | renderer.scaleRatio = options.ratio 190 | renderer.scaleToWidth = options.scale[0] 191 | renderer.scaleToHeight = options.scale[1] 192 | 193 | if options.features: 194 | if "javascript" in options.features: 195 | renderer.qWebSettings[QWebSettings.JavascriptEnabled] = True 196 | if "plugins" in options.features: 197 | renderer.qWebSettings[QWebSettings.PluginsEnabled] = True 198 | 199 | renderer.render_to_file(url=options.url, file_object=options.output) 200 | options.output.close() 201 | QApplication.exit(0) 202 | except RuntimeError, e: 203 | logger.error("main: %s" % e) 204 | print >> sys.stderr, e 205 | QApplication.exit(1) 206 | 207 | # Initialize Qt-Application, but make this script 208 | # abortable via CTRL-C 209 | app = init_qtgui(display=options.display, style=options.style) 210 | signal.signal(signal.SIGINT, signal.SIG_DFL) 211 | 212 | QTimer.singleShot(0, __main_qt) 213 | sys.exit(app.exec_()) -------------------------------------------------------------------------------- /server/webkit2png.py: -------------------------------------------------------------------------------- 1 | # 2 | # webkit2png.py 3 | # 4 | # Creates screenshots of webpages using QtWebKit. 5 | # 6 | # Copyright (c) 2008 Roland Tapken 7 | # 8 | # This program is free software; you can redistribute it and/or 9 | # modify it under the terms of the GNU General Public License 10 | # as published by the Free Software Foundation; either version 2 11 | # of the License, or (at your option) any later version. 12 | # 13 | # This program is distributed in the hope that it will be useful, 14 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 15 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 16 | # GNU General Public License for more details. 17 | # 18 | # You should have received a copy of the GNU General Public License 19 | # along with this program; if not, write to the Free Software 20 | # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA 21 | # 22 | # Nice ideas "todo": 23 | # - Add QTcpSocket support to create a "screenshot daemon" that 24 | # can handle multiple requests at the same time. 25 | 26 | import time 27 | 28 | from PyQt4.QtCore import * 29 | from PyQt4.QtGui import * 30 | from PyQt4.QtWebKit import * 31 | from PyQt4.QtNetwork import * 32 | 33 | # Class for Website-Rendering. Uses QWebPage, which 34 | # requires a running QtGui to work. 35 | class WebkitRenderer(QObject): 36 | """A class that helps to create 'screenshots' of webpages using 37 | Qt's QWebKit. Requires the PyQt4 library. 38 | 39 | Use "render()" to get a 'QImage' object, render_to_bytes() to get the 40 | resulting image as 'str' object or render_to_file() to write the image 41 | directly into a 'file' resource. 42 | 43 | These methods have to be called from within Qt's main (GUI) thread.
44 | An example of how to use this is the __qt_main() method at the end 45 | of the library's source file. More generic examples: 46 | 47 | def qt_main(): 48 | while go_on(): 49 | do_something_meaningful() 50 | while QApplication.hasPendingEvents(): 51 | QApplication.processEvents() 52 | QApplication.quit() 53 | 54 | app = init_qtgui() 55 | QTimer.singleShot(0, qt_main) 56 | sys.exit(app.exec_()) 57 | 58 | Or let Qt handle event processing using a QTimer instance: 59 | 60 | def qt_main_loop(): 61 | if not go_on(): 62 | QApplication.quit() 63 | return 64 | do_something_meaningful() 65 | 66 | app = init_qtgui() 67 | main_timer = QTimer() 68 | QObject.connect(main_timer, QtCore.SIGNAL("timeout()"), qt_main_loop) 69 | sys.exit(app.exec_()) 70 | 71 | Available properties: 72 | width -- The width of the "browser" window. 0 means autodetect (default). 73 | height -- The height of the window. 0 means autodetect (default). 74 | timeout -- Seconds after which the request is aborted (default: 0) 75 | wait -- Seconds to wait after loading has finished (default: 0) 76 | scaleToWidth -- The resulting image is scaled to this width. 77 | scaleToHeight -- The resulting image is scaled to this height. 78 | scaleRatio -- The image is scaled using this method. Possible values are: 79 | keep 80 | expand 81 | crop 82 | ignore 83 | grabWholeWindow -- If this is True a screenshot of the whole window is taken. Otherwise only the current frame is rendered. This is required for plugins to be visible, but it is possible that another window overlays the current one while the screenshot is taken. To reduce this possibility, the window is activated just before it is rendered if this property is set to True (default: False). 84 | qWebSettings -- Settings that should be assigned to the created QWebPage instance. See http://doc.trolltech.com/4.6/qwebsettings.html for possible keys. Defaults: 85 | JavascriptEnabled: False 86 | PluginsEnabled: False 87 | PrivateBrowsingEnabled: True 88 | JavascriptCanOpenWindows: False 89 | """ 90 | 91 | def __init__(self,**kwargs): 92 | """Sets default values for the properties.""" 93 | 94 | if not QApplication.instance(): 95 | raise RuntimeError(self.__class__.__name__ + " requires a running QApplication instance") 96 | QObject.__init__(self) 97 | 98 | # Initialize default properties 99 | self.width = kwargs.get('width', 0) 100 | self.height = kwargs.get('height', 0) 101 | self.timeout = kwargs.get('timeout', 0) 102 | self.wait = kwargs.get('wait', 0) 103 | self.scaleToWidth = kwargs.get('scaleToWidth', 0) 104 | self.scaleToHeight = kwargs.get('scaleToHeight', 0) 105 | self.scaleRatio = kwargs.get('scaleRatio', 'keep') 106 | self.format = kwargs.get('format', 'png') 107 | self.logger = kwargs.get('logger', None) 108 | # Set this to true if you want to capture flash. 109 | # Note that your desktop must be large enough for 110 | # fitting the whole window.
111 | self.grabWholeWindow = kwargs.get('grabWholeWindow', False) 112 | self.renderTransparentBackground = kwargs.get('renderTransparentBackground', False) 113 | self.ignoreAlert = kwargs.get('ignoreAlert', True) 114 | self.ignoreConfirm = kwargs.get('ignoreConfirm', True) 115 | self.ignorePrompt = kwargs.get('ignorePrompt', True) 116 | self.interruptJavaScript = kwargs.get('interruptJavaScript', True) 117 | self.encodedUrl = kwargs.get('encodedUrl', False) 118 | 119 | # Set some default options for QWebPage 120 | self.qWebSettings = { 121 | QWebSettings.JavascriptEnabled : False, 122 | QWebSettings.PluginsEnabled : False, 123 | QWebSettings.PrivateBrowsingEnabled : True, 124 | QWebSettings.JavascriptCanOpenWindows : False 125 | } 126 | 127 | 128 | def render(self, url): 129 | """Renders the given URL into a QImage object""" 130 | # We have to use this helper object because 131 | # QApplication.processEvents may be called, causing 132 | # this method to get called while it has not returned yet. 133 | helper = _WebkitRendererHelper(self) 134 | helper._window.resize( self.width, self.height ) 135 | image = helper.render(url) 136 | 137 | # Bind helper instance to this image to prevent the 138 | # object from being cleaned up (and with it the QWebPage, etc) 139 | # before the data has been used. 140 | image.helper = helper 141 | 142 | return image 143 | 144 | def render_to_file(self, url, file_object): 145 | """Renders the image into a File resource. 146 | Returns the size of the data that has been written. 147 | """ 148 | format = self.format # this may not be constant due to processEvents() 149 | image = self.render(url) 150 | qBuffer = QBuffer() 151 | image.save(qBuffer, format) 152 | file_object.write(qBuffer.buffer().data()) 153 | return qBuffer.size() 154 | 155 | def render_to_bytes(self, url): 156 | """Renders the image into an object of type 'str'""" 157 | format = self.format # this may not be constant due to processEvents() 158 | image = self.render(url) 159 | qBuffer = QBuffer() 160 | image.save(qBuffer, format) 161 | return qBuffer.buffer().data() 162 | 163 | class _WebkitRendererHelper(QObject): 164 | """This helper class is doing the real work. It is required to 165 | allow WebkitRenderer.render() to be called "asynchronously" 166 | (but always from Qt's GUI thread). 167 | """ 168 | 169 | def __init__(self, parent): 170 | """Copies the properties from the parent (WebkitRenderer) object, 171 | creates the required instances of QWebPage, QWebView and QMainWindow 172 | and registers some Slots. 
173 | """ 174 | QObject.__init__(self) 175 | 176 | # Copy properties from parent 177 | for key,value in parent.__dict__.items(): 178 | setattr(self,key,value) 179 | 180 | # Create and connect required PyQt4 objects 181 | self._page = CustomWebPage(logger=self.logger, ignore_alert=self.ignoreAlert, 182 | ignore_confirm=self.ignoreConfirm, ignore_prompt=self.ignorePrompt, 183 | interrupt_js=self.interruptJavaScript) 184 | self._view = QWebView() 185 | self._view.setPage(self._page) 186 | self._window = QMainWindow() 187 | self._window.setCentralWidget(self._view) 188 | 189 | # Import QWebSettings 190 | for key, value in self.qWebSettings.iteritems(): 191 | self._page.settings().setAttribute(key, value) 192 | 193 | # Connect required event listeners 194 | self.connect(self._page, SIGNAL("loadFinished(bool)"), self._on_load_finished) 195 | self.connect(self._page, SIGNAL("loadStarted()"), self._on_load_started) 196 | self.connect(self._page.networkAccessManager(), SIGNAL("sslErrors(QNetworkReply *,const QList&)"), self._on_ssl_errors) 197 | self.connect(self._page.networkAccessManager(), SIGNAL("finished(QNetworkReply *)"), self._on_each_reply) 198 | 199 | # The way we will use this, it seems to be unesseccary to have Scrollbars enabled 200 | self._page.mainFrame().setScrollBarPolicy(Qt.Horizontal, Qt.ScrollBarAlwaysOff) 201 | self._page.mainFrame().setScrollBarPolicy(Qt.Vertical, Qt.ScrollBarAlwaysOff) 202 | self._page.settings().setUserStyleSheetUrl(QUrl("data:text/css,html,body{overflow-y:hidden !important;}")) 203 | 204 | # Show this widget 205 | self._window.show() 206 | 207 | def __del__(self): 208 | """Clean up Qt4 objects. """ 209 | self._window.close() 210 | del self._window 211 | del self._view 212 | del self._page 213 | 214 | def render(self, url): 215 | """The real worker. Loads the page (_load_page) and awaits 216 | the end of the given 'delay'. While it is waiting outstanding 217 | QApplication events are processed. 218 | After the given delay, the Window or Widget (depends 219 | on the value of 'grabWholeWindow' is drawn into a QPixmap 220 | and postprocessed (_post_process_image). 221 | """ 222 | self._load_page(url, self.width, self.height, self.timeout) 223 | # Wait for end of timer. In this time, process 224 | # other outstanding Qt events. 225 | if self.wait > 0: 226 | if self.logger: self.logger.debug("Waiting %d seconds " % self.wait) 227 | waitToTime = time.time() + self.wait 228 | while time.time() < waitToTime: 229 | if QApplication.hasPendingEvents(): 230 | QApplication.processEvents() 231 | 232 | if self.renderTransparentBackground: 233 | # Another possible drawing solution 234 | image = QImage(self._page.viewportSize(), QImage.Format_ARGB32) 235 | image.fill(QColor(255,0,0,0).rgba()) 236 | 237 | # http://ariya.blogspot.com/2009/04/transparent-qwebview-and-qwebpage.html 238 | palette = self._view.palette() 239 | palette.setBrush(QPalette.Base, Qt.transparent) 240 | self._page.setPalette(palette) 241 | self._view.setAttribute(Qt.WA_OpaquePaintEvent, False) 242 | 243 | painter = QPainter(image) 244 | painter.setBackgroundMode(Qt.TransparentMode) 245 | self._page.mainFrame().render(painter) 246 | painter.end() 247 | else: 248 | if self.grabWholeWindow: 249 | # Note that this does not fully ensure that the 250 | # window still has the focus when the screen is 251 | # grabbed. This might result in a race condition. 
252 |                 self._view.activateWindow()
253 |                 image = QPixmap.grabWindow(self._window.winId())
254 |             else:
255 |                 image = QPixmap.grabWidget(self._window)
256 | 
257 |         return self._post_process_image(image)
258 | 
259 |     def _load_page(self, url, width, height, timeout):
260 |         """
261 |         This method implements the logic for retrieving and displaying
262 |         the requested page.
263 |         """
264 | 
265 |         # This is an event-based application. So we have to wait until
266 |         # "loadFinished(bool)" is raised.
267 |         cancelAt = time.time() + timeout
268 |         self.__loading = True
269 |         self.__loading_result = False # Default, updated by _on_load_finished
270 |         if self.encodedUrl:
271 |             self._page.mainFrame().load(QUrl.fromEncoded(url))
272 |         else:
273 |             self._page.mainFrame().load(QUrl(url))
274 |         while self.__loading:
275 |             if timeout > 0 and time.time() >= cancelAt:
276 |                 raise RuntimeError("Request timed out on %s" % url)
277 |             while QApplication.hasPendingEvents() and self.__loading:
278 |                 QCoreApplication.processEvents()
279 | 
280 |         if self.logger: self.logger.debug("Processing result")
281 | 
282 |         if not self.__loading_result:
283 |             if self.logger: self.logger.warning("Failed to load %s" % url)
284 | 
285 |         # Set initial viewport (the size of the "window")
286 |         size = self._page.mainFrame().contentsSize()
287 |         if self.logger: self.logger.debug("contentsSize: %s", size)
288 |         if width > 0:
289 |             size.setWidth(width)
290 |         if height > 0:
291 |             size.setHeight(height)
292 | 
293 |         self._window.resize(size)
294 | 
295 |     def _post_process_image(self, qImage):
296 |         """If 'scaleToWidth' or 'scaleToHeight' are set to a value
297 |         greater than zero this method will scale the image
298 |         using the method defined in 'scaleRatio'.
299 |         """
300 |         if self.scaleToWidth > 0 or self.scaleToHeight > 0:
301 |             # Scale this image
302 |             if self.scaleRatio == 'keep':
303 |                 ratio = Qt.KeepAspectRatio
304 |             elif self.scaleRatio in ['expand', 'crop']:
305 |                 ratio = Qt.KeepAspectRatioByExpanding
306 |             else: # 'ignore'
307 |                 ratio = Qt.IgnoreAspectRatio
308 |             qImage = qImage.scaled(self.scaleToWidth, self.scaleToHeight, ratio)
309 |             if self.scaleRatio == 'crop':
310 |                 qImage = qImage.copy(0, 0, self.scaleToWidth, self.scaleToHeight)
311 |         return qImage
312 | 
313 |     def _on_each_reply(self, reply):
314 |         """Logs each requested URI"""
315 |         if self.logger: self.logger.debug("Received %s" % reply.url().toString())
316 | 
317 |     # Eventhandler for "loadStarted()" signal
318 |     def _on_load_started(self):
319 |         """Slot that sets the '__loading' property to true."""
320 |         if self.logger: self.logger.debug("loading started")
321 |         self.__loading = True
322 | 
323 |     # Eventhandler for "loadFinished(bool)" signal
324 |     def _on_load_finished(self, result):
325 |         """Slot that sets the '__loading' property to false and stores
326 |         the result code in '__loading_result'.
327 |         """
328 |         if self.logger: self.logger.debug("loading finished with result %s", result)
329 |         self.__loading = False
330 |         self.__loading_result = result
331 | 
332 |     # Eventhandler for "sslErrors(QNetworkReply *,const QList&)" signal
333 |     def _on_ssl_errors(self, reply, errors):
334 |         """Slot that writes SSL warnings into the log but ignores them."""
335 |         for e in errors:
336 |             if self.logger: self.logger.warn("SSL: " + e.errorString())
337 |         reply.ignoreSslErrors()
338 | 
339 | 
340 | class CustomWebPage(QWebPage):
341 |     def __init__(self, **kwargs):
342 |         super(CustomWebPage, self).__init__()
343 |         self.logger = kwargs.get('logger', None)
344 |         self.ignore_alert = kwargs.get('ignore_alert', True)
345 |         self.ignore_confirm = kwargs.get('ignore_confirm', True)
346 |         self.ignore_prompt = kwargs.get('ignore_prompt', True)
347 |         self.interrupt_js = kwargs.get('interrupt_js', True)
348 | 
349 |     def javaScriptAlert(self, frame, message):
350 |         if self.logger: self.logger.debug('Alert: %s', message)
351 |         if not self.ignore_alert:
352 |             return super(CustomWebPage, self).javaScriptAlert(frame, message)
353 | 
354 |     def javaScriptConfirm(self, frame, message):
355 |         if self.logger: self.logger.debug('Confirm: %s', message)
356 |         if not self.ignore_confirm:
357 |             return super(CustomWebPage, self).javaScriptConfirm(frame, message)
358 |         else:
359 |             return False
360 | 
361 |     def javaScriptPrompt(self, frame, message, default, result):
362 |         """This function is called whenever a JavaScript program running inside
363 |         'frame' tries to prompt the user for input. The program may provide an
364 |         optional message as well as a default value for the input.
365 | 
366 |         If the prompt was cancelled by the user, the implementation should return
367 |         False; otherwise the result should be written to 'result', True should be
368 |         returned, and the result string must not be null.
369 |         """
370 | 
371 |         if self.logger: self.logger.debug('Prompt: %s (%s)' % (message, default))
372 |         if not self.ignore_prompt:
373 |             return super(CustomWebPage, self).javaScriptPrompt(frame, message, default, result)
374 |         else:
375 |             return False
376 | 
377 |     def shouldInterruptJavaScript(self):
378 |         """This function is called when a JavaScript program has been running for a
379 |         long period of time. Returning True tells WebKit to stop it; False lets it run.
380 | """ 381 | if self.logger: self.logger.debug("WebKit ask to interrupt JavaScript") 382 | return self.interrupt_js 383 | -------------------------------------------------------------------------------- /tutorial/README: -------------------------------------------------------------------------------- 1 | 2 | A simple tutorial from http://doc.scrapy.org/en/0.16/intro/tutorial.html -------------------------------------------------------------------------------- /tutorial/items.json: -------------------------------------------------------------------------------- 1 | [{"desc": ["\n "], "link": ["/"], "title": ["Top"]}, 2 | {"desc": [], "link": ["/Computers/"], "title": ["Computers"]}, 3 | {"desc": [], "link": ["/Computers/Programming/"], "title": ["Programming"]}, 4 | {"desc": [], "link": ["/Computers/Programming/Languages/"], "title": ["Languages"]}, 5 | {"desc": [], "link": ["/Computers/Programming/Languages/Python/"], "title": ["Python"]}, 6 | {"desc": ["\n \t", "\u00a0", "\n "], "link": [], "title": []}, 7 | {"desc": ["\n ", " \n ", "\n "], "link": ["/Computers/Programming/Languages/Python/Resources/"], "title": ["Computers: Programming: Languages: Python: Resources"]}, 8 | {"desc": ["\n ", " \n ", "\n "], "link": ["/Computers/Programming/Languages/Ruby/Books/"], "title": ["Computers: Programming: Languages: Ruby: Books"]}, 9 | {"desc": ["\n \t", "\n ", "\n\t\t\t\t\t"], "link": ["/World/Deutsch/Computer/Programmieren/Sprachen/Python/B%C3%BCcher/"], "title": ["German"]}, 10 | {"desc": ["\n\t\t\t\n \t", " \n\t\t\t\n\t\t\t\t\t\n - By Guido van Rossum, Fred L. Drake, Jr.; Network Theory Ltd., 2003, ISBN 0954161769. Printed edition of official tutorial, for v2.x, from Python.org. [Network Theory, online]\n \n "], "link": ["http://www.network-theory.co.uk/python/intro/"], "title": ["An Introduction to Python"]}, 11 | {"desc": ["\n\t\t\t\n \t", " \n\t\t\t\n\t\t\t\t\t\n - By Wesley J. Chun; Prentice Hall PTR, 2001, ISBN 0130260363. For experienced developers to improve extant skills; professional level examples. Starts by introducing syntax, objects, error handling, functions, classes, built-ins. [Prentice Hall]\n \n "], "link": ["http://www.pearsonhighered.com/educator/academic/product/0,,0130260363,00%2Ben-USS_01DBC.html"], "title": ["Core Python Programming"]}, 12 | {"desc": ["\n\t\t\t\n \t", " \n\t\t\t\n\t\t\t\t\t\n - The primary goal of this book is to promote object-oriented design using Python and to illustrate the use of the emerging object-oriented design patterns.\r\nA secondary goal of the book is to present mathematical tools just in time. Analysis techniques and proofs are presented as needed and in the proper context.\n \n "], "link": ["http://www.brpreiss.com/books/opus7/html/book.html"], "title": ["Data Structures and Algorithms with Object-Oriented Design Patterns in Python"]}, 13 | {"desc": ["\n\t\t\t\n \t", " \n\t\t\t\n\t\t\t\t\t\n - By Mark Pilgrim, Guide to Python 3 and its differences from Python 2. Each chapter starts with a real code sample and explains it fully. Has a comprehensive appendix of all the syntactic and semantic changes in Python 3\r\n\r\n\n \n "], "link": ["http://www.diveintopython.net/"], "title": ["Dive Into Python 3"]}, 14 | {"desc": ["\n\t\t\t\n \t", " \n\t\t\t\n\t\t\t\t\t\n - This book covers a wide range of topics. From raw TCP and UDP to encryption with TSL, and then to HTTP, SMTP, POP, IMAP, and ssh. 
It gives you a good understanding of each field and how to do everything on the network with Python.\n \n "], "link": ["http://rhodesmill.org/brandon/2011/foundations-of-python-network-programming/"], "title": ["Foundations of Python Network Programming"]}, 15 | {"desc": ["\n\t\t\t\n \t", " \n\t\t\t\n\t\t\t\t\t\n - Free Python books and tutorials.\n \n "], "link": ["http://www.techbooksforfree.com/perlpython.shtml"], "title": ["Free Python books"]}, 16 | {"desc": ["\n\t\t\t\n \t", " \n\t\t\t\n\t\t\t\t\t\n - Annotated list of free online books on Python scripting language. Topics range from beginner to advanced.\n \n "], "link": ["http://www.freetechbooks.com/python-f6.html"], "title": ["FreeTechBooks: Python Scripting Language"]}, 17 | {"desc": ["\n\t\t\t\n \t", " \n\t\t\t\n\t\t\t\t\t\n - By Allen B. Downey, Jeffrey Elkner, Chris Meyers; Green Tea Press, 2002, ISBN 0971677506. Teaches general principles of programming, via Python as subject language. Thorough, in-depth approach to many basic and intermediate programming topics. Full text online and downloads: HTML, PDF, PS, LaTeX. [Free, Green Tea Press]\n \n "], "link": ["http://greenteapress.com/thinkpython/"], "title": ["How to Think Like a Computer Scientist: Learning with Python"]}, 18 | {"desc": ["\n\t\t\t\n \t", " \n\t\t\t\n\t\t\t\t\t\n - Book by Alan Gauld with full text online. Introduction for those learning programming basics: terminology, concepts, methods to write code. Assumes no prior knowledge but basic computer skills.\n \n "], "link": ["http://www.freenetpages.co.uk/hp/alan.gauld/"], "title": ["Learn to Program Using Python"]}, 19 | {"desc": ["\n\t\t\t\n \t", " \n\t\t\t\n\t\t\t\t\t\n - By Rashi Gupta; John Wiley and Sons, 2002, ISBN 0471219754. Covers language basics, use for CGI scripting, GUI development, network programming; shows why it is one of more sophisticated of popular scripting languages. [Wiley]\n \n "], "link": ["http://www.wiley.com/WileyCDA/WileyTitle/productCd-0471219754.html"], "title": ["Making Use of Python"]}, 20 | {"desc": ["\n\t\t\t\n \t", " \n\t\t\t\n\t\t\t\t\t\n - By Magnus Lie Hetland; Apress LP, 2002, ISBN 1590590066. Readable guide to ideas most vital to new users, from basics common to high level languages, to more specific aspects, to a series of 10 ever more complex programs. [Apress]\n \n "], "link": ["http://hetland.org/writing/practical-python/"], "title": ["Practical Python"]}, 21 | {"desc": ["\n\t\t\t\n \t", " \n\t\t\t\n\t\t\t\t\t\n - By Rytis Sileika, ISBN13: 978-1-4302-2605-5, Uses real-world system administration examples like manage devices with SNMP and SOAP, build a distributed monitoring system, manage web applications and parse complex log files, monitor and manage MySQL databases.\r\n\n \n "], "link": ["http://www.sysadminpy.com/"], "title": ["Pro Python System Administration"]}, 22 | {"desc": ["\n\t\t\t\n \t", " \n\t\t\t\n\t\t\t\t\t\n - A Complete Introduction to the Python 3.\n \n "], "link": ["http://www.qtrac.eu/py3book.html"], "title": ["Programming in Python 3 (Second Edition)"]}, 23 | {"desc": ["\n\t\t\t\n \t", " \n\t\t\t\n\t\t\t\t\t\n - By Dave Brueck, Stephen Tanner; John Wiley and Sons, 2001, ISBN 0764548077. Full coverage, clear explanations, hands-on examples, full language reference; shows step by step how to use components, assemble them, form full-featured programs. 
[John Wiley and Sons]\n \n "], "link": ["http://www.wiley.com/WileyCDA/WileyTitle/productCd-0764548077.html"], "title": ["Python 2.1 Bible"]}, 24 | {"desc": ["\n\t\t\t\n \t", " \n\t\t\t\n\t\t\t\t\t\n - A step-by-step tutorial for OOP in Python 3, including discussion and examples of abstraction, encapsulation, information hiding, and raise, handle, define, and manipulate exceptions.\n \n "], "link": ["https://www.packtpub.com/python-3-object-oriented-programming/book"], "title": ["Python 3 Object Oriented Programming"]}, 25 | {"desc": ["\n\t\t\t\n \t", " \n\t\t\t\n\t\t\t\t\t\n - By Guido van Rossum, Fred L. Drake, Jr.; Network Theory Ltd., 2003, ISBN 0954161785. Printed edition of official language reference, for v2.x, from Python.org, describes syntax, built-in datatypes. [Network Theory, online]\n \n "], "link": ["http://www.network-theory.co.uk/python/language/"], "title": ["Python Language Reference Manual"]}, 26 | {"desc": ["\n\t\t\t\n \t", " \n\t\t\t\n\t\t\t\t\t\n - By Thomas W. Christopher; Prentice Hall PTR, 2002, ISBN 0130409561. Shows how to write large programs, introduces powerful design patterns that deliver high levels of robustness, scalability, reuse.\n \n "], "link": ["http://www.pearsonhighered.com/educator/academic/product/0,,0130409561,00%2Ben-USS_01DBC.html"], "title": ["Python Programming Patterns"]}, 27 | {"desc": ["\n\t\t\t\n \t", " \n\t\t\t\n\t\t\t\t\t\n - By Richard Hightower; Addison-Wesley, 2002, 0201616165. Begins with Python basics, many exercises, interactive sessions. Shows programming novices concepts and practical methods. Shows programming experts Python's abilities and ways to interface with Java APIs. [publisher website]\n \n "], "link": ["http://www.informit.com/store/product.aspx?isbn=0201616165&redir=1"], "title": ["Python Programming with the Java Class Libraries: A Tutorial for Building Web and Enterprise Applications with Jython"]}, 28 | {"desc": ["\n\t\t\t\n \t", " \n\t\t\t\n\t\t\t\t\t\n - By Chris Fehily; Peachpit Press, 2002, ISBN 0201748843. Task-based, step-by-step visual reference guide, many screen shots, for courses in digital graphics; Web design, scripting, development; multimedia, page layout, office tools, operating systems. [Prentice Hall]\n \n "], "link": ["http://www.pearsonhighered.com/educator/academic/product/0,,0201748843,00%2Ben-USS_01DBC.html"], "title": ["Python: Visual QuickStart Guide"]}, 29 | {"desc": ["\n\t\t\t\n \t", " \n\t\t\t\n\t\t\t\t\t\n - By Ivan Van Laningham; Sams Publishing, 2000, ISBN 0672317354. Split into 24 hands-on, 1 hour lessons; steps needed to learn topic: syntax, language features, OO design and programming, GUIs (Tkinter), system administration, CGI. [Sams Publishing]\n \n "], "link": ["http://www.informit.com/store/product.aspx?isbn=0672317354"], "title": ["Sams Teach Yourself Python in 24 Hours"]}, 30 | {"desc": ["\n\t\t\t\n \t", " \n\t\t\t\n\t\t\t\t\t\n - By David Mertz; Addison Wesley. Book in progress, full text, ASCII format. Asks for feedback. [author website, Gnosis Software, Inc.]\n \n "], "link": ["http://gnosis.cx/TPiP/"], "title": ["Text Processing in Python"]}, 31 | {"desc": ["\n\t\t\t\n \t", " \n\t\t\t\n\t\t\t\t\t\n - By Sean McGrath; Prentice Hall PTR, 2000, ISBN 0130211192, has CD-ROM. Methods to build XML applications fast, Python tutorial, DOM and SAX, new Pyxie open source XML processing library. 
[Prentice Hall PTR]\n \n "], "link": ["http://www.informit.com/store/product.aspx?isbn=0130211192"], "title": ["XML Processing with Python"]}, 32 | {"desc": ["\n "], "link": ["/"], "title": ["Top"]}, 33 | {"desc": [], "link": ["/Computers/"], "title": ["Computers"]}, 34 | {"desc": [], "link": ["/Computers/Programming/"], "title": ["Programming"]}, 35 | {"desc": [], "link": ["/Computers/Programming/Languages/"], "title": ["Languages"]}, 36 | {"desc": [], "link": ["/Computers/Programming/Languages/Python/"], "title": ["Python"]}, 37 | {"desc": ["\n \t", "\u00a0", "\n "], "link": [], "title": []}, 38 | {"desc": ["\n ", " \n ", "\n "], "link": ["/Computers/Programming/Resources/"], "title": ["Computers: Programming: Resources"]}, 39 | {"desc": ["\n\t\t\t\n \t", " \n\t\t\t\n\t\t\t\t\t\n - A directory of free Python and Zope hosting providers, with reviews and ratings.\n \n "], "link": ["http://www.oinko.net/freepython/"], "title": ["Free Python and Zope Hosting Directory"]}, 40 | {"desc": ["\n\t\t\t\n \t", " \n\t\t\t\n\t\t\t\t\t\n - Features Python books, resources, news and articles.\n \n "], "link": ["http://oreilly.com/python/"], "title": ["O'Reilly Python Center"]}, 41 | {"desc": ["\n\t\t\t\n \t", " \n\t\t\t\n\t\t\t\t\t\n - Resources for reporting bugs, accessing the Python source tree with CVS and taking part in the development of Python.\n \n "], "link": ["http://www.python.org/dev/"], "title": ["Python Developer's Guide"]}, 42 | {"desc": ["\n\t\t\t\n \t", " \n\t\t\t\n\t\t\t\t\t\n - Scripts, examples and news about Python programming for the Windows platform.\n \n "], "link": ["http://win32com.goermezer.de/"], "title": ["Social Bug"]}, 43 | {"desc": ["\n\t\t\t\n \t", " \n\t\t\t\n\t\t\t\t\t\n - Contains links to assorted resources from the Python universe, compiled by PythonWare.\n \n "], "link": ["http://www.pythonware.com/daily/"], "title": ["eff-bot's Daily Python URL"]}] -------------------------------------------------------------------------------- /tutorial/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # http://doc.scrapy.org/topics/scrapyd.html 5 | 6 | [settings] 7 | default = tutorial.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = tutorial 12 | -------------------------------------------------------------------------------- /tutorial/tutorial/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/feiskyer/scrapy-examples/3f72fe87d9272859f7bce31d0394ca078a6ab4a3/tutorial/tutorial/__init__.py -------------------------------------------------------------------------------- /tutorial/tutorial/items.py: -------------------------------------------------------------------------------- 1 | # Define here the models for your scraped items 2 | # 3 | # See documentation in: 4 | # http://doc.scrapy.org/topics/items.html 5 | 6 | from scrapy.item import Item, Field 7 | 8 | class TutorialItem(Item): 9 | # define the fields for your item here like: 10 | title=Field() 11 | link=Field() 12 | desc=Field() 13 | 14 | class DmozItem(Item): 15 | # define the fields for your item here like: 16 | title=Field() 17 | link=Field() 18 | desc=Field() 19 | 20 | -------------------------------------------------------------------------------- /tutorial/tutorial/pipelines.py: 
-------------------------------------------------------------------------------- 1 | # Define your item pipelines here 2 | # 3 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 4 | # See: http://doc.scrapy.org/topics/item-pipeline.html 5 | 6 | class TutorialPipeline(object): 7 | def process_item(self, item, spider): 8 | return item 9 | -------------------------------------------------------------------------------- /tutorial/tutorial/settings.py: -------------------------------------------------------------------------------- 1 | # Scrapy settings for tutorial project 2 | # 3 | # For simplicity, this file contains only the most important settings by 4 | # default. All the other settings are documented here: 5 | # 6 | # http://doc.scrapy.org/topics/settings.html 7 | # 8 | 9 | BOT_NAME = 'tutorial' 10 | 11 | SPIDER_MODULES = ['tutorial.spiders'] 12 | NEWSPIDER_MODULE = 'tutorial.spiders' 13 | 14 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 15 | #USER_AGENT = 'tutorial (+http://www.yourdomain.com)' 16 | -------------------------------------------------------------------------------- /tutorial/tutorial/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /tutorial/tutorial/spiders/dmoz_spider.py: -------------------------------------------------------------------------------- 1 | from scrapy.spider import BaseSpider 2 | from scrapy.selector import HtmlXPathSelector 3 | from tutorial.items import DmozItem 4 | 5 | class DmozSpider(BaseSpider): 6 | name = "dmoz" 7 | allowed_domains = ["dmoz.org"] 8 | start_urls = [ 9 | "http://www.dmoz.org/Computers/Programming/Languages/Python/Books/", 10 | "http://www.dmoz.org/Computers/Programming/Languages/Python/Resources/" 11 | ] 12 | 13 | def parse(self, response): 14 | hxs = HtmlXPathSelector(response) 15 | sites = hxs.select('//ul/li') 16 | items = [] 17 | for site in sites: 18 | item = DmozItem() 19 | item['title'] = site.select('a/text()').extract() 20 | item['link'] = site.select('a/@href').extract() 21 | item['desc'] = site.select('text()').extract() 22 | items.append(item) 23 | return items 24 | -------------------------------------------------------------------------------- /web.py/README: -------------------------------------------------------------------------------- 1 | web.py simple examples. 
2 | 
3 | Web site: `http://webpy.org`
4 | 
5 | Install guide:
6 | 
7 |     pip install web.py
8 |     pip install markdown
9 | 
10 | The web.py code examples are taken from `http://webpy.org/src/`
11 | 
-------------------------------------------------------------------------------- /web.py/simple-blog/blog.py: --------------------------------------------------------------------------------
1 | """ Basic blog using webpy 0.3 """
2 | import web
3 | import model
4 | 
5 | ### Url mappings
6 | 
7 | urls = (
8 |     '/', 'Index',
9 |     '/view/(\d+)', 'View',
10 |     '/new', 'New',
11 |     '/delete/(\d+)', 'Delete',
12 |     '/edit/(\d+)', 'Edit',
13 | )
14 | 
15 | 
16 | ### Templates
17 | t_globals = {
18 |     'datestr': web.datestr
19 | }
20 | render = web.template.render('templates', base='base', globals=t_globals)
21 | 
22 | class Index:
23 | 
24 |     def GET(self):
25 |         """ Show page """
26 |         posts = model.get_posts()
27 |         return render.index(posts)
28 | 
29 | 
30 | class View:
31 | 
32 |     def GET(self, id):
33 |         """ View single post """
34 |         post = model.get_post(int(id))
35 |         return render.view(post)
36 | 
37 | 
38 | class New:
39 | 
40 |     form = web.form.Form(
41 |         web.form.Textbox('title', web.form.notnull,
42 |             size=30,
43 |             description="Post title:"),
44 |         web.form.Textarea('content', web.form.notnull,
45 |             rows=30, cols=80,
46 |             description="Post content:"),
47 |         web.form.Button('Post entry'),
48 |     )
49 | 
50 |     def GET(self):
51 |         form = self.form()
52 |         return render.new(form)
53 | 
54 |     def POST(self):
55 |         form = self.form()
56 |         if not form.validates():
57 |             return render.new(form)
58 |         model.new_post(form.d.title, form.d.content)
59 |         raise web.seeother('/')
60 | 
61 | 
62 | class Delete:
63 | 
64 |     def POST(self, id):
65 |         model.del_post(int(id))
66 |         raise web.seeother('/')
67 | 
68 | 
69 | class Edit:
70 | 
71 |     def GET(self, id):
72 |         post = model.get_post(int(id))
73 |         form = New.form()
74 |         form.fill(post)
75 |         return render.edit(post, form)
76 | 
77 | 
78 |     def POST(self, id):
79 |         form = New.form()
80 |         post = model.get_post(int(id))
81 |         if not form.validates():
82 |             return render.edit(post, form)
83 |         model.update_post(int(id), form.d.title, form.d.content)
84 |         raise web.seeother('/')
85 | 
86 | 
87 | app = web.application(urls, globals())
88 | 
89 | if __name__ == '__main__':
90 |     app.run()
91 | 
-------------------------------------------------------------------------------- /web.py/simple-blog/model.py: --------------------------------------------------------------------------------
1 | import web, datetime
2 | 
3 | db = web.database(dbn='mysql', db='todo', user='root', passwd='feisky')
4 | 
5 | def get_posts():
6 |     return db.select('entries', order='id DESC')
7 | 
8 | def get_post(id):
9 |     try:
10 |         return db.select('entries', where='id=$id', vars=locals())[0]
11 |     except IndexError:
12 |         return None
13 | 
14 | def new_post(title, text):
15 |     db.insert('entries', title=title, content=text, posted_on=datetime.datetime.utcnow())
16 | 
17 | def del_post(id):
18 |     db.delete('entries', where="id=$id", vars=locals())
19 | 
20 | def update_post(id, title, text):
21 |     db.update('entries', where="id=$id", vars=locals(),
22 |         title=title, content=text)
23 | 
-------------------------------------------------------------------------------- /web.py/simple-blog/schema.sql: --------------------------------------------------------------------------------
1 | CREATE TABLE entries (
2 |     id INT AUTO_INCREMENT,
3 |     title TEXT,
4 |     content TEXT,
5 |     posted_on DATETIME,
6 |     primary key (id)
7 | );
8 | 
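
Note: all three web.py examples expect a local MySQL database named `todo` with the hard-coded credentials from `model.py`. For a quick run without MySQL, `web.database` can point at SQLite instead; a minimal sketch (the `blog.db` filename and the translated schema are illustrative, not part of this repo):

    import web
    # Same web.database interface that model.py uses; only the connection args change.
    db = web.database(dbn='sqlite', db='blog.db')
    db.query("CREATE TABLE IF NOT EXISTS entries ("
             "id INTEGER PRIMARY KEY AUTOINCREMENT, "
             "title TEXT, content TEXT, posted_on TIMESTAMP)")
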
-------------------------------------------------------------------------------- /web.py/simple-blog/templates/base.html: --------------------------------------------------------------------------------
1 | $def with (page)
2 | 
3 | <html>
4 | <head>
5 |     <title>My Blog</title>
6 |     <style>
7 |         #menu {
8 |             width: 200px;
9 |             float: right;
10 |         }
11 |     </style>
12 | </head>
13 | <body>
14 | 
15 | <ul id="menu">
16 |     <li><a href="/">Home</a></li>
17 |     <li><a href="/new">New Post</a></li>
18 | </ul>
19 | 
20 | $:page
21 | 
22 | </body>
23 | </html>
24 | 
-------------------------------------------------------------------------------- /web.py/simple-blog/templates/edit.html: --------------------------------------------------------------------------------
1 | $def with (post, form)
2 | 
3 | <h1>Edit $form.d.title</h1>
4 | 
5 | <form action="" method="post">
6 | $:form.render()
7 | </form>
8 | 
9 | 
10 | <h2>Delete post</h2>
11 | <form action="/delete/$post.id" method="post">
12 |     <input type="submit" value="Delete post"/>
13 | </form>
14 | 
-------------------------------------------------------------------------------- /web.py/simple-blog/templates/index.html: --------------------------------------------------------------------------------
1 | $def with (posts)
2 | 
3 | <h1>Blog posts</h1>
4 | 
5 | <ul>
6 | $for post in posts:
7 |     <li>
8 |         <a href="/view/$post.id">$post.title</a>
9 |         from $datestr(post.posted_on)
10 |         <a href="/edit/$post.id">Edit</a>
11 |     </li>
12 | </ul>
13 | 
-------------------------------------------------------------------------------- /web.py/simple-blog/templates/new.html: --------------------------------------------------------------------------------
1 | $def with (form)
2 | 
3 | 
4 | <h1>New Blog Post</h1>
5 | <form action="" method="post">
6 | $:form.render()
7 | </form>
8 | 
9 | 
-------------------------------------------------------------------------------- /web.py/simple-blog/templates/view.html: --------------------------------------------------------------------------------
1 | $def with (post)
2 | 
3 | <h1>$post.title</h1>
4 | $datestr(post.posted_on)
5 | 
6 | $post.content
7 | 
8 | 
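
A note on the Templetor syntax used by all templates in this repo: `$def with (...)` declares the template arguments, `$expr` interpolates with HTML escaping, and `$:expr` interpolates raw, which is why `base.html` embeds the rendered sub-page as `$:page` and the forms are emitted with `$:form.render()`. A small standalone sketch of the escaping difference (not part of the repo; output shown as expected under web.py's default websafe filter):

    import web
    # $name escapes its value; $:name would emit the markup verbatim.
    tmpl = web.template.Template("$def with (name)\nHello $name!")
    print tmpl('<b>world</b>')   # -> Hello &lt;b&gt;world&lt;/b&gt;!
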
-------------------------------------------------------------------------------- /web.py/simple-todo/model.py: --------------------------------------------------------------------------------
1 | import web
2 | 
3 | db=web.database(dbn="mysql", db="todo", user="root", passwd="feisky")
4 | 
5 | def get_todos():
6 |     return db.select('todo', order='id')
7 | 
8 | def new_todo(text):
9 |     db.insert('todo', title=text)
10 | 
11 | def del_todo(id):
12 |     db.delete('todo', where="id=$id", vars=locals())
13 | 
14 | 
-------------------------------------------------------------------------------- /web.py/simple-todo/schema.sql: --------------------------------------------------------------------------------
1 | CREATE TABLE todo (
2 |     id INT AUTO_INCREMENT,
3 |     title TEXT,
4 |     primary key (id)
5 | );
6 | 
-------------------------------------------------------------------------------- /web.py/simple-todo/templates/base.html: --------------------------------------------------------------------------------
1 | $def with (page)
2 | 
3 | <html>
4 | <head>
5 |     <title>Todo list</title>
6 | </head>
7 | <body>
8 | $:page
9 | </body>
10 | </html>
11 | 
12 | 
-------------------------------------------------------------------------------- /web.py/simple-todo/templates/index.html: --------------------------------------------------------------------------------
1 | $def with(todos, form)
2 | 
3 | <table>
4 |     <tr>
5 |         <th>What to do?</th>
6 |     </tr>
7 | 
8 | $for todo in todos:
9 |     <tr>
10 |         <td>$todo.title</td>
11 |         <td>
12 |             <form action="/del/$todo.id" method="post">
13 |                 <input type="submit" value="Delete"/>
14 |             </form>
15 |         </td>
16 |     </tr>
17 | </table>
18 | 
19 | <form action="" method="post">
20 | $:form.render()
21 | </form>
22 | 
23 | 
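
The Delete buttons in the table above post to `/del/$todo.id`; in `todo.py` (next file) the `urls` tuple maps that path to the `Delete` class, and the regex capture group `(\d+)` is passed to `Delete.POST` as a string. A hedged smoke test against a locally running instance (assumes the default dev server on port 8080 and an existing row with id 1):

    import urllib2
    # POST with an empty body; web.py replies with a 303 redirect to '/',
    # which urllib2 follows, so this prints the refreshed todo list.
    print urllib2.urlopen('http://localhost:8080/del/1', data='').read()
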
-------------------------------------------------------------------------------- /web.py/simple-todo/todo.py: --------------------------------------------------------------------------------
1 | import web
2 | import model
3 | 
4 | urls=(
5 |     '/', 'Index',
6 |     '/del/(\d+)', 'Delete'
7 | )
8 | 
9 | render = web.template.render('templates', base='base')
10 | 
11 | class Index:
12 |     form=web.form.Form(
13 |         web.form.Textbox('title', web.form.notnull,
14 |             description='I need to:'),
15 |         web.form.Button('Add todo'),
16 |     )
17 | 
18 |     def GET(self):
19 |         todos=model.get_todos()
20 |         form=self.form()
21 |         return render.index(todos, form)
22 | 
23 |     def POST(self):
24 |         form=self.form()
25 |         if not form.validates():
26 |             todos=model.get_todos()
27 |             return render.index(todos, form)
28 |         model.new_todo(form.d.title)
29 |         raise web.seeother('/')
30 | 
31 | class Delete:
32 |     def POST(self, id):
33 |         id=int(id)
34 |         model.del_todo(id)
35 |         raise web.seeother('/')
36 | 
37 | if __name__=='__main__':
38 |     app=web.application(urls, globals())
39 |     app.run()
40 | 
-------------------------------------------------------------------------------- /web.py/simple-wiki/model.py: --------------------------------------------------------------------------------
1 | import web
2 | 
3 | db = web.database(dbn='mysql', db='todo', user='root', passwd='feisky')
4 | 
5 | def get_pages():
6 |     return db.select('pages', order='id DESC')
7 | 
8 | def get_page_by_url(url):
9 |     try:
10 |         return db.select('pages', where='url=$url', vars=locals())[0]
11 |     except IndexError:
12 |         return None
13 | 
14 | def get_page_by_id(id):
15 |     try:
16 |         return db.select('pages', where='id=$id', vars=locals())[0]
17 |     except IndexError:
18 |         return None
19 | 
20 | def new_page(url, title, text):
21 |     db.insert('pages', url=url, title=title, content=text)
22 | 
23 | def del_page(id):
24 |     db.delete('pages', where="id=$id", vars=locals())
25 | 
26 | def update_page(id, url, title, text):
27 |     db.update('pages', where="id=$id", vars=locals(),
28 |         url=url, title=title, content=text)
29 | 
-------------------------------------------------------------------------------- /web.py/simple-wiki/schema.sql: --------------------------------------------------------------------------------
1 | CREATE TABLE pages (
2 |     id INT AUTO_INCREMENT,
3 |     url TEXT,
4 |     title TEXT,
5 |     content TEXT,
6 |     primary key (id)
7 | );
8 | 
-------------------------------------------------------------------------------- /web.py/simple-wiki/templates/base.html: --------------------------------------------------------------------------------
1 | $def with (page)
2 | 
3 | <html>
4 | <head>
5 |     $if page.has_key('title'):
6 |         <title>$page.title</title>
7 |     $else:
8 |         <title>My Wiki</title>
9 |     <style>
10 |         #menu { width: 200px; float: right; }
11 |     </style>
12 | </head>
13 | <body>
14 | <ul id="menu">
15 |     <li><a href="/">Home</a></li>
16 |     <li><a href="/new">New Page</a></li>
17 | </ul>
18 | 
19 | $:page
20 | 
21 | </body>
22 | </html>
23 | 
-------------------------------------------------------------------------------- /web.py/simple-wiki/templates/edit.html: --------------------------------------------------------------------------------
1 | $def with (page, form)
2 | 
3 | <h1>Edit $form.d.title</h1>
4 | 
5 | <form action="" method="post">
6 | $:form.render()
7 | </form>
8 | 
9 | 
10 | <h2>Delete page?</h2>
11 | <form action="/delete/$page.id" method="post">
12 |     <input type="submit" value="Delete page"/>
13 | </form>
14 | 
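
The handler classes in `todo.py` above (and in `wiki.py` further below) share one form pattern: instantiate the class-level `web.form.Form`, let `form.validates()` pull and check the POSTed input, re-render the page on failure, and read the clean values from `form.d` on success. A condensed sketch of that round-trip (the handler and form names are illustrative):

    import web

    form = web.form.Form(
        web.form.Textbox('title', web.form.notnull),
    )

    class Handler:
        def POST(self):
            f = form()                 # calling a Form returns a fresh copy
            if not f.validates():      # reads web.input(), runs validators
                return f.render()      # redisplay with error messages
            return 'saving: %s' % f.d.title
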
-------------------------------------------------------------------------------- /web.py/simple-wiki/templates/index.html: --------------------------------------------------------------------------------
1 | $def with (pages)
2 | 
3 | <h1>Webpy Wiki</h1>
4 | 
5 | <h2>Pages:</h2>
6 | 
7 | <ul>
8 | $for page in pages:
9 |     <li><a href="/$page.url">$page.url</a></li>
10 | </ul>
11 | 
12 | 
-------------------------------------------------------------------------------- /web.py/simple-wiki/templates/new.html: --------------------------------------------------------------------------------
1 | $def with (form)
2 | 
3 | 
4 | <h1>New Wiki Page</h1>
5 | <form action="" method="post">
6 | $:form.render()
7 | </form>
8 | 
-------------------------------------------------------------------------------- /web.py/simple-wiki/templates/view.html: --------------------------------------------------------------------------------
1 | $def with (page)
2 | 
3 | $var title: $page.title
4 | 
5 | <h1>$page.title</h1>
6 | 
7 | $:markdown(page.content)
8 | 
9 | 
10 | <a href="/edit/$page.id">Edit</a>
11 | 
-------------------------------------------------------------------------------- /web.py/simple-wiki/wiki.py: --------------------------------------------------------------------------------
1 | """ Basic wiki using webpy 0.3 """
2 | import web
3 | import model
4 | import markdown
5 | 
6 | ### Url mappings
7 | 
8 | urls = (
9 |     '/', 'Index',
10 |     '/new', 'New',
11 |     '/edit/(\d+)', 'Edit',
12 |     '/delete/(\d+)', 'Delete',
13 |     '/(.*)', 'Page',
14 | )
15 | 
16 | 
17 | ### Templates
18 | t_globals = {
19 |     'datestr': web.datestr,
20 |     'markdown': markdown.markdown,
21 | }
22 | render = web.template.render('templates', base='base', globals=t_globals)
23 | 
24 | 
25 | class Index:
26 | 
27 |     def GET(self):
28 |         """ Show page """
29 |         pages = model.get_pages()
30 |         return render.index(pages)
31 | 
32 | 
33 | class Page:
34 | 
35 |     def GET(self, url):
36 |         """ View single page """
37 |         page = model.get_page_by_url(url)
38 |         if not page:
39 |             raise web.seeother('/new?url=%s' % web.websafe(url))
40 |         return render.view(page)
41 | 
42 | 
43 | class New:
44 | 
45 |     def not_page_exists(url):
46 |         return not bool(model.get_page_by_url(url))
47 | 
48 |     page_exists_validator = web.form.Validator('Page already exists',
49 |         not_page_exists)
50 | 
51 |     form = web.form.Form(
52 |         web.form.Textbox('url', web.form.notnull, page_exists_validator,
53 |             size=30,
54 |             description="Location:"),
55 |         web.form.Textbox('title', web.form.notnull,
56 |             size=30,
57 |             description="Page title:"),
58 |         web.form.Textarea('content', web.form.notnull,
59 |             rows=30, cols=80,
60 |             description="Page content:", post="Use markdown syntax"),
61 |         web.form.Button('Create page'),
62 |     )
63 | 
64 |     def GET(self):
65 |         url = web.input(url='').url
66 |         form = self.form()
67 |         form.fill({'url':url})
68 |         return render.new(form)
69 | 
70 |     def POST(self):
71 |         form = self.form()
72 |         if not form.validates():
73 |             return render.new(form)
74 |         model.new_page(form.d.url, form.d.title, form.d.content)
75 |         raise web.seeother('/' + form.d.url)
76 | 
77 | 
78 | class Delete:
79 | 
80 |     def POST(self, id):
81 |         model.del_page(int(id))
82 |         raise web.seeother('/')
83 | 
84 | 
85 | class Edit:
86 | 
87 |     form = web.form.Form(
88 |         web.form.Textbox('url', web.form.notnull,
89 |             size=30,
90 |             description="Location:"),
91 |         web.form.Textbox('title', web.form.notnull,
92 |             size=30,
93 |             description="Page title:"),
94 |         web.form.Textarea('content', web.form.notnull,
95 |             rows=30, cols=80,
96 |             description="Page content:", post="Use markdown syntax"),
97 |         web.form.Button('Update page'),
98 |     )
99 | 
100 |     def GET(self, id):
101 |         page = model.get_page_by_id(int(id))
102 |         form = self.form()
103 |         form.fill(page)
104 |         return render.edit(page, form)
105 | 
106 | 
107 |     def POST(self, id):
108 |         form = self.form()
109 |         page = model.get_page_by_id(int(id))
110 |         if not form.validates():
111 |             return render.edit(page, form)
112 |         model.update_page(int(id), form.d.url, form.d.title, form.d.content)
113 |         raise web.seeother('/')
114 | 
115 | 
116 | app = web.application(urls, globals())
117 | 
118 | if __name__ == '__main__':
119 |     app.run()
120 | 
--------------------------------------------------------------------------------
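
Since `wiki.py` builds `app = web.application(urls, globals())` at import time, the app can also be exercised in-process via web.py's testing hook `app.request()`, without starting an HTTP server. A hedged sketch (assumes the MySQL tables from `schema.sql` exist and the credentials in `model.py` are valid):

    import wiki
    # Issue a fake GET against the WSGI app; no server needed.
    resp = wiki.app.request('/')
    print resp.status    # e.g. '200 OK'
    print resp.data      # the rendered index.html
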