├── README.md
├── items.txt
├── requirements.txt
├── scrapy.cfg
└── weibosearch
    ├── __init__.py
    ├── db
    │   └── mysql.sql
    ├── feeds.py
    ├── items.py
    ├── pipelines.py
    ├── query.py
    ├── redis
    │   ├── __init__.py
    │   ├── dupefilter.py
    │   ├── pipelines.py
    │   ├── queue.py
    │   └── scheduler.py
    ├── settings.py
    ├── sina
    │   ├── __init__.py
    │   └── weibo.py
    ├── spiders
    │   ├── WeiboSearchSpider.py
    │   └── __init__.py
    └── timerange
        └── __init__.py

/README.md:
--------------------------------------------------------------------------------
 1 | WeiboSearch
 2 | ===================
 3 | A distributed Sina Weibo search spider based on Scrapy and Redis.
 4 | 
 5 | tpeng
 6 | 
 7 | ## Installation
 8 | $ sudo apt-get install mysql-server
 9 | $ sudo apt-get install redis-server
10 | $ sudo apt-get install python-mysqldb
11 | $ sudo pip install -r requirements.txt
12 | 
13 | ## Usage
14 | 1. Put your keywords in items.txt, one per line (lines starting with # are skipped).
15 | 2. Start a spider with `scrapy crawl weibosearch -a username=your_weibo_account -a password=your_weibo_password`
16 | 3. Add another spider with `scrapy crawl weibosearch -a username=another_weibo_account -a password=another_weibo_password`
17 | 
18 | or
19 | 1. Run `scrapy crawl weibosearch -a username=your_weibo_account -a password=your_weibo_password -a savedb=False` to skip saving to
20 | the MySQL db (easier for experiments).
21 | 
--------------------------------------------------------------------------------
/items.txt:
--------------------------------------------------------------------------------
1 | scrapy
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | redis
2 | git+https://github.com/scrapy/scrapy#egg=Scrapy
3 | pyquery
4 | rsa
--------------------------------------------------------------------------------
/scrapy.cfg:
--------------------------------------------------------------------------------
 1 | # Automatically created by: scrapy startproject
 2 | #
 3 | # For more information about the [deploy] section see:
 4 | # http://doc.scrapy.org/topics/scrapyd.html
 5 | 
 6 | [settings]
 7 | default = weibosearch.settings
 8 | 
 9 | [deploy]
10 | #url = http://localhost:6800/
11 | project = weibosearch
12 | 
--------------------------------------------------------------------------------
/weibosearch/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tpeng/weibosearch/75997f2d8f7b833dd01128e034e50c30dd41a2c6/weibosearch/__init__.py
--------------------------------------------------------------------------------
/weibosearch/db/mysql.sql:
--------------------------------------------------------------------------------
 1 | drop table if exists feed; drop table if exists author;
 2 | 
 3 | create table author (
 4 |   id BIGINT not null,
 5 |   name VARCHAR(100) not null,
 6 |   url VARCHAR(500) not null,
 7 |   PRIMARY KEY (id)
 8 | ) engine=innodb default charset=utf8;
 9 | 
10 | CREATE INDEX author_id ON author (id);
11 | 
12 | create table feed (
13 |   id BIGINT not null,
14 |   author_id BIGINT,
15 |   FOREIGN KEY (author_id) references author(id),
16 |   content VARCHAR(500) not null,
17 |   replies INTEGER,
18 |   retweets INTEGER,
19 |   timestamp TIMESTAMP,
20 |   PRIMARY KEY (id)
21 | ) engine=innodb default charset=utf8;
22 | 
23 | CREATE INDEX feed_id on feed(id);
--------------------------------------------------------------------------------
/weibosearch/feeds.py:
--------------------------------------------------------------------------------
1 | #-*- coding: utf-8 -*-
2 | # A weibo parser. 
3 | # 4 | # tpeng 5 | # 2012/9/21 6 | # 7 | from pyquery import PyQuery as pq 8 | from datetime import datetime 9 | import re 10 | 11 | class SearchPage(): 12 | def __init__(self, values): 13 | if values is None or len(values) == 0: 14 | self.values = [] 15 | else: 16 | self.values = values 17 | 18 | def __len__(self): 19 | return len(self.values) 20 | 21 | def __getitem__(self, key): 22 | return self.values[key] 23 | 24 | def __iter__(self): 25 | return iter(self.values) 26 | 27 | @staticmethod 28 | def wrap(html): 29 | jQuery = pq(html) 30 | hrefs = jQuery('li a') 31 | values = [] 32 | if len(hrefs) > 1: 33 | size = int(hrefs[-2].text) 34 | href = hrefs[-2] 35 | link = href.get('href') 36 | if link.startswith('/'): 37 | link = '%s%s' % ('http://s.weibo.com', link) 38 | for i in xrange(1, size + 1): 39 | values.append(re.sub(r'page=\d+', 'page=%s' % i, link)) 40 | return SearchPage(values) 41 | 42 | # represent a single feed return by the weibo search 43 | class Author(): 44 | def __init__(self, id, name, img_url): 45 | self.id = id 46 | self.name = name 47 | self.img_url = img_url 48 | 49 | @staticmethod 50 | def wrap(html): 51 | jQuery = pq(html) 52 | name = unicode(jQuery('a').attr('title')) 53 | img = jQuery('a img').attr('src') 54 | # id = unicode(jQuery('a').attr('suda-data').split(':')[-1]) 55 | id = re.search('id=(\d+)&', jQuery('a img').attr('usercard'), re.I).group(1) 56 | return Author(id, name, img) 57 | 58 | def __str__(self): 59 | return 'Author(id=%s, name=%s)' % (self.id, self.name) 60 | 61 | 62 | class Feed(): 63 | def __init__(self, mid, author, content, retweets, replies, timestamp): 64 | self.mid = mid 65 | self.author = author 66 | self.content = content 67 | self.retweets = retweets 68 | self.replies = replies 69 | self.timestamp = timestamp 70 | 71 | @staticmethod 72 | def wrap(html): 73 | replies = retweets = 0 74 | jQuery = pq(html) 75 | dl = jQuery("dl.feed_list") 76 | author = Author.wrap(dl('dt.face').html()) 77 | em = jQuery('dd.content em').eq(0) 78 | imgs = em.find('img') 79 | # replace the images with image's alt text 80 | for img in imgs: 81 | if pq(img).attr('alt'): 82 | pq(img).replaceWith(pq(img).attr('alt')) 83 | spans = em.find('span') 84 | # replace the span (added by weibo search for highlight the words) with text 85 | for span in spans: 86 | pq(span).replaceWith(pq(span).text()) 87 | content = em.text() 88 | info = jQuery('dd.content p.info').text() 89 | retweets_match = re.search(ur'\u8f6c\u53d1\((\d+)\)', info, re.M | re.I | re.U) 90 | if retweets_match: 91 | retweets = int(retweets_match.group(1)) 92 | replies_match = re.search(ur'\u8bc4\u8bba\((\d+)\)', info, re.M | re.I | re.U) 93 | if replies_match: 94 | replies = int(replies_match.group(1)) 95 | 96 | time = jQuery('dd.content p.info a.date').attr('date') 97 | timestamp = datetime.fromtimestamp(long(time) / 1000) 98 | return Feed(dl.attr('mid'), author, content, retweets, replies, timestamp) 99 | 100 | def __str__(self): 101 | return 'Feed(mid=%s author=%s)' % (self.mid, self.author) 102 | -------------------------------------------------------------------------------- /weibosearch/items.py: -------------------------------------------------------------------------------- 1 | # Define here the models for your scraped items 2 | # 3 | # See documentation in: 4 | # http://doc.scrapy.org/topics/items.html 5 | 6 | from scrapy.item import Item, Field 7 | 8 | class ScrapyWeiboItem(Item): 9 | html = Field() 10 | -------------------------------------------------------------------------------- 
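Feed.wrap above is the single entry point the rest of the project uses to turn one search-result block into structured data; the MySQL pipeline below calls it on each item's raw html. What follows is a minimal sketch, assuming Python 2 with pyquery installed and the project root on the path. The `<dl class="feed_list">` fragment is hand-written stand-in markup with made-up ids and counts, not captured Sina output, but its `mid`, `usercard` and `date` attributes follow the selectors feeds.py expects.

```python
# -*- coding: utf-8 -*-
# Illustrative only: feed a fake search-result block through Feed.wrap.
from weibosearch.feeds import Feed

SAMPLE = u'''
<dl class="feed_list" mid="3495081987845524">
  <dt class="face">
    <a title="tpeng" href="http://weibo.com/u/123456">
      <img src="http://tp1.sinaimg.cn/123456/50/1" alt="tpeng" usercard="id=123456&refer_flag=0001"/>
    </a>
  </dt>
  <dd class="content">
    <em>playing with <span class="red">scrapy</span> today <img alt="[哈哈]" src="face.gif"/></em>
    <p class="info">转发(12) | 评论(3)
      <a class="date" date="1348214400000" href="#">2012-09-21</a>
    </p>
  </dd>
</dl>
'''

feed = Feed.wrap(SAMPLE)
print feed                           # Feed(mid=3495081987845524 author=Author(id=123456, name=tpeng))
print feed.retweets, feed.replies    # 12 3
print feed.timestamp                 # datetime parsed from the millisecond 'date' attribute
```

A parse failure here is exactly what makes ScrapyWeiboPipeline drop an item, so a snippet like this is a cheap way to re-check the selectors whenever Sina changes its search markup.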
/weibosearch/pipelines.py: -------------------------------------------------------------------------------- 1 | # See: http://doc.scrapy.org/en/0.14/topics/item-pipeline.html 2 | # tpeng 3 | # 4 | from scrapy.exceptions import DropItem 5 | from twisted.enterprise import adbapi 6 | from weibosearch.feeds import Feed 7 | from scrapy import log 8 | import MySQLdb.cursors 9 | 10 | class ScrapyWeiboPipeline(object): 11 | def __init__(self): 12 | self.dbpool = adbapi.ConnectionPool('MySQLdb', 13 | db='weibosearch2', 14 | user='root', 15 | passwd='pw', 16 | cursorclass=MySQLdb.cursors.DictCursor, 17 | charset='utf8', 18 | use_unicode=True 19 | ) 20 | 21 | def process_item(self, item, spider): 22 | # run db query in thread pool 23 | if spider.savedb == 'True': 24 | query = self.dbpool.runInteraction(self._conditional_insert, item) 25 | query.addErrback(self.handle_error) 26 | return item 27 | 28 | def _conditional_insert(self, tx, item): 29 | # create record if doesn't exist. 30 | # all this block run on it's own thread 31 | try: 32 | feed = Feed.wrap(item['html']) 33 | except Exception as e: 34 | print e 35 | raise DropItem('Feed.wrap error: %s' % item['html']) 36 | 37 | # insert author 38 | tx.execute("select * from author where id = %s" % feed.author.id) 39 | result = tx.fetchone() 40 | if result: 41 | log.msg("Author already stored in db: %s" % feed.author.id, level=log.INFO) 42 | else: 43 | tx.execute("insert into author (id, name, url)" 44 | "values (%s, %s, %s)", 45 | (feed.author.id, feed.author.name, feed.author.img_url)) 46 | log.msg("Author stored in db: %s" % feed.author.id, level=log.INFO) 47 | 48 | # insert feed 49 | tx.execute("select * from feed where id = %s" % feed.mid) 50 | result = tx.fetchone() 51 | if result: 52 | log.msg("Feed already stored in db: (%s,%s)" % (feed.author.id, feed.mid), level=log.INFO) 53 | else: 54 | tx.execute("insert into feed (id, author_id, content, retweets, replies, timestamp)" 55 | "values (%s, %s, %s, %s, %s, %s)", 56 | (feed.mid, feed.author.id, feed.content, feed.retweets, feed.replies, 57 | feed.timestamp.strftime('%Y-%m-%d %H:%M:%S'))) 58 | 59 | log.msg("Feed stored in db: %s" % feed.mid, level=log.INFO) 60 | 61 | def handle_error(self, e): 62 | log.err(e) -------------------------------------------------------------------------------- /weibosearch/query.py: -------------------------------------------------------------------------------- 1 | class QueryFactory: 2 | @staticmethod 3 | def create_query(query): 4 | return 'http://s.weibo.com/weibo/%s&Refer=STopic_box&scope=ori' % query 5 | 6 | @staticmethod 7 | def create_paging_query(query, page): 8 | return 'http://s.weibo.com/weibo/%s&page=%d' % (query, page) 9 | 10 | @staticmethod 11 | def create_timerange_query(query, start, end): 12 | s = start.strftime('%Y-%m-%d-%H') 13 | e = end.strftime('%Y-%m-%d-%H') 14 | return 'http://s.weibo.com/weibo/%s&Refer=STopic_box×cope=custom:%s:%s&scope=ori' % (query, s, e) 15 | 16 | -------------------------------------------------------------------------------- /weibosearch/redis/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tpeng/weibosearch/75997f2d8f7b833dd01128e034e50c30dd41a2c6/weibosearch/redis/__init__.py -------------------------------------------------------------------------------- /weibosearch/redis/dupefilter.py: -------------------------------------------------------------------------------- 1 | import redis 2 | import time 3 | from scrapy.dupefilter import 
BaseDupeFilter 4 | from scrapy.utils.request import request_fingerprint 5 | 6 | class RFPDupeFilter(BaseDupeFilter): 7 | """Redis-based request duplication filter""" 8 | 9 | def __init__(self, server, key): 10 | """Initialize duplication filter 11 | 12 | Parameters: 13 | server -- Redis connection 14 | key -- redis key to store fingerprints 15 | 16 | """ 17 | self.server = server 18 | self.key = key 19 | 20 | @classmethod 21 | def from_settings(cls, settings): 22 | host = settings.get('REDIS_HOST', 'localhost') 23 | port = settings.get('REDIS_PORT', 6379) 24 | server = redis.Redis(host, port) 25 | # create one-time key. needed to support to use this 26 | # class as standalone dupefilter with scrapy's default scheduler 27 | # if scrapy passes spider on open() method this wouldn't be needed 28 | key = "dupefilter:%s" % int(time.time()) 29 | return cls(server, key) 30 | 31 | def request_seen(self, request): 32 | fp = request_fingerprint(request) 33 | added = self.server.sadd(self.key, fp) 34 | return not added 35 | 36 | def close(self, reason): 37 | """Delete data on close. Called by scrapy's scheduler""" 38 | self.clear() 39 | 40 | def clear(self): 41 | """Clears fingerprints data""" 42 | self.server.delete(self.key) 43 | 44 | -------------------------------------------------------------------------------- /weibosearch/redis/pipelines.py: -------------------------------------------------------------------------------- 1 | import redis 2 | 3 | from twisted.internet.threads import deferToThread 4 | from scrapy.utils.serialize import ScrapyJSONEncoder 5 | 6 | 7 | class RedisPipeline(object): 8 | """Pushes serialized item into a redis list/queue""" 9 | 10 | def __init__(self, host, port): 11 | self.server = redis.Redis(host, port) 12 | self.encoder = ScrapyJSONEncoder() 13 | 14 | @classmethod 15 | def from_settings(cls, settings): 16 | host = settings.get('REDIS_HOST', 'localhost') 17 | port = settings.get('REDIS_PORT', 6379) 18 | return cls(host, port) 19 | 20 | def process_item(self, item, spider): 21 | return deferToThread(self._process_item, item, spider) 22 | 23 | def _process_item(self, item, spider): 24 | key = self.item_key(item, spider) 25 | data = self.encoder.encode(dict(item)) 26 | self.server.rpush(key, data) 27 | return item 28 | 29 | def item_key(self, item, spider): 30 | """Returns redis key based on given spider""" 31 | return "%s:items" % spider.name 32 | 33 | -------------------------------------------------------------------------------- /weibosearch/redis/queue.py: -------------------------------------------------------------------------------- 1 | import marshal 2 | from scrapy.utils.reqser import request_to_dict, request_from_dict 3 | 4 | class SpiderQueue(object): 5 | """Per-spider queue abstraction on top of redis using sorted set""" 6 | 7 | def __init__(self, server, spider, key): 8 | """Initialize per-spider redis queue 9 | 10 | Parameters: 11 | redis -- redis connection 12 | spider -- spider instance 13 | key -- key for this queue (e.g. 
"%(spider)s:queue") 14 | 15 | """ 16 | self.redis = server 17 | self.spider = spider 18 | self.key = key % {'spider': spider.name} 19 | 20 | def __len__(self): 21 | return self.redis.zcard(self.key) 22 | 23 | def push(self, request): 24 | data = marshal.dumps(request_to_dict(request, self.spider)) 25 | pairs = {data: -request.priority} 26 | self.redis.zadd(self.key, **pairs) 27 | 28 | def pop(self): 29 | # use atomic range/remove using multi/exec 30 | pipe = self.redis.pipeline() 31 | pipe.multi() 32 | pipe.zrange(self.key, 0, 0).zremrangebyrank(self.key, 0, 0) 33 | results, count = pipe.execute() 34 | if results: 35 | return request_from_dict(marshal.loads(results[0]), self.spider) 36 | 37 | def clear(self): 38 | self.redis.delete(self.key) 39 | -------------------------------------------------------------------------------- /weibosearch/redis/scheduler.py: -------------------------------------------------------------------------------- 1 | import redis 2 | from weibosearch.redis.queue import SpiderQueue 3 | from weibosearch.redis.dupefilter import RFPDupeFilter 4 | 5 | # default values 6 | REDIS_HOST = 'localhost' 7 | REDIS_PORT = 6379 8 | SCHEDULER_PERSIST = True 9 | QUEUE_KEY = '%(spider)s:requests' 10 | DUPEFILTER_KEY = '%(spider)s:dupefilter' 11 | 12 | class Scheduler(object): 13 | """Redis-based scheduler""" 14 | 15 | def __init__(self, redis, persist, queue_key): 16 | self.server = redis 17 | self.persist = persist 18 | self.queue_key = queue_key 19 | # in-memory queue 20 | self.own_queue = [] 21 | 22 | def __len__(self): 23 | return len(self.queue) 24 | 25 | @classmethod 26 | def from_settings(cls, settings): 27 | host = settings.get('REDIS_HOST', REDIS_HOST) 28 | port = settings.get('REDIS_PORT', REDIS_PORT) 29 | persist = settings.get('SCHEDULER_PERSIST', SCHEDULER_PERSIST) 30 | queue_key = settings.get('SCHEDULER_QUEUE_KEY', QUEUE_KEY) 31 | server = redis.Redis(host, port) 32 | return cls(server, persist, queue_key) 33 | 34 | @classmethod 35 | def from_crawler(cls, crawler): 36 | settings = crawler.settings 37 | return cls.from_settings(settings) 38 | 39 | def open(self, spider): 40 | self.spider = spider 41 | self.queue = SpiderQueue(self.server, spider, self.queue_key) 42 | self.df = RFPDupeFilter(self.server, DUPEFILTER_KEY % {'spider': spider.name}) 43 | # notice if there are requests already in the queue 44 | if not self.persist: 45 | self.df.clear() 46 | self.queue.clear() 47 | 48 | if len(self.queue): 49 | spider.log("Resuming crawl (%d requests scheduled)" % len(self.queue)) 50 | 51 | def close(self, reason): 52 | pass 53 | 54 | def enqueue_request(self, request): 55 | if not request.dont_filter and self.df.request_seen(request): 56 | return 57 | if self.spider.logined: 58 | self.queue.push(request) 59 | else: 60 | self.own_queue.append(request) 61 | 62 | def next_request(self): 63 | if self.spider.logined: 64 | return self.queue.pop() 65 | if len(self.own_queue) > 0: 66 | return self.own_queue.pop() 67 | 68 | def has_pending_requests(self): 69 | if self.spider.logined: 70 | return len(self) > 0 71 | return len(self.own_queue) 72 | 73 | -------------------------------------------------------------------------------- /weibosearch/settings.py: -------------------------------------------------------------------------------- 1 | # Scrapy settings for scrapy_weibo project 2 | # 3 | # For simplicity, this file contains only the most important settings by 4 | # default. 
All the other settings are documented here:
 5 | #
 6 | #     http://doc.scrapy.org/topics/settings.html
 7 | #
 8 | 
 9 | BOT_NAME = 'weibosearch'
10 | 
11 | SPIDER_MODULES = ['weibosearch.spiders']
12 | NEWSPIDER_MODULE = 'weibosearch.spiders'
13 | 
14 | # redis config
15 | REDIS_HOST = 'localhost'
16 | REDIS_PORT = 6379
17 | 
18 | # scheduler config
19 | SCHEDULER_PERSIST = True
20 | QUEUE_KEY = '%(spider)s:requests'
21 | DUPEFILTER_KEY = '%(spider)s:dupefilter'
22 | SCHEDULER = "weibosearch.redis.scheduler.Scheduler"
23 | 
24 | # pipelines config
25 | ITEM_PIPELINES = ['weibosearch.pipelines.ScrapyWeiboPipeline']
26 | 
27 | DOWNLOAD_DELAY = 10
28 | 
29 | TIME_DELTA = 30
30 | 
31 | # bootstrap keywords from a file (items.txt) or from the db
32 | BOOTSTRAP = 'file'
33 | 
34 | # upper bound on how many feeds to fetch for one keyword item
35 | FEED_LIMIT = 300000
--------------------------------------------------------------------------------
/weibosearch/sina/__init__.py:
--------------------------------------------------------------------------------
1 | from datetime import datetime
2 | 
3 | def _epoch():
4 |     return datetime(2009, 8, 16)
5 | 
6 | 
--------------------------------------------------------------------------------
/weibosearch/sina/weibo.py:
--------------------------------------------------------------------------------
 1 | #coding=utf8
 2 | # original from http://www.douban.com/note/201767245/
 3 | # also see http://www.cnblogs.com/mouse-coder/archive/2013/03/03/2941265.html for recent changes in the weibo login
 4 | # modified by tpeng
 5 | # 2012/9/20
 6 | 
 7 | import urllib
 8 | import urllib2
 9 | import cookielib
10 | import base64
11 | import re, sys, json
12 | import binascii
13 | import rsa
14 | 
15 | postdata = {
16 |     'entry': 'weibo',
17 |     'gateway': '1',
18 |     'from': '',
19 |     'savestate': '7',
20 |     'userticket': '1',
21 |     'ssosimplelogin': '1',
22 |     'vsnf': '1',
23 |     'vsnval': '',
24 |     'su': '',
25 |     'service': 'miniblog',
26 |     'servertime': '',
27 |     'nonce': '',
28 |     'pwencode': 'rsa2',
29 |     'sp': '',
30 |     'encoding': 'UTF-8',
31 |     'url': 'http://weibo.com/ajaxlogin.php?framelogin=1&callback=parent.sinaSSOController.feedBackUrlCallBack',
32 |     'returntype': 'META'
33 | }
34 | 
35 | class Weibo():
36 |     def __init__(self):
37 |         # create a cookie jar to store the login cookies
38 |         self.cj = cookielib.LWPCookieJar()
39 | 
40 |         # bind the cookie jar to an HTTP cookie processor
41 |         cookie_support = urllib2.HTTPCookieProcessor(self.cj)
42 | 
43 |         # build an opener from the cookie-aware processor plus a handler for opening HTTP URLs
44 |         opener = urllib2.build_opener(cookie_support, urllib2.HTTPHandler)
45 | 
46 |         # install the opener globally so every urllib2 request carries the cookies
47 |         urllib2.install_opener(opener)
48 | 
49 |     def _get_servertime(self, username):
50 |         url = 'http://login.sina.com.cn/sso/prelogin.php?entry=sso&callback=sinaSSOController.preloginCallBack&su=%s&rsakt=mod&client=ssologin.js(v1.4.4)' % username
51 |         data = urllib2.urlopen(url).read()
52 |         p = re.compile('\((.*)\)')
53 |         json_data = p.search(data).group(1)
54 |         data = json.loads(json_data)
55 |         servertime = str(data['servertime'])
56 |         nonce = data['nonce']
57 |         pubkey = data['pubkey']
58 |         rsakv = data['rsakv']
59 |         return servertime, nonce, pubkey, rsakv
60 | 
61 |     def _get_pwd(self, pwd, servertime, nonce, pubkey):
62 |         rsaPublickey = int(pubkey, 16)
63 |         key = rsa.PublicKey(rsaPublickey, 65537)
64 |         message = str(servertime) + '\t' + str(nonce) + '\n' + str(pwd)
65 |         pwd = rsa.encrypt(message, key)
66 |         return binascii.b2a_hex(pwd)
67 | 
68 |     def _get_user(self, username):
69 |         username_ = urllib.quote(username)
70 |         username = 
base64.encodestring(username_)[:-1] 71 | return username 72 | 73 | def login(self, username, pwd): 74 | url = 'http://login.sina.com.cn/sso/login.php?client=ssologin.js(v1.4.4)' 75 | try: 76 | servertime, nonce, pubkey, rsakv = self._get_servertime(username) 77 | except: 78 | print >> sys.stderr, 'Get severtime error!' 79 | return None 80 | global postdata 81 | postdata['servertime'] = servertime 82 | postdata['nonce'] = nonce 83 | postdata['su'] = self._get_user(username) 84 | postdata['sp'] = self._get_pwd(pwd, servertime, nonce, pubkey) 85 | postdata['rsakv'] = rsakv 86 | postdata = urllib.urlencode(postdata) 87 | headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux i686; rv:8.0) Gecko/20100101 Firefox/8.0'} 88 | 89 | req = urllib2.Request( 90 | url=url, 91 | data=postdata, 92 | headers=headers 93 | ) 94 | 95 | result = urllib2.urlopen(req) 96 | text = result.read() 97 | p = re.compile('location\.replace\([\'|"](.*?)[\'|"]\)') 98 | try: 99 | return p.search(text).group(1) 100 | except: 101 | return None 102 | 103 | if __name__ == '__main__': 104 | weibo = Weibo() 105 | # weibo.login('your weibo account', 'your password') 106 | -------------------------------------------------------------------------------- /weibosearch/spiders/WeiboSearchSpider.py: -------------------------------------------------------------------------------- 1 | #coding=utf-8 2 | # weibosearch spider 3 | # tpeng 4 | # 5 | import codecs 6 | from datetime import datetime, timedelta 7 | import urllib 8 | import MySQLdb 9 | from scrapy import log 10 | from scrapy.conf import settings 11 | from scrapy.exceptions import CloseSpider 12 | from scrapy.http import Request 13 | from scrapy.spider import BaseSpider 14 | from weibosearch.feeds import SearchPage 15 | from weibosearch.items import ScrapyWeiboItem 16 | import re, json 17 | from pyquery import PyQuery as pq 18 | from lxml.html import tostring 19 | from weibosearch.query import QueryFactory 20 | from weibosearch.sina.weibo import Weibo 21 | from weibosearch.sina import _epoch 22 | from weibosearch.timerange import daterange 23 | 24 | # default values 25 | REDIS_HOST = 'localhost' 26 | REDIS_PORT = 6379 27 | 28 | class WeiboSearchSpider(BaseSpider): 29 | name = 'weibosearch' 30 | allowed_domains = ['weibo.com'] 31 | weibo = Weibo() 32 | # allow save to db 33 | savedb = 'True' 34 | username = 'YOUR_WEIBO_ACCOUNT' 35 | password = 'YOUR_WEIBO_PASSWORD' 36 | 37 | def __init__(self, name=None, **kwargs): 38 | super(WeiboSearchSpider, self).__init__(name, **kwargs) 39 | if not self.savedb: 40 | self.db = MySQLdb.connect(host="localhost", port=3306, user="root", passwd="pw", db="weibosearch2", 41 | charset='utf8', use_unicode=True) 42 | self.cursor = self.db.cursor() 43 | self.logined = False 44 | 45 | self.log('login with %s' % self.username) 46 | login_url = self.weibo.login(self.username, self.password) 47 | if login_url: 48 | self.start_urls.append(login_url) 49 | 50 | # only parse the login page 51 | def parse(self, response): 52 | if response.body.find('feedBackUrlCallBack') != -1: 53 | data = json.loads(re.search(r'feedBackUrlCallBack\((.*?)\)', response.body, re.I).group(1)) 54 | userinfo = data.get('userinfo', '') 55 | if len(userinfo): 56 | log.msg('user id %s' % userinfo['userid'], level=log.INFO) 57 | assert userinfo['userid'] == self.username 58 | self.logined = True 59 | 60 | bootstrap = settings.get('BOOTSTRAP') 61 | log.msg('bootstrap from %s' % bootstrap, level=log.INFO) 62 | # FIXME: use last scheduled time instead of today, otherwise queue filter will not work 
63 | today = datetime.now() 64 | if bootstrap == 'file': 65 | lines = tuple(codecs.open('items.txt', 'r', 'utf-8')) 66 | for line in lines: 67 | if line.startswith("#"): 68 | continue 69 | start = _epoch() 70 | url = QueryFactory.create_timerange_query(urllib.quote(line.encode('utf8')), start, today) 71 | request = Request(url=url, callback=self.parse_weibo, meta={ 72 | 'query': line, 73 | 'start': start.strftime("%Y-%m-%d %H:%M:%S"), 74 | 'end': today.strftime("%Y-%m-%d %H:%M:%S"), 75 | 'last_fetched': today.strftime("%Y-%m-%d %H:%M:%S")}) 76 | yield request 77 | else: 78 | self.log('login failed: errno=%s, reason=%s' % (data.get('errno', ''), data.get('reason', ''))) 79 | 80 | # TODO: can also bootstrap from db 81 | 82 | def parse_weibo(self, response): 83 | query = response.request.meta['query'] 84 | start = datetime.strptime(response.request.meta['start'], "%Y-%m-%d %H:%M:%S") 85 | end = datetime.strptime(response.request.meta['end'], "%Y-%m-%d %H:%M:%S") 86 | range = daterange(start, end).delta() 87 | last_fetched = datetime.strptime(response.request.meta['last_fetched'], "%Y-%m-%d %H:%M:%S") 88 | 89 | jQuery = pq(response.body) 90 | scripts = jQuery('script') 91 | 92 | text = "".join(filter(lambda x: x is not None, [x.text for x in scripts])) 93 | # check if we exceed the sina limit 94 | sassfilter_match = re.search(r'{(\"pid\":\"pl_common_sassfilter\".*?)}', text, re.M | re.I) 95 | if sassfilter_match: 96 | raise CloseSpider('weibo search exceeded') 97 | 98 | # check the num of search results 99 | totalshow_match = re.search(r'{(\"pid\":\"pl_common_totalshow\".*?)}', text, re.M | re.I) 100 | if totalshow_match: 101 | html = json.loads(totalshow_match.group())['html'] 102 | if len(html) == 0: 103 | raise CloseSpider('not login? %s' % html) 104 | totalshow = pq(html) 105 | if totalshow('div.topcon_l').html() is None: 106 | log.msg('%s 0 feeds' % query, level=log.INFO) 107 | return 108 | topcon_num = int(re.search('\s(\d+)\s', totalshow('div.topcon_l').text().replace(',', ''), re.I).group(1)) 109 | log.msg('%s %d feeds' % (query, topcon_num), level=log.INFO) 110 | max_feeds = settings.getint('FEED_LIMIT', 200000) 111 | if topcon_num > max_feeds: 112 | log.msg('too much (%d) result for %s.' % (topcon_num, query), logLevel=log.WARNING) 113 | elif 1000 < topcon_num < max_feeds: 114 | # weibo search only allow 20 feeds on 1 page and at most 50 pages. 
115 | days = range.days / float(2) 116 | middle = start + timedelta(days) 117 | 118 | # first part 119 | url = QueryFactory.create_timerange_query(urllib.quote(query.encode('utf8')), start, middle) 120 | request = Request(url=url, callback=self.parse_weibo) 121 | request.meta['query'] = query 122 | request.meta['start'] = start.strftime("%Y-%m-%d %H:%M:%S") 123 | request.meta['end'] = middle.strftime("%Y-%m-%d %H:%M:%S") 124 | request.meta['priority'] = days / 2 125 | request.meta['last_fetched'] = last_fetched.strftime("%Y-%m-%d %H:%M:%S") 126 | yield request 127 | 128 | # second part 129 | url2 = QueryFactory.create_timerange_query(urllib.quote(query.encode('utf8')), middle, end) 130 | request2 = Request(url=url2, callback=self.parse_weibo) 131 | request2.meta['query'] = query 132 | request2.meta['start'] = middle.strftime("%Y-%m-%d %H:%M:%S") 133 | request2.meta['end'] = end.strftime("%Y-%m-%d %H:%M:%S") 134 | request2.meta['priority'] = days / 2 135 | request2.meta['last_fetched'] = last_fetched.strftime("%Y-%m-%d %H:%M:%S") 136 | yield request2 137 | else: 138 | # check the feeds update 139 | feedlist_match = re.search(r'{(\"pid\":\"pl_weibo_feedlist\".*?)}', text, re.M | re.I) 140 | if feedlist_match: 141 | search_results = pq(json.loads(feedlist_match.group())['html']) 142 | feeds = search_results('dl.feed_list') 143 | search_pages = search_results('ul.search_page_M') 144 | pages = SearchPage.wrap(search_pages) 145 | 146 | # send the items to pipeline 147 | for feed in feeds: 148 | item = ScrapyWeiboItem() 149 | item['html'] = tostring(feed) 150 | yield item 151 | # skip first page and request other pages 152 | for i in xrange(2, len(pages)): 153 | query = pages[i] 154 | log.msg('%s' % query) 155 | request = Request(url=query, callback=self.parse_page) 156 | request.meta['query'] = query 157 | yield request 158 | 159 | # parse single weibo page 160 | def parse_page(self, response): 161 | jQuery = pq(response.body) 162 | scripts = jQuery('script') 163 | for script in scripts: 164 | match = re.search(r'{(\"pid\":\"pl_weibo_feedlist\".*)}', unicode(script.text), re.M | re.I) 165 | if match: 166 | search_results = pq(json.loads(match.group())['html']) 167 | feeds = search_results('dl.feed_list') 168 | for feed in feeds: 169 | item = ScrapyWeiboItem() 170 | item['html'] = tostring(feed) 171 | yield item 172 | 173 | 174 | 175 | -------------------------------------------------------------------------------- /weibosearch/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /weibosearch/timerange/__init__.py: -------------------------------------------------------------------------------- 1 | # tpeng: from https://github.com/charettes/python-timerange @c7ccf80038e2894b8108999b745697283a7945d4 2 | # maybe use git submodule is a better idea? 
3 | 4 | from datetime import time, date, datetime 5 | 6 | __all__ = ('datetimerange', 'daterange', 'timerange') 7 | 8 | class temporalrange: 9 | def __init__(self, frm, to): 10 | if not isinstance(frm, self.temporal): 11 | raise TypeError("frm must be an instance of %s" % self.temporal.__name__) 12 | 13 | if not isinstance(to, self.temporal): 14 | raise TypeError("to must be an instance of %s" % self.temporal.__name__) 15 | 16 | self.frm, self.to = sorted([frm, to]) 17 | 18 | def intersection(self, range): 19 | if not isinstance(range, self.__class__): 20 | raise TypeError("can only apply intersections between two instances of %s" %\ 21 | self.__class__.__name__) 22 | 23 | frm = max(self.frm, range.frm) 24 | to = min(self.to, range.to) 25 | 26 | if frm > to: 27 | return None 28 | else: 29 | return self.__class__(frm, to) 30 | 31 | __and__ = intersection 32 | 33 | def include(self, range): 34 | """Test if the range is contained within this one""" 35 | return range.frm >= self.frm and range.to <= self.to 36 | 37 | def delta(self): 38 | """Fetch the datetime.timedelta between self.frm and self.to""" 39 | return self.to - self.frm 40 | 41 | def __contains__(self, item): 42 | if not isinstance(item, self.temporal): 43 | raise TypeError("a %s only contains instances of %s" % (self.__class__.__name__, 44 | self.temporal.__name__)) 45 | 46 | return item >= self.frm and item <= self.to 47 | 48 | def __eq__(self, other): 49 | if not isinstance(other, self.__class__): 50 | return NotImplemented 51 | return self.to == other.to and self.frm == other.frm 52 | 53 | def __ne__(self, other): 54 | return not self.__eq__(other) 55 | 56 | def __hash__(self): 57 | return hash(repr(self)) 58 | 59 | def __repr__(self): 60 | return "timerange.%s(%s, %s)" % (self.__class__.__name__, repr(self.frm), repr(self.to)) 61 | 62 | 63 | class datetimerange(temporalrange): 64 | """ 65 | 66 | >>> a = datetime(2010, 5, 14, 23) 67 | >>> b = datetime(2010, 5, 15) 68 | >>> c = datetime(2010, 5, 15, 3) 69 | >>> d = datetime(2010, 5, 16) 70 | >>> e = datetime(2010, 5, 17) 71 | 72 | >>> datetimerange(a, 12) 73 | Traceback (most recent call last): 74 | ... 75 | TypeError: to must be an instance of datetime 76 | 77 | >>> datetimerange(b, a).frm == a 78 | True 79 | 80 | >>> hash(datetimerange(a, b)) == hash(datetimerange(a, b)) 81 | True 82 | 83 | >>> repr(datetimerange(a, b)) 84 | 'timerange.datetimerange(datetime.datetime(2010, 5, 14, 23, 0), datetime.datetime(2010, 5, 15, 0, 0))' 85 | 86 | >>> datetimerange(a, b).delta() 87 | datetime.timedelta(0, 3600) 88 | 89 | >>> a == datetime(2010, 5, 14, 23) 90 | True 91 | 92 | >>> a == b 93 | False 94 | 95 | >>> b in datetimerange(a, c) 96 | True 97 | 98 | >>> c in datetimerange(a, b) 99 | False 100 | 101 | >>> 2 in datetimerange(a, c) 102 | Traceback (most recent call last): 103 | ... 104 | TypeError: a datetimerange only contains instances of datetime 105 | 106 | >>> c in datetimerange(a, d) & datetimerange(b, e) 107 | True 108 | 109 | >>> datetimerange(a, b) & datetimerange(d, e) 110 | 111 | >>> datetimerange(a, d).include(datetimerange(a, c)) 112 | True 113 | 114 | >>> datetimerange(a, c).include(datetimerange(a, d)) 115 | False 116 | """ 117 | temporal = datetime 118 | 119 | 120 | class daterange(temporalrange): 121 | """ 122 | 123 | >>> a = date(2010, 5, 14) 124 | >>> b = date(2010, 5, 15) 125 | >>> c = date(2010, 5, 17) 126 | >>> d = date(2010, 5, 18) 127 | >>> e = date(2010, 5, 19) 128 | 129 | >>> daterange(a, 12) 130 | Traceback (most recent call last): 131 | ... 
132 | TypeError: to must be an instance of date 133 | 134 | >>> daterange(b, a).frm == a 135 | True 136 | 137 | >>> hash(daterange(a, b)) == hash(daterange(a, b)) 138 | True 139 | 140 | >>> repr(daterange(a, b)) 141 | 'timerange.daterange(datetime.date(2010, 5, 14), datetime.date(2010, 5, 15))' 142 | 143 | >>> daterange(a, b).delta() 144 | datetime.timedelta(1) 145 | 146 | >>> a == date(2010, 5, 14) 147 | True 148 | 149 | >>> a == b 150 | False 151 | 152 | >>> b in daterange(a, c) 153 | True 154 | 155 | >>> c in daterange(a, b) 156 | False 157 | 158 | >>> 2 in daterange(a, c) 159 | Traceback (most recent call last): 160 | ... 161 | TypeError: a daterange only contains instances of date 162 | 163 | >>> c in daterange(a, d) & daterange(b, e) 164 | True 165 | 166 | >>> daterange(a, b) & daterange(d, e) 167 | 168 | >>> daterange(a, d).include(daterange(a, c)) 169 | True 170 | 171 | >>> daterange(a, c).include(daterange(a, d)) 172 | False 173 | """ 174 | temporal = date 175 | 176 | 177 | class timerange(temporalrange): 178 | """ 179 | 180 | >>> a = time(12, 15) 181 | >>> b = time(12, 25) 182 | >>> c = time(13, 40) 183 | >>> d = time(16, 20) 184 | >>> e = time(18, 50) 185 | 186 | >>> timerange(a, 12) 187 | Traceback (most recent call last): 188 | ... 189 | TypeError: to must be an instance of time 190 | 191 | >>> timerange(b, a).frm == a 192 | True 193 | 194 | >>> hash(timerange(a, b)) == hash(timerange(a, b)) 195 | True 196 | 197 | >>> repr(timerange(a, b)) 198 | 'timerange.timerange(datetime.time(12, 15), datetime.time(12, 25))' 199 | 200 | >>> timerange(a, b).delta() 201 | datetime.timedelta(0, 600) 202 | 203 | >>> a == time(12, 15) 204 | True 205 | 206 | >>> a == b 207 | False 208 | 209 | >>> b in timerange(a, c) 210 | True 211 | 212 | >>> c in timerange(a, b) 213 | False 214 | 215 | >>> 2 in timerange(a, c) 216 | Traceback (most recent call last): 217 | ... 218 | TypeError: a timerange only contains instances of time 219 | 220 | >>> c in timerange(a, d) & timerange(b, e) 221 | True 222 | 223 | >>> timerange(a, b) & timerange(d, e) 224 | 225 | >>> timerange(a, d).include(timerange(a, c)) 226 | True 227 | 228 | >>> timerange(a, c).include(timerange(a, d)) 229 | False 230 | """ 231 | temporal = time 232 | 233 | #@staticmethod 234 | def __datetime(time): 235 | return datetime.min.replace( 236 | hour=time.hour, 237 | minute=time.minute, 238 | second=time.second, 239 | microsecond=time.microsecond, 240 | tzinfo=time.tzinfo 241 | ) 242 | 243 | __datetime = staticmethod(__datetime) 244 | 245 | def delta(self): 246 | return timerange.__datetime(self.to) - timerange.__datetime(self.frm) 247 | 248 | if __name__ == "__main__": 249 | import doctest 250 | 251 | doctest.testmod() 252 | --------------------------------------------------------------------------------
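Taken together, the spider works by shrinking time windows rather than paging past Sina's limits: a single timescope query yields at most about 50 pages of 20 feeds, so parse_weibo halves the date range and re-issues both halves whenever the reported total is above 1,000 (and below FEED_LIMIT). The sketch below replays that idea outside Scrapy, assuming Python 2 and the project root on the path; `split_queries`, `estimated_feeds` and `per_query_cap` are illustrative names invented here, not part of the project.

```python
# -*- coding: utf-8 -*-
# Illustrative only: recursive date-range splitting in the spirit of
# WeiboSearchSpider.parse_weibo, without Scrapy, Redis or MySQL.
import urllib
from datetime import datetime, timedelta

from weibosearch.query import QueryFactory
from weibosearch.sina import _epoch
from weibosearch.timerange import daterange

def split_queries(keyword, start, end, estimated_feeds, per_query_cap=1000):
    """Yield (url, start, end) windows small enough for one timescope query."""
    if estimated_feeds <= per_query_cap:
        url = QueryFactory.create_timerange_query(urllib.quote(keyword.encode('utf8')), start, end)
        yield url, start, end
        return
    # same halving as parse_weibo: assume feeds spread roughly evenly over the range
    middle = start + timedelta(days=daterange(start, end).delta().days / 2.0)
    for s, e in ((start, middle), (middle, end)):
        for window in split_queries(keyword, s, e, estimated_feeds / 2, per_query_cap):
            yield window

if __name__ == '__main__':
    # pretend the totalshow block reported 3500 hits for this keyword
    for url, s, e in split_queries(u'scrapy', _epoch(), datetime.now(), 3500):
        print s, '->', e
        print url
```

In the real spider the estimate comes from the pl_common_totalshow block of the first result page, and every generated request still passes through the Redis RFPDupeFilter and the sorted-set queue before it is fetched.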