├── .gitignore ├── README.md ├── example.py ├── scrapy.ini ├── weibo_login.py └── weibo_scrapy.py /.gitignore: -------------------------------------------------------------------------------- 1 | ################# 2 | ## Eclipse 3 | ################# 4 | 5 | *.pydevproject 6 | .project 7 | .metadata 8 | bin/ 9 | tmp/ 10 | *.tmp 11 | *.bak 12 | *.swp 13 | *~.nib 14 | local.properties 15 | .classpath 16 | .settings/ 17 | .loadpath 18 | *~ 19 | *txt$ 20 | *TXT$ 21 | following* 22 | *csv 23 | *txt 24 | # External tool builders 25 | .externalToolBuilders/ 26 | 27 | # Locally stored "Eclipse launch configurations" 28 | *.launch 29 | 30 | # CDT-specific 31 | .cproject 32 | 33 | # PDT-specific 34 | .buildpath 35 | 36 | 37 | ################# 38 | ## Visual Studio 39 | ################# 40 | 41 | ## Ignore Visual Studio temporary files, build results, and 42 | ## files generated by popular Visual Studio add-ons. 43 | 44 | # User-specific files 45 | *.suo 46 | *.user 47 | *.sln.docstates 48 | 49 | # Build results 50 | [Dd]ebug/ 51 | [Rr]elease/ 52 | *_i.c 53 | *_p.c 54 | *.ilk 55 | *.meta 56 | *.obj 57 | *.pch 58 | *.pdb 59 | *.pgc 60 | *.pgd 61 | *.rsp 62 | *.sbr 63 | *.tlb 64 | *.tli 65 | *.tlh 66 | *.tmp 67 | *.vspscc 68 | .builds 69 | *.dotCover 70 | 71 | 72 | # Visual C++ cache files 73 | ipch/ 74 | *.aps 75 | *.ncb 76 | *.opensdf 77 | *.sdf 78 | 79 | # Visual Studio profiler 80 | *.psess 81 | *.vsp 82 | 83 | # ReSharper is a .NET coding add-in 84 | _ReSharper* 85 | 86 | # Installshield output folder 87 | [Ee]xpress 88 | 89 | # DocProject is a documentation generator add-in 90 | DocProject/buildhelp/ 91 | DocProject/Help/*.HxT 92 | DocProject/Help/*.HxC 93 | DocProject/Help/*.hhc 94 | DocProject/Help/*.hhk 95 | DocProject/Help/*.hhp 96 | DocProject/Help/Html2 97 | DocProject/Help/html 98 | 99 | # Click-Once directory 100 | publish 101 | 102 | # Others 103 | [Bb]in 104 | [Oo]bj 105 | sql 106 | TestResults 107 | *.Cache 108 | ClientBin 109 | stylecop.* 110 | ~$* 111 | *.dbmdl 112 | Generated_Code #added for RIA/Silverlight projects 113 | 114 | # Backup & report files from converting an old project file to a newer 115 | # Visual Studio version. Backup files are not needed, because we have git ;-) 116 | _UpgradeReport_Files/ 117 | Backup*/ 118 | UpgradeLog*.XML 119 | 120 | 121 | 122 | ############ 123 | ## Windows 124 | ############ 125 | 126 | # Windows image file caches 127 | Thumbs.db 128 | 129 | # Folder config file 130 | Desktop.ini 131 | 132 | 133 | ############# 134 | ## Python 135 | ############# 136 | 137 | *.py[co] 138 | 139 | # Packages 140 | *.egg 141 | *.egg-info 142 | dist 143 | build 144 | eggs 145 | parts 146 | bin 147 | var 148 | sdist 149 | develop-eggs 150 | .installed.cfg 151 | 152 | # Installer logs 153 | pip-log.txt 154 | 155 | # Unit test / coverage reports 156 | .coverage 157 | .tox 158 | 159 | #Translations 160 | *.mo 161 | 162 | #Mr Developer 163 | .mr.developer.cfg 164 | 165 | # Mac crap 166 | .DS_Store 167 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | WEIBO_SCRAPY 2 | ============ 3 | 4 | WEIBO\_SCRAPY是一个PYTHON实现的,使用多线程抓取WEIBO信息的框架。WEIBO\_SCRAPY框架给用户提供WEIBO的模拟登录和多线程抓取微博信息的接口,让用户只需关心抓取的业务逻辑,而不用处理棘手的WEIBO模拟登录和多线程编程。 5 | 6 | WEIBO\_SCRAPY is a **Multi-Threading** SINA WEIBO data extraction Framework in Python. 
WEIBO\_SCRAPY provides a WEIBO login simulator and a multi-threading interface for WEIBO data extraction. It saves users a lot of time by freeing them from writing a WEIBO login simulator from scratch and from multi-threading programming, so they can focus on their own **extraction** logic.
  7 | 
  8 | 
  9 | =======
 10 | 
 11 | ### WEIBO\_SCRAPY的功能
 12 | 1\. 微博模拟登录
 13 | 
 14 | 2\. 多线程抓取框架
 15 | 
 16 | 3\. **抓取任务**接口
 17 | 
 18 | 4\. 抓取参数配置
 19 | 
 20 | ### WEIBO\_SCRAPY Provides
 21 | 1\. WEIBO Login Simulator
 22 | 
 23 | 2\. Multi-Threading Extraction Framework
 24 | 
 25 | 3\. **Extraction Task** Interface
 26 | 
 27 | 4\. Easy Way of Parameters Configuration
 28 | 
 29 | ### How to Use WEIBO\_SCRAPY
 30 |     #!/usr/bin/env python
 31 |     #coding=utf8
 32 | 
 33 |     from weibo_scrapy import scrapy
 34 | 
 35 |     class my_scrapy(scrapy):
 36 | 
 37 |         def scrapy_do_task(self, uid=None):
 38 |             '''
 39 |             User needs to override this method to perform a uid-based scrapy task.
 40 |             @param uid: weibo uid
 41 |             @return: a list of uids gained from this task, optional
 42 |             '''
 43 |             #do what you want with uid here; this scrapy is uid based, so make sure there are
 44 |             #uids in the task queue, or return new uids from this method
 45 | 
 46 |             print 'scraping uid %s ...' % uid
 47 | 
 48 |             return []    #replace with the list of uids gained from this task
 49 | 
 50 |     if __name__ == '__main__':
 51 | 
 52 |         s = my_scrapy(uids_file = 'uids_all.txt', config = 'my.ini')
 53 |         s.scrapy()
 54 | 
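For a more concrete picture, the sketch below fills in `scrapy_do_task` so that it really returns a uid list. It is only a hedged illustration: the follow-page URL and the regular expression are placeholder assumptions, not part of this project; the only contract taken from the framework is that the method receives a uid and returns a list of uids (login cookies are already installed into `urllib2` by the login step).

    #!/usr/bin/env python
    #coding=utf8

    import re
    import urllib2

    from weibo_scrapy import scrapy

    class follow_scrapy(scrapy):

        def scrapy_do_task(self, uid=None):
            '''
            Collect uids related to `uid` and return them as a list,
            so the framework can queue them for further crawling.
            '''
            #NOTE: the URL and the regex below are placeholders; adjust them
            #to whatever page you actually want to crawl
            page = urllib2.urlopen('http://weibo.com/%s/follow' % uid).read()
            return re.findall(r'uid=(\d+)', page)

    if __name__ == '__main__':

        s = follow_scrapy(start_uid = '1197161814')
        s.scrapy()
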
 55 | ### 相关阅读(Readings)
 56 | [基于UID的WEIBO信息抓取框架WEIBO_SCRAPY](http://yoyzhou.github.io/blog/2013/04/08/weibo-scrapy-framework-with-multi-threading/)
 57 | 
--------------------------------------------------------------------------------
/example.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | #coding=utf8
  3 | 
  4 | from weibo_scrapy import scrapy
  5 | 
  6 | 
  7 | class my_scrapy(scrapy):
  8 | 
  9 |     def scrapy_do_task(self, uid=None):
 10 |         '''
 11 |         User needs to override this method to perform a uid-based scrapy task.
 12 |         @param uid: weibo uid
 13 |         @return: a list of uids gained from this task, optional
 14 |         '''
 15 |         #do what you want with uid here; this scrapy is uid based, so make sure there are
 16 |         #uids in the task queue, or return new uids from this method
 17 | 
 18 |         print 'scraping uid %s ...' % uid
 19 | 
 20 |         return []    #replace with the list of uids gained from this task
 21 | 
 22 | if __name__ == '__main__':
 23 | 
 24 |     s = my_scrapy(start_uid = '1197161814')
 25 |     s.scrapy()
 26 | 
--------------------------------------------------------------------------------
/scrapy.ini:
--------------------------------------------------------------------------------
  1 | [login_account_info]
  2 | #account info for login
  3 | login_username = ur_weibo_account_id_here
  4 | login_uid = 1248521225
  5 | login_password = ur_weibo_account_password_here
  6 | cookies_file = weibo_cookies.dat
  7 | 
  8 | [scrapy_settings]
  9 | thread_number = 50
 10 | wanted = 100000
 11 | #only one of the two properties below is required; start_uid takes precedence over uids_file
 12 | #also note that arguments passed to the constructor override these two properties
 13 | start_uid = 1248521225
 14 | uids_file = 
 15 | 
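As a small illustration of the precedence rules noted in the ini file above (the file names here are placeholders), constructor arguments win over the ini values, and `start_uid`/`uids_file` may not both be passed as arguments:

    from example import my_scrapy

    #thread_number and uids_file below override whatever scrapy.ini specifies;
    #passing both start_uid and uids_file as arguments raises an exception
    s = my_scrapy(config = 'scrapy.ini', thread_number = 10, uids_file = 'uids_all.txt')
    s.scrapy()
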
--------------------------------------------------------------------------------
/weibo_login.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | #coding=utf8
  3 | 
  4 | '''
  5 | Created on Mar 18, 2013
  6 | 
  7 | @author: yoyzhou
  8 | '''
  9 | 
 10 | try:
 11 |     import os
 12 |     import sys
 13 |     import urllib
 14 |     import urllib2
 15 |     import cookielib
 16 |     import base64
 17 |     import re
 18 |     import hashlib
 19 |     import json
 20 |     import rsa
 21 |     import binascii
 22 | 
 23 | except ImportError:
 24 |     print >> sys.stderr, """\
 25 | 
 26 | There was a problem importing one of the Python modules required.
 27 | The error leading to this problem was:
 28 | 
 29 | %s
 30 | 
 31 | Please install a package which provides this module, or
 32 | verify that the module is installed correctly.
 33 | 
 34 | It's possible that the above module doesn't match the current version of Python,
 35 | which is:
 36 | 
 37 | %s
 38 | 
 39 | """ % (sys.exc_info(), sys.version)
 40 |     sys.exit(1)
 41 | 
 42 | 
 43 | __prog__= "weibo_login"
 44 | __site__= "http://yoyzhou.github.com"
 45 | __weibo__= "@pigdata"
 46 | __version__="0.1 beta"
 47 | 
 48 | 
 49 | def get_prelogin_status(username):
 50 |     """
 51 |     Perform prelogin action, get prelogin status, including servertime, nonce, rsakv, etc.
 52 |     """
 53 |     #prelogin_url = 'http://login.sina.com.cn/sso/prelogin.php?entry=weibo&callback=sinaSSOController.preloginCallBack&client=ssologin.js(v1.4.5)'
 54 |     prelogin_url = 'http://login.sina.com.cn/sso/prelogin.php?entry=weibo&callback=sinaSSOController.preloginCallBack&su=' + get_user(username) + \
 55 |                    '&rsakt=mod&checkpin=1&client=ssologin.js(v1.4.5)'
 56 |     data = urllib2.urlopen(prelogin_url).read()
 57 |     p = re.compile('\((.*)\)')
 58 | 
 59 |     try:
 60 |         json_data = p.search(data).group(1)
 61 |         data = json.loads(json_data)
 62 |         servertime = str(data['servertime'])
 63 |         nonce = data['nonce']
 64 |         rsakv = data['rsakv']
 65 |         return servertime, nonce, rsakv
 66 |     except:
 67 |         print 'Getting prelogin status failed!'
 68 |         return None
 69 | 
 70 | 
 71 | def login(username, pwd, cookie_file):
 72 |     """
 73 |     Login with user name, password and cookies.
 74 |     (1) If cookie file exists then try to load cookies;
 75 |     (2) If no cookies found then do login
 76 |     """
 77 |     #If cookie file exists then try to load cookies
 78 |     if os.path.exists(cookie_file):
 79 |         try:
 80 |             cookie_jar = cookielib.LWPCookieJar(cookie_file)
 81 |             cookie_jar.load(ignore_discard=True, ignore_expires=True)
 82 |             loaded = 1
 83 |         except cookielib.LoadError:
 84 |             loaded = 0
 85 |             print 'Loading cookies error'
 86 | 
 87 |         #install loaded cookies for urllib2
 88 |         if loaded:
 89 |             cookie_support = urllib2.HTTPCookieProcessor(cookie_jar)
 90 |             opener = urllib2.build_opener(cookie_support, urllib2.HTTPHandler)
 91 |             urllib2.install_opener(opener)
 92 |             print 'Loading cookies success'
 93 |             return 1
 94 |         else:
 95 |             return do_login(username, pwd, cookie_file)
 96 | 
 97 |     else: #If no cookies found
 98 |         return do_login(username, pwd, cookie_file)
 99 | 
100 | 
101 | def do_login(username, pwd, cookie_file):
102 |     """
103 |     Perform login action with user name, password and saving cookies.
104 |     @param username: login user name
105 |     @param pwd: login password
106 |     @param cookie_file: file name where to save cookies when login succeeded
107 |     """
108 |     #POST data for WEIBO login; these fields can be captured with the HttpFox extension in Firefox
109 |     login_data = {
110 |         'entry': 'weibo',
111 |         'gateway': '1',
112 |         'from': '',
113 |         'savestate': '7',
114 |         'userticket': '1',
115 |         'pagerefer': '',
116 |         'vsnf': '1',
117 |         'su': '',
118 |         'service': 'miniblog',
119 |         'servertime': '',
120 |         'nonce': '',
121 |         'pwencode': 'rsa2',
122 |         'rsakv': '',
123 |         'sp': '',
124 |         'encoding': 'UTF-8',
125 |         'prelt': '45',
126 |         'url': 'http://weibo.com/ajaxlogin.php?framelogin=1&callback=parent.sinaSSOController.feedBackUrlCallBack',
127 |         'returntype': 'META'
128 |         }
129 | 
130 |     cookie_jar2 = cookielib.LWPCookieJar()
131 |     cookie_support2 = urllib2.HTTPCookieProcessor(cookie_jar2)
132 |     opener2 = urllib2.build_opener(cookie_support2, urllib2.HTTPHandler)
133 |     urllib2.install_opener(opener2)
134 |     login_url = 'http://login.sina.com.cn/sso/login.php?client=ssologin.js(v1.4.5)'
135 |     try:
136 |         servertime, nonce, rsakv = get_prelogin_status(username)
137 |     except:
138 |         return
139 | 
140 |     #Fill POST data
141 |     login_data['servertime'] = servertime
142 |     login_data['nonce'] = nonce
143 |     login_data['su'] = get_user(username)
144 |     login_data['sp'] = get_pwd_rsa(pwd, servertime, nonce)
145 |     login_data['rsakv'] = rsakv
146 |     login_data = urllib.urlencode(login_data)
147 |     http_headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux i686; rv:8.0) Gecko/20100101 Firefox/8.0'}
148 |     req_login = urllib2.Request(
149 |         url = login_url,
150 |         data = login_data,
151 |         headers = http_headers
152 |         )
153 |     result = urllib2.urlopen(req_login)
154 |     text = result.read()
155 |     p = re.compile('location\.replace\(\"(.*?)\"\)')   #double quote regex
156 |     ps = re.compile('location\.replace\(\'(.*?)\'\)')  #single quote regex
157 | 
158 |     try:
159 |         #Search login redirection URL
160 |         try:
161 |             #first try with the double quote regex
162 |             login_url = p.search(text).group(1)
163 |         except:
164 |             #then try with the single quote regex
165 |             login_url = ps.search(text).group(1)
166 | 
167 |         data = urllib2.urlopen(login_url).read()
168 | 
169 |         #Verify login feedback, check whether result is TRUE
170 |         patt_feedback = 'feedBackUrlCallBack\((.*)\)'
171 |         p = re.compile(patt_feedback, re.MULTILINE)
172 | 
173 |         feedback = p.search(data).group(1)
174 | 
175 |         feedback_json = json.loads(feedback)
176 |         if feedback_json['result']:
177 |             cookie_jar2.save(cookie_file, ignore_discard=True, ignore_expires=True)
178 |             return 1
179 |         else:
180 |             return 0
181 |     except:
182 |         return 0
183 | 
184 | 
185 | def get_pwd_wsse(pwd, servertime, nonce):
186 |     """
187 |     Get wsse encrypted password
188 |     """
189 |     pwd1 = hashlib.sha1(pwd).hexdigest()
190 |     pwd2 = hashlib.sha1(pwd1).hexdigest()
191 |     pwd3_ = pwd2 + servertime + nonce
192 |     pwd3 = hashlib.sha1(pwd3_).hexdigest()
193 |     return pwd3
194 | 
195 | def get_pwd_rsa(pwd, servertime, nonce):
196 |     """
197 |     Get rsa2 encrypted password, using the RSA module from https://pypi.python.org/pypi/rsa/3.1.1; documents can be accessed at
198 |     http://stuvel.eu/files/python-rsa-doc/index.html
199 |     """
200 |     #n, the n parameter of the RSA public key, which is published by WEIBO.COM
201 |     #hardcoded here, but it can also be found in the values returned by the prelogin status above
202 |     weibo_rsa_n = 'EB2A38568661887FA180BDDB5CABD5F21C7BFD59C090CB2D245A87AC253062882729293E5506350508E7F9AA3BB77F4333231490F915F6D63C55FE2F08A49B353F444AD3993CACC02DB784ABBB8E42A9B1BBFFFB38BE18D78E87A0E41B9B8F73A928EE0CCEE1F6739884B9777E4FE9E88A1BBE495927AC4A799B3181D6442443'
203 | 
204 |     #e, the exponent parameter of the RSA public key; WEIBO uses 0x10001, which is 65537 in decimal
205 |     weibo_rsa_e = 65537
206 | 
207 |     message = str(servertime) + '\t' + str(nonce) + '\n' + str(pwd)
208 | 
209 |     #construct the WEIBO RSA public key using n and e above; note that n is a hex string
210 |     key = rsa.PublicKey(int(weibo_rsa_n, 16), weibo_rsa_e)
211 | 
212 |     #get encrypted password
213 |     encrypted_pwd = rsa.encrypt(message, key)
214 | 
215 |     #turn the encrypted password binary back into a hex string
216 |     return binascii.b2a_hex(encrypted_pwd)
217 | 
218 | 
219 | def get_user(username):
220 |     username_ = urllib.quote(username)
221 |     username = base64.encodestring(username_)[:-1]
222 |     return username
223 | 
224 | 
225 | if __name__ == '__main__':
226 | 
227 |     username = 'ur_user_name_here'
228 |     pwd = 'ur_password_here'
229 |     cookie_file = 'weibo_login_cookies.dat'
230 | 
231 |     if login(username, pwd, cookie_file):
232 |         print 'Login WEIBO succeeded'
233 |     else:
234 |         print 'Login WEIBO failed'
235 | 
--------------------------------------------------------------------------------
/weibo_scrapy.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | #coding=utf8
  3 | 
  4 | try:
  5 |     import sys
  6 |     import time
  7 |     import threading
  8 |     import Queue
  9 |     import ConfigParser
 10 |     from weibo_login import login
 11 | 
 12 | except ImportError:
 13 |     print >> sys.stderr, """\
 14 | 
 15 | There was a problem importing one of the Python modules required to run weibo_scrapy.
 16 | The error leading to this problem was:
 17 | 
 18 | %s
 19 | 
 20 | Please install a package which provides this module, or
 21 | verify that the module is installed correctly.
 22 | 
 23 | It's possible that the above module doesn't match the current version of Python,
 24 | which is:
 25 | 
 26 | %s
 27 | 
 28 | """ % (sys.exc_info(), sys.version)
 29 |     sys.exit(1)
 30 | 
 31 | 
 32 | __prog__= "weibo_scrapy"
 33 | __site__= "http://yoyzhou.github.com"
 34 | __weibo__= "@pigdata"
 35 | __version__="0.1 beta"
 36 | 
 37 | 
 38 | #####global variables#####
 39 | 
 40 | visited_uids = set()
 41 | task_queue = Queue.Queue()
 42 | lock = threading.Lock()
 43 | 
 44 | scraped = 0
 45 | config_file = 'scrapy.ini'
 46 | 
 47 | class scrapy(object):
 48 | 
 49 |     global visited_uids
 50 |     global task_queue
 51 |     global lock
 52 | 
 53 |     global scraped
 54 |     global config_file
 55 | 
 56 |     #settings are read from the config file; keyword arguments override them
 57 |     def __init__(self, config=None, thread_number=None, start_uid=None, uids_file=None):
 58 | 
 59 |         _config = {}
 60 |         if config:
 61 |             _config = self.__load_configuration__(config)
 62 |         else:
 63 |             _config = self.__load_configuration__(config_file)
 64 | 
 65 |         self.login_username = _config['login_username']
 66 |         self.login_uid = _config['login_uid']
 67 |         self.login_password = _config['login_password']
 68 |         self.cookies_file = _config['cookies_file']
 69 | 
 70 |         #get scrapy settings
 71 |         self.thread_number = _config['thread_number']
 72 | 
 73 |         self.start_uid = _config['start_uid']
 74 |         self.uids_file = _config['uids_file']
 75 |         self.wanted = _config['wanted']
 76 | 
 77 |         #accepts arguments as well, and arguments have higher priority
 78 |         if thread_number:
 79 |             self.thread_number = thread_number
 80 |         if start_uid and uids_file:
 81 |             raise Exception('You can only specify `start_uid` or `uids_file` in constructor')
 82 | 
 83 |         if start_uid:
 84 |             self.start_uid = start_uid
 85 |             self.uids_file = None
 86 |         if uids_file:
 87 |             self.uids_file = uids_file
 88 |             self.start_uid = None
 89 | 
 90 |     def scrapy(self):
 91 | 
 92 |         login_status = login(self.login_username, self.login_password, self.cookies_file)
 93 | 
 94 |         if login_status:
 95 | 
 96 |             if self.start_uid:
 97 |                 task_queue.put(self.start_uid)
 98 | 
 99 |             elif self.uids_file:
100 |                 uids_list = self.__load_uids__()
101 |                 for uid in uids_list:
102 |                     task_queue.put(uid)
103 | 
104 |             else: #start uid or uids file is needed
105 |                 raise Exception('ERROR: Start uid or uids file is needed.')
106 | 
107 |             #spawn a pool of threads, and pass them the queue instance
108 |             for _ in range(self.thread_number):
109 |                 st = scrapy_threading(self.scrapy_do_task, self.wanted)
110 |                 st.setDaemon(True)
111 |                 st.start()
112 | 
113 | 
114 |             task_queue.join()
115 | 
116 | 
117 |     def scrapy_do_task(self, uid=None):
118 |         '''
119 |         User needs to override this method to perform a uid-based scrapy task.
120 |         @param uid: weibo uid
121 |         @return: a list of uids gained from this task, optional
122 |         '''
123 |         #override in a subclass; return the (possibly empty) list of uids gained from this task
124 |         return []
125 | 
126 |     def __load_configuration__(self, config_file):
127 |         config = ConfigParser.RawConfigParser(allow_no_value=True)
128 |         config.read(config_file)
129 |         settings = {}
130 |         #get login account user info
131 |         settings['login_username'] = config.get('login_account_info', 'login_username')
132 |         settings['login_uid'] = config.get('login_account_info', 'login_uid')
133 |         settings['login_password'] = config.get('login_account_info', 'login_password')
134 |         settings['cookies_file'] = config.get('login_account_info', 'cookies_file')
135 | 
136 |         #get scrapy settings
137 |         settings['thread_number'] = config.getint('scrapy_settings', 'thread_number')
138 |         settings['start_uid'] = config.get('scrapy_settings', 'start_uid')
139 |         settings['uids_file'] = config.get('scrapy_settings', 'uids_file')
140 |         settings['wanted'] = config.getint('scrapy_settings', 'wanted')
141 | 
142 |         return settings
143 | 
144 |     def __load_uids__(self):
145 |         '''
146 |         Loads uids from file. The file should be formatted as one uid per line.
147 |         '''
148 |         uids_list = []
149 |         with open(self.uids_file, 'r') as uids:
150 |             for uid in uids:
151 |                 if uid.strip():
152 |                     uids_list.append(uid.strip())
153 | 
154 |         return uids_list
155 | 
156 | class scrapy_threading(threading.Thread):
157 |     """Thread class to handle a scrapy task"""
158 | 
159 |     def __init__(self, task, wanted):
160 |         threading.Thread.__init__(self)
161 |         self.do_task = task
162 |         self.wanted = wanted
163 | 
164 |     def run(self):
165 |         global visited_uids
166 |         global task_queue
167 |         global scraped
168 |         global lock
169 | 
170 |         while scraped < self.wanted:
171 | 
172 |             #crawl info based on each uid
173 |             if not task_queue.empty():
174 | 
175 |                 uid = task_queue.get()
176 | 
177 |                 if uid in visited_uids: #already crawled
178 |                     task_queue.task_done()
179 | 
180 |                 else:
181 |                     try:
182 |                         gains = self.do_task(uid)
183 | 
184 |                         #for debugging
185 |                         wow = '{0: <25}'.format('[' + time.asctime() + '] ') + ' uid_' + '{0: <12}'.format(uid)
186 |                         print wow
187 |                         for new_uid in (gains or []):
188 |                             task_queue.put(new_uid)
189 |                         visited_uids.add(uid)
190 |                         #signals that the queue job is done
191 |                         task_queue.task_done()
192 | 
193 |                         #count scraped uids
194 |                         with lock:
195 |                             scraped += 1
196 |                             #for debugging
197 |                             print 'scraped: ' + str(scraped)
198 | 
199 |                     except Exception, e:
200 |                         print e
201 |                         pass
202 | 
203 |             else:
204 |                 time.sleep(30)
205 | 
--------------------------------------------------------------------------------