├── README.md
└── crawler.py

/README.md:
--------------------------------------------------------------------------------
tbcrawler
=============
A crawler for Taobao and Tmall. It scrapes item page information from
search-result pages (by keyword) or directly by item id.

db: MongoDB

Run `python crawler.py search` to crawl search results by keyword, or
`python crawler.py update` to refresh items already in the database.
Example crontab entry for the update run:

* 2 * * * python /data/git/tbcrawler/crawler.py update
--------------------------------------------------------------------------------

/crawler.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Crawler and MongoDB persistence helpers for Taobao/Tmall items.
"""
import pymongo
import pycurl
from BeautifulSoup import BeautifulSoup
import StringIO
import time
from django.utils.encoding import smart_str, smart_unicode
import os
import os.path
import traceback
from datetime import datetime, timedelta
import json
#from smallgfw import GFW
from pymongo import ASCENDING, DESCENDING
import requests
import urlparse
import sys
import re
import types

# Convert a (UTC) datetime to a unix timestamp.
mktime = lambda dt: time.mktime(dt.utctimetuple())

###################### db.init ######################
connection = pymongo.Connection('localhost', 27017)

db = connection.x

#browser = requests.session()
###################### gfw.init ######################
#gfw = GFW()
#gfw.set(open(os.path.join(os.path.dirname(__file__),'keyword.txt')).read().split('\n'))
#
#lgfw = GFW()
#lgfw.set(['thunder://','magnet:','ed2k://'])


def zp(data):
    """
    Print a dict as 'key: value' lines.
    """
    for k in data:
        print '%s:' % k, data[k]


def get_html(url, referer='', verbose=False, protocol='http'):
    """
    Fetch a URL with pycurl and return the response body, or None on error.
    """
    if not url.startswith(protocol):
        url = protocol + '://' + url
    url = str(url)
    print '============================================'
    print 'url:', [url]
    print '============================================'
    time.sleep(1)
    html = ''
    headers = ['Cache-control: max-age=0', ]
    try:
        crl = pycurl.Curl()
        crl.setopt(pycurl.FOLLOWLOCATION, 1)
        crl.setopt(pycurl.MAXREDIRS, 15)
        crl.setopt(pycurl.CONNECTTIMEOUT, 8)
        crl.setopt(pycurl.TIMEOUT, 30)
        crl.setopt(pycurl.VERBOSE, verbose)
        crl.setopt(pycurl.USERAGENT, 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:9.0.1) Gecko/20100101 Firefox/9.0.1')
        #crl.setopt(pycurl.HTTPHEADER, headers)
        if referer:
            crl.setopt(pycurl.REFERER, referer)
        crl.fp = StringIO.StringIO()
        crl.setopt(pycurl.URL, url)
        crl.setopt(crl.WRITEFUNCTION, crl.fp.write)
        crl.perform()
        html = crl.fp.getvalue()
        crl.close()
    except Exception, e:
        print('\n' * 9)
        traceback.print_exc()
        print('\n' * 9)
        return None
    return html

    #r = requests.get(url)
    #return r.text

    #r = browser.get(url)
    #return r.content


def transtime(stime):
    """
    Convert a 'YY-MM-DD HH:MM' string such as '11-12-13 11:30' to a unix
    timestamp; fall back to the current time for anything else.
    """
    if stime and ':' in stime:
        res = stime.split(' ')
        year, mon, day = [int(i) for i in res[0].split('-')]
        hour, minute = [int(i) for i in res[1].split(':')]
        if year < 100:  # two-digit years are assumed to be 20xx
            year += 2000
        unixtime = mktime(datetime(year, mon, day, hour, minute))
        return unixtime
    else:
        return int(time.time())

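# --- Added illustration (not part of the original module) ---------------------
# A minimal sketch of how the two helpers above are expected to behave; the
# URL and the time string below are example values only.
def _demo_helpers():
    # get_html() normalises the scheme, sleeps one second per request and
    # returns None if pycurl raises.
    html = get_html('item.taobao.com/item.htm?id=12345678',
                    referer='http://www.taobao.com')
    print html is not None

    # transtime() parses 'YY-MM-DD HH:MM' strings into a unix timestamp and
    # falls back to the current time for empty or malformed input.
    print transtime('13-01-02 11:30')
    print transtime('')
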
def save_shop(shopurl, site='tb'):
    """
    Save shop info. NOTE: the early `return` below disables this function;
    the rest of the body is kept for reference.
    """
    return
    coll = db.shop
    if site == 'tb':
        sinfo = getTaobaoShop(shopurl)
    elif site == 'tm':
        sinfo = getTmallShop(shopurl)
    print sinfo
    # The collection stores the shop id under 'sid' (see the insert below).
    res = coll.find_one({'sid': sinfo['shopid'], 'site': site, 'url': shopurl})

    if res:
        pass
        #coll.update({'sid':sinfo['shopid'],'site':site},
        #            {'lastupdatetime':datetime.now()}
        #)
    else:
        coll.insert(
            {
                'sid': sinfo['shopid'],
                'name': sinfo['shopname'],
                'sellerid': sinfo['sellerid'],
                'site': site,
                'url': shopurl,
                'createtime': datetime.now(),
                'lastupdatetime': datetime.now(),
            }
        )


def save_item_log(data):
    """
    Append one crawl snapshot of an item to the itemlog collection.
    """
    db.itemlog.insert({
        'itemid': data['itemid'],
        'name': data['itemname'],
        'price': data['price'],
        'site': data['site'],
        'realprice': data['realprice'],
        'quantity': data['quantity'],
        'total_count': data.get('total_count', 0),
        'createtime': datetime.now(),
    })


def save_item(data):
    """
    Insert a new item document, or update quantity/total_count for a known one.
    """
    print '============================'
    print 'save a new item'
    print 'itemid:', data['itemid']
    print 'name:', data['itemname']
    print 'site:', data['site']

    iteminfo = db.item.find_one({
        'itemid': data['itemid'],
        'site': data['site'],
    })
    if iteminfo:
        newcount = data['quantity'] - iteminfo['quantity']
        db.item.update({'itemid': iteminfo['itemid'], 'site': iteminfo['site']},
                       {'$set': {'lastupdatetime': datetime.now(),
                                 'quantity': data['quantity'],
                                 'total_count': data.get('total_count', 0),
                                 },
                        }
                       )
        print '[save data]:result:update this item info success!'
    else:
        print '[save data]:insert a new item'
        db.item.insert({
            'itemid': data['itemid'],
            'itemname': data['itemname'],
            'price': data['price'],
            'realprice': data['realprice'],
            'shopurl': data['shopurl'],
            #'pic':data['pic'],
            'site': data['site'],
            'keyword': data['keyword'],
            'quantity': data['quantity'],
            'total_count': data.get('total_count', data['quantity']),
            'createtime': datetime.now(),
            'lastupdatetime': datetime.now(),
        })
        print 'result:insert success'
    save_shop(data['shopurl'], data['site'])
    save_item_log(data)
    print '============================'


def searchcrawler(url, keyword=''):
    """
    Crawl one Taobao search-result page and hand every item found on it to
    judge_site().
    """
    html = get_html(url)
    #print html
    if html:
        soup = BeautifulSoup(html, fromEncoding='gbk')
        items_row = soup.findAll('div', {'class': 'row item icon-datalink'})
        if items_row:
            print '=======================row search row=========================='
            #print items
            for item in items_row:
                item_info = item.find('div', {'class': 'col title'}).h3.a
                item_url = item_info['href']
                url_info = urlparse.urlparse(item_url)
                item_id = urlparse.parse_qs(url_info.query, True)['id'][0]
                print item_url
                print item_id
                judge_site(item_url, keyword)
        items_col = soup.findAll('div', {'class': 'col item icon-datalink'})
        if items_col:
            print '=======================col search col=========================='
            #print items
            for item in items_col:
                item_info = item.find('div', {'class': 'item-box'}).h3.a
                item_url = item_info['href']
                url_info = urlparse.urlparse(item_url)
                item_id = urlparse.parse_qs(url_info.query, True)['id'][0]
                print item_url
                print item_id
                judge_site(item_url, keyword)

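# --- Added illustration (not part of the original module) ---------------------
# A minimal sketch of fetching a single search page for one keyword with
# searchcrawler() above, using the same search-URL pattern that appears in the
# commented examples at the bottom of this file. The real crawl loop is
# runcrawler() (referenced in the command-line dispatch below); this helper and
# its default keyword are assumptions for illustration only.
def _demo_search_page(keyword='wireless keyboard'):
    url = 'http://s.taobao.com/search?q=%s&commend=all&search_type=item&sourceId=tb.index' % keyword
    # searchcrawler() extracts each item's id from its link and passes the
    # item URL on to judge_site() for per-site (Taobao vs Tmall) handling.
    searchcrawler(url, keyword=keyword)
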
def check_item_update_time(iid, site, interval=86400):
    res = db.item.find_one({'itemid': iid, 'site': site})
    if res:
        delta = datetime.now() - res['lastupdatetime']
        if delta.total_seconds() < interval:
            # ... (the rest of this function is missing from this dump) ...

# ... (the functions defined between this point and the command-line dispatch
#      below -- among them getTaobaoShop, getTmallShop, getTaobaoItemInfo,
#      getTmallItemInfo, parse_price, parse_quantity, itemcrawler, judge_site,
#      runcrawler and update_item_date -- are missing from this dump) ...

if len(sys.argv) > 1:  # condition reconstructed from a truncated line
    if sys.argv[1] == 'search':
        runcrawler()
    elif sys.argv[1] == 'update':
        update_item_date()

# Ad-hoc test calls, kept commented out:
#print '*******************************************'
#url = "http://mdskip.taobao.com/core/initItemDetail.htm?tmallBuySupport=true&itemId=15765842063&service3C=true"
#data = get_html(url,referer="http://detail.tmall.com/item.htm?id=15765842063").decode('gbk').replace('\r\n','').replace('\t','')
#patt = '.+?(\w+:\s*".*")'

#url = "http://s.taobao.com/search?q=无线键盘&commend=all&search_type=item&sourceId=tb.index"
#searchcrawler(url)
#print '*******************************************'
#print res.decode('gbk')
#print '+++++++++++++++++++++++++++++++++++++++++++++++++++++++='
#print parse_quantity(15517664123)
#print res['comments']
#data = getTaobaoItemInfo(15846674458)
#data = getTmallItemInfo(16659653478)  # already delisted
#data = getTmallItemInfo(18740852051)
#print data
#save_item(data)
#zp(getTaobaoItemInfo(17699431781))
#zp(getTmallItemInfo(16659653478))
#zp(getTmallItemInfo(12434044828))
#print parse_price(17824234211,6800)
#print itemcrawler(17824234211)
#judge_site('http://item.taobao.com/item.htm?id=14992324812&ad_id=&am_id=&cm_id=140105335569ed55e27b&pm_id=')
#print getTmallShop('logitech.tmall.com')
#print getTaobaoShop('http://hjjh.taobao.com')
#runcrawler()
#url = "http://ext.mdskip.taobao.com/extension/dealRecords.htm?bid_page=1&page_size=15&is_start=false&item_type=b&ends=1377944879000&starts=1377340079000&item_id=22167436659&user_tag=34672672&old_quantity=905551&seller_num_id=1124016457&isFromDetail=yes&totalSQ=144923&sbn=37ad2e5f076636c83ee5af7500954ee1,showBuyerList"
#data = get_html(url,referer="http://detail.tmall.com/item.htm?id=22167436659",verbose=True)#.decode('gbk').replace('\r\n','').replace('\t','')
#print 'data:',data
#print get_html('http://taipusm.tmall.com')
--------------------------------------------------------------------------------