├── README.md
└── crawler.py

/README.md:
--------------------------------------------------------------------------------
tbcrawler
=============
A crawler for Taobao and Tmall. It scrapes item page information from
search-result pages (by keyword) or directly by item id.

db: MongoDB

Run `python crawler.py search` to crawl search results by keyword, or
`python crawler.py update` to refresh items already in the database.
Example crontab entry for the update run:

* 2 * * * python /data/git/tbcrawler/crawler.py update
--------------------------------------------------------------------------------

/crawler.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Crawler and MongoDB persistence helpers for Taobao/Tmall items.
"""
import pymongo
import pycurl
from BeautifulSoup import BeautifulSoup
import StringIO
import time
from django.utils.encoding import smart_str, smart_unicode
import os
import os.path
import traceback
from datetime import datetime, timedelta
import json
#from smallgfw import GFW
from pymongo import ASCENDING, DESCENDING
import requests
import urlparse
import sys
import re
import types

# Convert a (UTC) datetime to a unix timestamp.
mktime = lambda dt: time.mktime(dt.utctimetuple())

###################### db.init ######################
connection = pymongo.Connection('localhost', 27017)

db = connection.x

#browser = requests.session()
###################### gfw.init ######################
#gfw = GFW()
#gfw.set(open(os.path.join(os.path.dirname(__file__),'keyword.txt')).read().split('\n'))
#
#lgfw = GFW()
#lgfw.set(['thunder://','magnet:','ed2k://'])


def zp(data):
    """
    Print a dict as 'key: value' lines.
    """
    for k in data:
        print '%s:' % k, data[k]


def get_html(url, referer='', verbose=False, protocol='http'):
    """
    Fetch a URL with pycurl and return the response body, or None on error.
    """
    if not url.startswith(protocol):
        url = protocol + '://' + url
    url = str(url)
    print '============================================'
    print 'url:', [url]
    print '============================================'
    time.sleep(1)
    html = ''
    headers = ['Cache-control: max-age=0', ]
    try:
        crl = pycurl.Curl()
        crl.setopt(pycurl.FOLLOWLOCATION, 1)
        crl.setopt(pycurl.MAXREDIRS, 15)
        crl.setopt(pycurl.CONNECTTIMEOUT, 8)
        crl.setopt(pycurl.TIMEOUT, 30)
        crl.setopt(pycurl.VERBOSE, verbose)
        crl.setopt(pycurl.USERAGENT, 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:9.0.1) Gecko/20100101 Firefox/9.0.1')
        #crl.setopt(pycurl.HTTPHEADER, headers)
        if referer:
            crl.setopt(pycurl.REFERER, referer)
        crl.fp = StringIO.StringIO()
        crl.setopt(pycurl.URL, url)
        crl.setopt(crl.WRITEFUNCTION, crl.fp.write)
        crl.perform()
        html = crl.fp.getvalue()
        crl.close()
    except Exception, e:
        print('\n' * 9)
        traceback.print_exc()
        print('\n' * 9)
        return None
    return html

    #r = requests.get(url)
    #return r.text

    #r = browser.get(url)
    #return r.content


def transtime(stime):
    """
    Convert a 'YY-MM-DD HH:MM' string such as '11-12-13 11:30' to a unix
    timestamp; fall back to the current time for anything else.
    """
    if stime and ':' in stime:
        res = stime.split(' ')
        year, mon, day = [int(i) for i in res[0].split('-')]
        hour, minute = [int(i) for i in res[1].split(':')]
        if year < 100:  # two-digit years are assumed to be 20xx
            year += 2000
        unixtime = mktime(datetime(year, mon, day, hour, minute))
        return unixtime
    else:
        return int(time.time())

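# --- Added illustration (not part of the original module) ---------------------
# A minimal sketch of how the two helpers above are expected to behave; the
# URL and the time string below are example values only.
def _demo_helpers():
    # get_html() normalises the scheme, sleeps one second per request and
    # returns None if pycurl raises.
    html = get_html('item.taobao.com/item.htm?id=12345678',
                    referer='http://www.taobao.com')
    print html is not None

    # transtime() parses 'YY-MM-DD HH:MM' strings into a unix timestamp and
    # falls back to the current time for empty or malformed input.
    print transtime('13-01-02 11:30')
    print transtime('')
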
def save_shop(shopurl, site='tb'):
    """
    Save shop info. NOTE: the early `return` below disables this function;
    the rest of the body is kept for reference.
    """
    return
    coll = db.shop
    if site == 'tb':
        sinfo = getTaobaoShop(shopurl)
    elif site == 'tm':
        sinfo = getTmallShop(shopurl)
    print sinfo
    # The collection stores the shop id under 'sid' (see the insert below).
    res = coll.find_one({'sid': sinfo['shopid'], 'site': site, 'url': shopurl})

    if res:
        pass
        #coll.update({'sid':sinfo['shopid'],'site':site},
        #            {'lastupdatetime':datetime.now()}
        #)
    else:
        coll.insert(
            {
                'sid': sinfo['shopid'],
                'name': sinfo['shopname'],
                'sellerid': sinfo['sellerid'],
                'site': site,
                'url': shopurl,
                'createtime': datetime.now(),
                'lastupdatetime': datetime.now(),
            }
        )


def save_item_log(data):
    """
    Append one crawl snapshot of an item to the itemlog collection.
    """
    db.itemlog.insert({
        'itemid': data['itemid'],
        'name': data['itemname'],
        'price': data['price'],
        'site': data['site'],
        'realprice': data['realprice'],
        'quantity': data['quantity'],
        'total_count': data.get('total_count', 0),
        'createtime': datetime.now(),
    })


def save_item(data):
    """
    Insert a new item document, or update quantity/total_count for a known one.
    """
    print '============================'
    print 'save a new item'
    print 'itemid:', data['itemid']
    print 'name:', data['itemname']
    print 'site:', data['site']

    iteminfo = db.item.find_one({
        'itemid': data['itemid'],
        'site': data['site'],
    })
    if iteminfo:
        newcount = data['quantity'] - iteminfo['quantity']
        db.item.update({'itemid': iteminfo['itemid'], 'site': iteminfo['site']},
                       {'$set': {'lastupdatetime': datetime.now(),
                                 'quantity': data['quantity'],
                                 'total_count': data.get('total_count', 0),
                                 },
                        }
                       )
        print '[save data]:result:update this item info success!'
    else:
        print '[save data]:insert a new item'
        db.item.insert({
            'itemid': data['itemid'],
            'itemname': data['itemname'],
            'price': data['price'],
            'realprice': data['realprice'],
            'shopurl': data['shopurl'],
            #'pic':data['pic'],
            'site': data['site'],
            'keyword': data['keyword'],
            'quantity': data['quantity'],
            'total_count': data.get('total_count', data['quantity']),
            'createtime': datetime.now(),
            'lastupdatetime': datetime.now(),
        })
        print 'result:insert success'
    save_shop(data['shopurl'], data['site'])
    save_item_log(data)
    print '============================'


def searchcrawler(url, keyword=''):
    """
    Crawl one Taobao search-result page and hand every item found on it to
    judge_site().
    """
    html = get_html(url)
    #print html
    if html:
        soup = BeautifulSoup(html, fromEncoding='gbk')
        items_row = soup.findAll('div', {'class': 'row item icon-datalink'})
        if items_row:
            print '=======================row search row=========================='
            #print items
            for item in items_row:
                item_info = item.find('div', {'class': 'col title'}).h3.a
                item_url = item_info['href']
                url_info = urlparse.urlparse(item_url)
                item_id = urlparse.parse_qs(url_info.query, True)['id'][0]
                print item_url
                print item_id
                judge_site(item_url, keyword)
        items_col = soup.findAll('div', {'class': 'col item icon-datalink'})
        if items_col:
            print '=======================col search col=========================='
            #print items
            for item in items_col:
                item_info = item.find('div', {'class': 'item-box'}).h3.a
                item_url = item_info['href']
                url_info = urlparse.urlparse(item_url)
                item_id = urlparse.parse_qs(url_info.query, True)['id'][0]
                print item_url
                print item_id
                judge_site(item_url, keyword)

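# --- Added illustration (not part of the original module) ---------------------
# A minimal sketch of fetching a single search page for one keyword with
# searchcrawler() above, using the same search-URL pattern that appears in the
# commented examples at the bottom of this file. The real crawl loop is
# runcrawler() (referenced in the command-line dispatch below); this helper and
# its default keyword are assumptions for illustration only.
def _demo_search_page(keyword='wireless keyboard'):
    url = 'http://s.taobao.com/search?q=%s&commend=all&search_type=item&sourceId=tb.index' % keyword
    # searchcrawler() extracts each item's id from its link and passes the
    # item URL on to judge_site() for per-site (Taobao vs Tmall) handling.
    searchcrawler(url, keyword=keyword)
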
def check_item_update_time(iid, site, interval=86400):
    res = db.item.find_one({'itemid': iid, 'site': site})
    if res:
        delta = datetime.now() - res['lastupdatetime']
        if delta.total_seconds() < interval:
            # ... (the rest of this function is missing from this dump) ...

# ... (the functions defined between this point and the command-line dispatch
#      below -- among them getTaobaoShop, getTmallShop, getTaobaoItemInfo,
#      getTmallItemInfo, parse_price, parse_quantity, itemcrawler, judge_site,
#      runcrawler and update_item_date -- are missing from this dump) ...

if len(sys.argv) > 1:  # condition reconstructed from a truncated line
    if sys.argv[1] == 'search':
        runcrawler()
    elif sys.argv[1] == 'update':
        update_item_date()

# Ad-hoc test calls, kept commented out:
#print '*******************************************'
#url = "http://mdskip.taobao.com/core/initItemDetail.htm?tmallBuySupport=true&itemId=15765842063&service3C=true"
#data = get_html(url,referer="http://detail.tmall.com/item.htm?id=15765842063").decode('gbk').replace('\r\n','').replace('\t','')
#patt = '.+?(\w+:\s*".*")'

#url = "http://s.taobao.com/search?q=无线键盘&commend=all&search_type=item&sourceId=tb.index"
#searchcrawler(url)
#print '*******************************************'
#print res.decode('gbk')
#print '+++++++++++++++++++++++++++++++++++++++++++++++++++++++='
#print parse_quantity(15517664123)
#print res['comments']
#data = getTaobaoItemInfo(15846674458)
#data = getTmallItemInfo(16659653478)  # already delisted
#data = getTmallItemInfo(18740852051)
#print data
#save_item(data)
#zp(getTaobaoItemInfo(17699431781))
#zp(getTmallItemInfo(16659653478))
#zp(getTmallItemInfo(12434044828))
#print parse_price(17824234211,6800)
#print itemcrawler(17824234211)
#judge_site('http://item.taobao.com/item.htm?id=14992324812&ad_id=&am_id=&cm_id=140105335569ed55e27b&pm_id=')
#print getTmallShop('logitech.tmall.com')
#print getTaobaoShop('http://hjjh.taobao.com')
#runcrawler()
#url = "http://ext.mdskip.taobao.com/extension/dealRecords.htm?bid_page=1&page_size=15&is_start=false&item_type=b&ends=1377944879000&starts=1377340079000&item_id=22167436659&user_tag=34672672&old_quantity=905551&seller_num_id=1124016457&isFromDetail=yes&totalSQ=144923&sbn=37ad2e5f076636c83ee5af7500954ee1,showBuyerList"
#data = get_html(url,referer="http://detail.tmall.com/item.htm?id=22167436659",verbose=True)#.decode('gbk').replace('\r\n','').replace('\t','')
#print 'data:',data
#print get_html('http://taipusm.tmall.com')
--------------------------------------------------------------------------------