├── README.md
├── crawl_detail.py
├── step4.py
├── crawl_img.py
├── crawl_property.py
├── step2.py
├── step1.py
├── step3.py
├── urldict.py
└── parser.py

/README.md:
--------------------------------------------------------------------------------
 1 | # Tmall1212
 2 | A crawler for Tmall's Double 12 (December 12) sale, with 2.66 million promotion item records included.
 3 | 
 4 | A sequel to: [Tmall Double 11 crawler (bonus: 2.12 million item records, free download)](http://blog.csdn.net/bone_ace/article/details/53181015).
 5 | For details, see: [Tmall Double 12 crawler (bonus: 2.66 million item records, free download)](http://blog.csdn.net/bone_ace/article/details/53574126).
 6 | 
 7 | 
8 |
 9 | ## Data downloads
10 | Tmall Double 12 raw item page data. Link: http://pan.baidu.com/s/1bPV2u6  Password: t803
11 | Tmall Double 12 item promotion data. Link: http://pan.baidu.com/s/1gf5IOlt  Password: gs50
12 | Tmall Double 12 item parameter (specification) data. Link: http://pan.baidu.com/s/1qXWo9Zm  Password: hfwt
13 | Tmall Double 12 item image data. Link: http://pan.baidu.com/s/1eS82C9c  Password: r9me
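
All of the scripts in this repository write their intermediate and final results into a local MongoDB database named `1212` (hard-coded in the code). A minimal progress check, assuming MongoDB is running on `localhost:27017` as the scripts expect; the collection names below are copied from the scripts, the check itself is not part of the original project:

```python
# Rough progress check for the crawl pipeline (not part of the original scripts).
import pymongo

db = pymongo.MongoClient('localhost', 27017)['1212']
for name in ['Tmall_appIDs',        # written by step1.py, read by step2.py
             'Tmall_shops',         # step1.py / step2.py -> step3.py
             'Tmall_items',         # step1.py - step4.py -> crawl_detail.py / crawl_property.py
             'Tmall_items_temp',    # step3.py -> step4.py
             'Tmall_details',       # crawl_detail.py -> parser.py
             'Tmall_result',        # parser.py
             'Tmall_property',      # crawl_property.py
             'Tmall_detail_imgs']:  # parser.py -> crawl_img.py
    print name, db[name].count()
```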
14 | 
15 | 
--------------------------------------------------------------------------------
/crawl_detail.py:
--------------------------------------------------------------------------------
 1 | # encoding=utf-8
 2 | # ----------------------------------------------------------------------
 3 | # Purpose: fetch each item's page by item ID
 4 | # Date: 2016-12-12
 5 | # Author: 九茶
 6 | # ----------------------------------------------------------------------
 7 | 
 8 | import requests
 9 | import pymongo
10 | from multiprocessing import Pool, cpu_count
11 | 
12 | client = pymongo.MongoClient('localhost', 27017)
13 | db = client['1212']
14 | collection_items = db['Tmall_items']
15 | collection_items_failure = db['Tmall_items_failure']
16 | collection_details = db['Tmall_details']
17 | 
18 | 
19 | def run(routine):
20 |     url = 'https://detail.m.tmall.com/item.htm?id=%s' % routine['_id']
21 |     failure = 0
22 |     while failure < 10:
23 |         try:
24 |             r = requests.get(url, timeout=10)
25 |         except Exception, e:
26 |             print e
27 |             failure += 1
28 |             continue
29 |         routine['Content'] = r.content.decode('gbk', 'ignore')
30 |         if '_DATA_Detail' not in routine['Content']:  # reconstructed check: retry unless the page contains the _DATA_Detail blob that parser.py expects
31 |             failure += 1
32 |             continue
33 |         try:
34 |             collection_details.insert(routine)
35 |         except Exception, e:
36 |             print e
37 |         print 'Successful: %s' % routine['_id']
38 |         break
39 |     if failure >= 10:
40 |         print 'Failed: %s' % routine['_id']
41 |         try:
42 |             collection_items_failure.insert(routine)
43 |         except Exception, e:
44 |             pass
45 | 
46 | 
47 | if __name__ == '__main__':
48 |     pool = Pool(cpu_count())
49 |     pool.map(run, collection_items.find())
50 |     pool.close()
51 |     pool.join()
52 | 
--------------------------------------------------------------------------------
/step4.py:
--------------------------------------------------------------------------------
 1 | # encoding=utf-8
 2 | # ----------------------------------------------------------------------
 3 | # Purpose: process the Tmall_items_temp records written by step3.py (async JSON URLs); fetch each JSON and parse out the item IDs
 4 | # Date: 2016-12-12
 5 | # Author: 九茶
 6 | # ----------------------------------------------------------------------
 7 | 
 8 | import pymongo
 9 | import requests
10 | import re
11 | from multiprocessing import Pool, cpu_count
12 | 
13 | client = pymongo.MongoClient('localhost', 27017)
14 | db = client['1212']
15 | collection_items = db['Tmall_items']
16 | collection_items_temp = db['Tmall_items_temp']
17 | 
18 | 
19 | def parse(content, routine):
20 |     try:
21 |         items = re.findall('com/item\.htm[^"]*id=(\d+)', content)
22 |         for elem in list(set(items)):
23 |             try:
24 |                 collection_items.insert({'_id': elem, 'ShopURL': routine['ShopURL'], 'Type': routine['Type']})
25 |             except Exception, e:
26 |                 pass
27 |         return len(set(items))
28 |     except Exception, e:
29 |         print e
30 |         return 0
31 | 
32 | 
33 | def run(routine):
34 |     url = routine['_id']
35 |     failure = 0
36 |     while failure < 10:
37 |         try:
38 |             r = requests.get(url, timeout=10)
39 |         except Exception, e:
40 |             print e
41 |             failure += 1
42 |             continue
43 |         items = parse(r.content.decode('gbk', 'ignore'), routine)
44 |         print 'Successful: %s (Items:%s)' % (routine['_id'], items)
45 |         break
46 |     if failure >= 10:
47 |         print 'Failed: %s' % url
48 | 
49 | 
50 | if __name__ == '__main__':
51 |     pool = Pool(cpu_count())
52 |     pool.map(run, collection_items_temp.find())
53 |     pool.close()
54 |     pool.join()
55 | 
56 | 
57 | 
--------------------------------------------------------------------------------
/crawl_img.py:
--------------------------------------------------------------------------------
 1 | # encoding=utf-8
 2 | # ----------------------------------------------------------------------
 3 | # Purpose: download the collected item images; create an "IMG" folder next to this script first.
 4 | # Date: 2016-12-12
 5 | # Author: 九茶
 6 | # ----------------------------------------------------------------------
 7 | 
 8 | import sys
 9 | 
10 | reload(sys)
11 | sys.setdefaultencoding('utf8')
12 | import urllib
13 | import pymongo
14 | from hashlib import md5
15 | from multiprocessing import Pool
16 | 
17 | client = pymongo.MongoClient('localhost', 27017)
18 | db = client['1212']  # same database that parser.py writes Tmall_detail_imgs into
19 | collection_img = db['Tmall_detail_imgs']
20 | collection_img_finished = db['Tmall_detail_imgs_finished']
21 | 
22 | 
23 | def run(_):
24 |     try:
25 |         routine = collection_img.find_one_and_delete({})
26 |         url = routine['_id']
27 |         m5 = md5()
28 |         m5.update(url)
29 |         routine['url_md5'] = m5.hexdigest()
30 |         collection_img_finished.insert(routine)
31 |     except Exception, e:
32 |         print e
33 |         return
34 |     if url.endswith('jpg'):
35 |         img_dir = './IMG/%s.jpg' % m5.hexdigest()
36 |     else:
37 |         img_dir = './IMG/%s.png' % m5.hexdigest()
38 |     failure = 0
39 |     while failure < 10:
40 |         try:
41 |             urllib.urlretrieve(url, img_dir)
42 |             break
43 |         except Exception, e:
44 |             print e
45 |             failure += 1
46 |             continue
47 |     if failure >= 10:
48 |         print 'Failed: %s' % url
49 |         with open('img_failure.txt', 'a') as f:
50 |             f.write('%s\n' % url)
51 | 
52 | 
53 | if __name__ == '__main__':
54 |     while collection_img.count() > 0:
55 |         pool = Pool(8)
56 |         pool.map(run, range(10000))
57 |         pool.close()
58 |         pool.join()
59 |         print '10000 done'
60 | 
--------------------------------------------------------------------------------
/crawl_property.py:
--------------------------------------------------------------------------------
 1 | # encoding=utf-8
 2 | # ----------------------------------------------------------------------
 3 | # Purpose: fetch the item parameter (specification) info
 4 | # Date: 2016-12-12
 5 | # Author: 九茶
 6 | # ----------------------------------------------------------------------
 7 | 
 8 | import requests
 9 | import json
10 | import pymongo
11 | from multiprocessing import Pool, cpu_count
12 | 
13 | client = pymongo.MongoClient('localhost', 27017)
14 | db = client['1212']
15 | collection_items = db['Tmall_items']
16 | collection_Tmall_others = db['Tmall_property']
17 | 
18 | 
19 | def run(routine):
20 |     sid = routine['_id']
21 |     url = 'https://mdetail.tmall.com/mobile/itemPackage.do?itemId=%s' % sid
22 |     failure = 0
23 |     while failure < 10:
24 |         try:
25 |             r = requests.get(url, timeout=10)
26 |             js = json.loads(r.content.decode('gbk', 'ignore'))
27 |         except Exception, e:
28 |             print e
29 |             failure += 1
30 |             continue
31 |         result = {'_id': sid}
32 |         if 'model' in js.keys() and 'list' in js['model'].keys():
33 |             for one in js['model']['list']:
34 |                 if 'v' in one.keys():
35 |                     for elem in one['v']:
36 |                         if 'k' in elem.keys() and 'v' in elem.keys():
37 |                             result[elem['k']] = elem['v']
38 |         if len(result.keys()) == 1:
39 |             print 'None: %s' % sid
40 |             with open('failure.txt', 'a') as f:
41 |                 f.write('%s None\n' % sid)
42 |         else:
43 |             try:
44 |                 print 'Finish: %s' % sid
45 |                 collection_Tmall_others.insert(result)
46 |             except Exception, e:
47 |                 print e
48 |         break
49 |     if failure >= 10:
50 |         print 'Failed: %s' % sid
51 |         with open('failure.txt', 'a') as f:
52 |             f.write('%s error\n' % sid)
53 | 
54 | 
55 | if __name__ == '__main__':
56 |     pool = Pool(cpu_count())
57 |     pool.map(run, collection_items.find())
58 |     pool.close()
59 |     pool.join()
60 | 
--------------------------------------------------------------------------------
/step2.py:
--------------------------------------------------------------------------------
 1 | # encoding=utf-8
 2 | # ----------------------------------------------------------------------
 3 | # Purpose: process the appIds collected by step1.py
 4 | # Date: 2016-12-12
 5 | # Author: 九茶
 6 | # ----------------------------------------------------------------------
 7 | 
 8 | import requests
 9 | import re
10 | import json
11 | import time
12 | import pymongo
13 | from multiprocessing import Pool, cpu_count
14 | 
15 | client = pymongo.MongoClient('localhost', 27017)
16 | db = client['1212']
17 | collection_shops = db['Tmall_shops']
18 | collection_items = db['Tmall_items']
19 | collection_appid = db['Tmall_appIDs']
20 | 
21 | 
22 | def parse(content, routine):
23 |     js = json.loads(content)
24 |     s = 0
25 | 
26 |     # parse the shop info in the JSON
27 |     try:
28 |         aa = js.values()[0]
29 |         bb = aa['data']
30 |         if len(bb) > 0 and 'extList' in bb[0].keys():
31 |             bb = bb[0]['extList']
32 |         for elem in bb:
33 |             if 'shopUrl' in elem.keys():
34 |                 keyName = 'shopUrl'
35 |             elif 'shopActUrl' in elem.keys():
36 |                 keyName = 'shopActUrl'
37 |             elif 'mbannerUrl' in elem.keys():
38 |                 keyName = 'mbannerUrl'
39 |             elif 'itemUrl' in elem.keys():
40 |                 if 'com/item.htm' in elem['itemUrl']:  # plain substring check, not a regex
41 |                     continue
42 |                 keyName = 'itemUrl'
43 |             else:
44 |                 continue
45 |             try:
46 |                 s += 1
47 |                 if elem[keyName].startswith('//'):
48 |                     collection_shops.insert({'_id': 'https:' + elem[keyName], 'Type': routine['Type']})
49 |                 else:
50 |                     collection_shops.insert({'_id': elem[keyName], 'Type': routine['Type']})
51 |             except Exception, e:
52 |                 pass
53 |     except Exception, e:
54 |         print 'js error'
55 | 
56 |     # parse the item info in the JSON
57 |     items = re.findall('com/item\.htm[^"]*id=(\d+)', content)
58 |     for elem in list(set(items)):
59 |         try:
60 |             collection_items.insert({'_id': elem, 'Type': routine['Type']})
61 |         except Exception, e:
62 |             pass
63 | 
64 |     return [len(set(items)), s]  # return the parsed counts
65 | 
66 | 
67 | def run(routine):
68 |     url = 'https://ald.taobao.com/recommend2.htm?appId=%s&terminalType=1&_pvuuid=%s&source=huichang' % (routine['_id'], str(time.time()) + '000')
69 |     failure = 0
70 |     while failure < 10:
71 |         try:
72 |             r = requests.get(url, timeout=10)
73 |         except Exception, e:
74 |             print e
75 |             failure += 1
76 |             continue
77 |         i, s = parse(r.content.decode('gbk', 'ignore'), routine)
78 |         print 'Successful: %s(Items:%s; Shops:%s)' % (url, i, s)
79 |         break
80 |     if failure >= 10:
81 |         print 'Failed: %s' % url
82 | 
83 | 
84 | if __name__ == '__main__':
85 |     pool = Pool(cpu_count())
86 |     pool.map(run, collection_appid.find())
87 |     pool.close()
88 |     pool.join()
89 | 
90 |     # run({'_id': 'lb-zebra-211303-1630287', 'Type': '21jfdiew'})
91 | 
--------------------------------------------------------------------------------
/step1.py:
--------------------------------------------------------------------------------
 1 | # encoding=utf-8
 2 | # ----------------------------------------------------------------------
 3 | # Purpose: scrape the key info (items, shops, appIds, etc.) from each venue page
 4 | # Date: 2016-12-12
 5 | # Author: 九茶
 6 | # ----------------------------------------------------------------------
 7 | 
 8 | import pymongo
 9 | import requests
10 | import re
11 | from multiprocessing import Pool, cpu_count
12 | from urldict import urldict
13 | 
14 | client = pymongo.MongoClient('localhost', 27017)
15 | db = client['1212']
16 | collection_shops = db['Tmall_shops']
17 | collection_items = db['Tmall_items']
18 | collection_appid = db['Tmall_appIDs']
19 | collection_tec = db['Tmall_tecs']
20 | 
21 | 
22 | def parse(content, sourceURL):
23 |     try:
24 |         text = content.replace('&#47;', '/').replace('&quot;', '"').replace('&amp;', '&')  # un-escape the HTML-encoded data in the page
25 |         shopURL = re.findall('shopActUrl":"(.*?)"', text)
26 |         itemURL0 = re.findall('itemId":"(\d+)"', text)  # item IDs appear in two forms
27 |         itemURL1 = re.findall('item\.htm\?id=(\d+)', text)
28 |         appid = re.findall('"appId":"(.*?)","terminalType', text)
29 |         tec = re.findall('"tce_sid":(\d+)', text)
30 |         others = re.findall('"itemUrl":"(.*?)"', text)
31 |         for one in others:
32 |             temp = re.findall('[\?&]id=(\d+)', one)
33 |             if temp:
34 |                 itemURL0.append(temp[0])
35 |             else:
36 |                 shopURL.append(one)
37 | 
38 |         # store everything collected above
39 |         for elem in list(set(shopURL)):
40 |             try:
41 |                 if elem.startswith('//'):
42 |                     elem = 'https:' + elem
43 |                 collection_shops.insert({'_id': elem, 'Type': urldict[sourceURL]})
44 |             except Exception, e:
45 |                 # print 'shops: %s' % e
46 |                 pass
47 |         for elem in list(set(itemURL0 + itemURL1)):
48 |             try:
49 |                 collection_items.insert({'_id': elem, 'Type': urldict[sourceURL]})
50 |             except Exception, e:
51 |                 # print 'items: %s' % e
52 |                 pass
53 |         for elem in list(set(appid)):
54 |             try:
55 |                 collection_appid.insert({'_id': elem, 'Type': urldict[sourceURL]})
56 |             except Exception, e:
57 |                 # print 'appid: %s' % e
58 |                 pass
59 | 
60 |         for elem in list(set(tec)):
61 |             try:
62 |                 collection_tec.insert({'_id': elem, 'Type': urldict[sourceURL]})
63 |             except Exception, e:
64 |                 # print 'tec: %s' % e
65 |                 pass
66 |         return [len(set(shopURL)), len(set(itemURL0 + itemURL1)), len(set(appid)), len(set(tec))]  # return the counts
67 |     except Exception, e:
68 |         print 'parse error: %s' % e
69 |         return [0, 0, 0, 0]  # keep run() from crashing when it unpacks the result
70 | 
71 | def run(url):
72 |     failure = 0
73 |     while failure < 10:
74 |         try:
75 |             r = requests.get(url, timeout=10)
76 |         except Exception, e:
77 |             print e
78 |             failure += 1
79 |             continue
80 |         shops, items, appid, tec = parse(r.content, url)
81 |         print 'Successful: %s (Shops:%s; Items:%s; AppID:%s; Tec:%s)' % (url, shops, items, appid, tec)
82 |         break
83 |     if failure >= 10:
84 |         print 'Failed: %s' % url
85 | 
86 | 
87 | if __name__ == '__main__':
88 |     pool = Pool(cpu_count())
89 |     pool.map(run, urldict.keys())
90 |     pool.close()
91 |     pool.join()
92 | 
--------------------------------------------------------------------------------
/step3.py:
--------------------------------------------------------------------------------
 1 | # encoding=utf-8
 2 | # ----------------------------------------------------------------------
 3 | # Purpose: crawl the shop pages
 4 | # Date: 2016-12-12
 5 | # Author: 九茶
 6 | # ----------------------------------------------------------------------
 7 | 
 8 | import pymongo
 9 | import requests
10 | import re
11 | from lxml import etree
12 | from multiprocessing import Pool, cpu_count
13 | 
14 | client = pymongo.MongoClient('localhost', 27017)
15 | db = client['1212']
16 | collection_shops = db['Tmall_shops']
17 | collection_items = db['Tmall_items']
18 | collection_items_temp = db['Tmall_items_temp']
19 | 
20 | 
21 | def parse(content, sourceURL, routine):
22 |     try:
23 |         text = content.replace('&#47;', '/').replace('&quot;', '"').replace('&amp;', '&')  # un-escape the HTML-encoded data in the page
24 | 
25 |         # work out the URLs of the shop's async JSON widgets
26 |         tree = etree.HTML(text)
27 |         site_instance_id = re.findall('site_instance_id=(\d+)', text)
28 |         data_widgetid = tree.xpath('//div[@class="J_TModule J_TAsyncModule"]/@data-widgetid')
29 |         flag = 0
30 |         if site_instance_id:
31 |             if (site_instance_id[0] + '-/p/shj.htm') in text:  # the more convoluted page layout
32 |                 for elem in data_widgetid:
33 |                     if int(elem) % 2 == 0:
34 |                         continue
35 |                     host = re.findall('//([^/]*)', sourceURL)
36 |                     if host:
37 |                         url = 'https://' + host[0] + '/widgetAsync.htm?ids=' + elem + '%2C' + str(int(elem) + 1) + \
38 |                               '&path=%2Fp%2Fshj.htm&callback=callbackGetMods' + elem + '&site_instance_id=' + site_instance_id[0]
39 |                         try:
40 |                             flag += 1
41 |                             collection_items_temp.insert({'_id': url, 'ShopURL': sourceURL, 'Type': routine['Type']})
42 |                         except Exception, e:
43 |                             pass
44 |                     else:
45 |                         print 'No host'
46 |             else:
47 |                 for elem in data_widgetid:
48 |                     host = re.findall('//([^/\?]*)', sourceURL)
49 | 
if host: 50 | url = 'https://' + host[0] + '/widgetAsync.htm?ids=' + elem + '&path=%2Fshop%2Fview_shop.htm&callback=callbackGetMods' + \ 51 | elem + '&site_instance_id=' + site_instance_id[0] 52 | try: 53 | flag += 1 54 | collection_items_temp.insert({'_id': url, 'ShopURL': sourceURL, 'Type': routine['Type']}) 55 | except Exception, e: 56 | pass 57 | else: 58 | print 'No host' 59 | 60 | # 解析商品ID 61 | items = re.findall('com/item\.htm[^"]*id=(\d+)', text) 62 | for elem in list(set(items)): 63 | try: 64 | collection_items.insert({'_id': elem, 'ShopURL': sourceURL, 'Type': routine['Type']}) 65 | except Exception, e: 66 | pass 67 | return [flag, len(set(items))] 68 | except Exception, e: 69 | print e 70 | return [0, 0] 71 | 72 | 73 | def run(routine): 74 | url = routine['_id'] 75 | if url.startswith('//'): 76 | url = 'https:' + url 77 | failure = 0 78 | while failure < 10: 79 | try: 80 | r = requests.get(url, timeout=10) 81 | except Exception, e: 82 | print e 83 | failure += 1 84 | continue 85 | temp, items = parse(r.content.decode('gbk', 'ignore'), url, routine) 86 | print 'Successful: %s (Temp:%s; Items:%s)' % (routine['_id'], temp, items) 87 | break 88 | if failure >= 10: 89 | print 'Failed: %s' % url 90 | 91 | 92 | if __name__ == '__main__': 93 | pool = Pool(cpu_count()) 94 | pool.map(run, collection_shops.find()) 95 | pool.close() 96 | pool.join() 97 | -------------------------------------------------------------------------------- /urldict.py: -------------------------------------------------------------------------------- 1 | # encoding=utf-8 2 | 3 | urldict = { 4 | 'https://pages.tmall.com/wow/act/16814/industry_68bc_3437?acm=lb-zebra-24215-1516536.1003.4.1218744&wh_weex=true&scm=1003.4.lb-zebra-24215-1516536.OTHER_1_1218744': '女装会场', 5 | 'https://pages.tmall.com/wow/act/16814/industry_4kq9_3440?acm=lb-zebra-24215-1516536.1003.4.1218744&wh_weex=true&scm=1003.4.lb-zebra-24215-1516536.OTHER_2_1218744': '男装会场', 6 | 'https://pages.tmall.com/wow/act/16814/industry_kzli_3404?acm=lb-zebra-24215-1516536.1003.4.1218744&wh_weex=true&scm=1003.4.lb-zebra-24215-1516536.OTHER_3_1218744': '运动户外', 7 | 'https://pages.tmall.com/wow/act/16814/industry_5jb0_3445?acm=lb-zebra-24215-1516536.1003.4.1218744&wh_weex=true&scm=1003.4.lb-zebra-24215-1516536.OTHER_4_1218744': '女鞋会场', 8 | 'https://pages.tmall.com/wow/act/16814/industry_yhcf_3403?acm=lb-zebra-24215-1516536.1003.4.1218744&wh_weex=true&scm=1003.4.lb-zebra-24215-1516536.OTHER_5_1218744': '手表眼镜', 9 | 'https://pages.tmall.com/wow/act/16814/industry_47c4_3423?acm=lb-zebra-24215-1516536.1003.4.1218744&wh_weex=true&scm=1003.4.lb-zebra-24215-1516536.OTHER_6_1218744': '女装商场同款', 10 | 'https://pages.tmall.com/wow/act/16814/industry_kvza_3419?acm=lb-zebra-24215-1516536.1003.4.1218744&wh_weex=true&scm=1003.4.lb-zebra-24215-1516536.OTHER_7_1218744': '女装风格好店', 11 | 'https://pages.tmall.com/wow/act/16814/industry_0ftf_3441?acm=lb-zebra-24215-1516536.1003.4.1218744&wh_weex=true&scm=1003.4.lb-zebra-24215-1516536.OTHER_8_1218744': '男装风格好店', 12 | 'https://pages.tmall.com/wow/act/16814/industry_ou2d_3340?acm=lb-zebra-24215-1516536.1003.4.1218744&wh_weex=true&scm=1003.4.lb-zebra-24215-1516536.OTHER_9_1218744': '电脑办公会场', 13 | 'https://pages.tmall.com/wow/act/16814/industry_oxeo_3341?acm=lb-zebra-24215-1516536.1003.4.1218744&wh_weex=true&scm=1003.4.lb-zebra-24215-1516536.OTHER_10_1218744': '潮酷数码会场', 14 | 'https://pages.tmall.com/wow/act/16814/industry_erpn_3342?acm=lb-zebra-24215-1516536.1003.4.1218744&wh_weex=true&scm=1003.4.lb-zebra-24215-1516536.OTHER_11_1218744': 
'图书乐器会场', 15 | 'https://pages.tmall.com/wow/act/16814/industry_7zl3_3382?acm=lb-zebra-24215-1516536.1003.4.1218744&wh_weex=true&scm=1003.4.lb-zebra-24215-1516536.OTHER_12_1218744': '手机会场', 16 | 'https://pages.tmall.com/wow/act/16814/industry_ibxt_3393?acm=lb-zebra-24215-1516536.1003.4.1218744&wh_weex=true&scm=1003.4.lb-zebra-24215-1516536.OTHER_13_1218744': '食品主会场', 17 | 'https://pages.tmall.com/wow/act/16814/industry_4ase_3380?acm=lb-zebra-24215-1516536.1003.4.1218744&wh_weex=true&scm=1003.4.lb-zebra-24215-1516536.OTHER_14_1218744': '零食会场', 18 | 'https://pages.tmall.com/wow/act/16814/industry_6xof_3452?acm=lb-zebra-24215-1516536.1003.4.1218744&wh_weex=true&scm=1003.4.lb-zebra-24215-1516536.OTHER_15_1218744': '母婴主会场', 19 | 'https://pages.tmall.com/wow/act/16814/industry_3o4w_3465?acm=lb-zebra-24215-1516536.1003.4.1218744&wh_weex=true&scm=1003.4.lb-zebra-24215-1516536.OTHER_16_1218744': '童装会场', 20 | 'https://pages.tmall.com/wow/act/16814/industry_l4qc_3350?acm=lb-zebra-24215-1516536.1003.4.1218744&wh_weex=true&scm=1003.4.lb-zebra-24215-1516536.OTHER_17_1218744': '精品家具', 21 | 'https://pages.tmall.com/wow/act/16814/industry_2jz2_3398?acm=lb-zebra-24215-1516536.1003.4.1218744&scm=1003.4.lb-zebra-24215-1516536.OTHER_18_1218744': '休闲家具', 22 | 'https://pages.tmall.com/wow/act/16814/industry_pmmk_3357?acm=lb-zebra-24215-1516536.1003.4.1218744&wh_weex=true&scm=1003.4.lb-zebra-24215-1516536.OTHER_19_1218744': '品质建材', 23 | 'https://pages.tmall.com/wow/act/16814/industry_vm8d_3351?acm=lb-zebra-24215-1516536.1003.4.1218744&scm=1003.4.lb-zebra-24215-1516536.OTHER_20_1218744': '灯具灯饰会场', 24 | 'https://pages.tmall.com/wow/act/16814/industry_1qao_3449?acm=lb-zebra-24215-1516540.1003.4.1218745&wh_weex=true&scm=1003.4.lb-zebra-24215-1516540.ITEM_1_1218745': '内衣会场', 25 | 'https://pages.tmall.com/wow/act/16814/industry_e6uu_3447?acm=lb-zebra-24215-1516540.1003.4.1218745&wh_weex=true&scm=1003.4.lb-zebra-24215-1516540.ITEM_2_1218745': '箱包配饰', 26 | 'https://pages.tmall.com/wow/act/16814/industry_2wi8_3489?acm=lb-zebra-24215-1516540.1003.4.1218745&wh_weex=true&scm=1003.4.lb-zebra-24215-1516540.ITEM_3_1218745': '珠宝饰品', 27 | 'https://pages.tmall.com/wow/act/16814/industry_cza1_3337?acm=lb-zebra-24215-1516540.1003.4.1218745&wh_weex=true&scm=1003.4.lb-zebra-24215-1516540.ITEM_4_1218745': '数码家电会场', 28 | 'https://pages.tmall.com/wow/act/16814/industry_h4h4_3339?acm=lb-zebra-24215-1516540.1003.4.1218745&wh_weex=true&scm=1003.4.lb-zebra-24215-1516540.ITEM_5_1218745': '大家电会场', 29 | 'https://pages.tmall.com/wow/act/16814/industry_xqgh_3435?acm=lb-zebra-24215-1516540.1003.4.1218745&wh_weex=true&scm=1003.4.lb-zebra-24215-1516540.ITEM_6_1218745': '小家电会场', 30 | 'https://pages.tmall.com/wow/act/16814/industry_uvly_3335?acm=lb-zebra-24215-1516540.1003.4.1218745&wh_weex=true&scm=1003.4.lb-zebra-24215-1516540.ITEM_7_1218745': '美妆主会场', 31 | 'https://pages.tmall.com/wow/act/16814/industry_4zs9_3400?acm=lb-zebra-24215-1516540.1003.4.1218745&wh_weex=true&scm=1003.4.lb-zebra-24215-1516540.ITEM_8_1218745': '洗护清洁会场', 32 | 'https://pages.tmall.com/wow/act/16814/industry_vrxy_3379?acm=lb-zebra-24215-1516540.1003.4.1218745&wh_weex=true&scm=1003.4.lb-zebra-24215-1516540.ITEM_9_1218745': '高端洗护会场', 33 | 'https://pages.tmall.com/wow/act/16814/industry_gc5i_3366?acm=lb-zebra-24215-1516540.1003.4.1218745&wh_weex=true&spm=a312d.7832047.0.0.xatBrC&scm=1003.4.lb-zebra-24215-1516540.ITEM_10_1218745': '生鲜会场', 34 | 
'https://pages.tmall.com/wow/act/16814/industry_3bt1_3381?acm=lb-zebra-24215-1516540.1003.4.1218745&wh_weex=true&scm=1003.4.lb-zebra-24215-1516540.ITEM_11_1218745': '医药健康', 35 | 'https://pages.tmall.com/wow/act/16814/industry_8enm_3397?acm=lb-zebra-24215-1516540.1003.4.1218745&wh_weex=true&scm=1003.4.lb-zebra-24215-1516540.ITEM_12_1218745': '百货会场', 36 | 'https://pages.tmall.com/wow/act/16814/industry_yq0f_3352?acm=lb-zebra-24215-1516540.1003.4.1218745&wh_weex=true&scm=1003.4.lb-zebra-24215-1516540.ITEM_13_1218745': '家纺家饰会场', 37 | 'https://pages.tmall.com/wow/act/16814/industry_itzs_3359?acm=lb-zebra-24215-1516540.1003.4.1218745&wh_weex=true&scm=1003.4.lb-zebra-24215-1516540.ITEM_14_1218745': '车品配件', 38 | 'https://pages.tmall.com/wow/act/16814/industry_3bsz_3355?acm=lb-zebra-24215-1516540.1003.4.1218745&wh_weex=true&scm=1003.4.lb-zebra-24215-1516540.ITEM_15_1218745': '进口尖货', 39 | } 40 | -------------------------------------------------------------------------------- /parser.py: -------------------------------------------------------------------------------- 1 | # encoding=utf-8 2 | # ---------------------------------------------------------------------- 3 | # 作用:解析crawl_detail.py抓取到的页面 4 | # 日期:2016-12-12 5 | # 作者:九茶 6 | # ---------------------------------------------------------------------- 7 | 8 | import sys 9 | 10 | reload(sys) 11 | sys.setdefaultencoding('utf8') 12 | import time 13 | import pymongo 14 | import re 15 | import json 16 | from lxml import etree 17 | from multiprocessing.dummy import Pool, cpu_count 18 | 19 | client = pymongo.MongoClient('localhost', 27017) 20 | db = client['1212'] 21 | collection_html = db['Tmall_details'] 22 | collection_imgs = db['Tmall_detail_imgs'] 23 | collection_result = db['Tmall_result'] 24 | collection_result1 = db['Tmall_source1'] 25 | collection_result2 = db['Tmall_source2'] 26 | collection_failure = db['Tmall_detail_failure'] 27 | 28 | 29 | def run(routine): 30 | try: 31 | img_routines = [] 32 | detail0 = [] 33 | tree = etree.HTML(routine['Content']) 34 | data_detail = re.findall('_DATA_Detail = *?\n?(.*?\});? ?\n', routine['Content']) 35 | data_mdskip = re.findall('_DATA_Mdskip = *?\n?(.*?\});? 
?\n', routine['Content']) 36 | data_detail_js = json.loads(data_detail[0]) 37 | data_mdskip_js = json.loads(data_mdskip[0]) 38 | 39 | # 商品标题 40 | title = tree.xpath('//section[@id="s-title"]/div[@class="main"]/h1/text()') 41 | if title: 42 | title = title[0] 43 | else: 44 | title = tree.xpath('//section[@id="s-title"]/div[@class="main"]/h1/text()') 45 | if title: 46 | title = title[0].replace(' - 天猫Tmall.com', '') 47 | else: 48 | title = '' 49 | 50 | # 一个商品下可能有颜色、码数选择,不同的选择会有不同的照片 51 | if 'valItemInfo' in data_detail_js.keys() and 'skuPics' in data_detail_js['valItemInfo'].keys(): 52 | for key in data_detail_js['valItemInfo']['skuPics'].keys(): 53 | try: 54 | value = data_detail_js['valItemInfo']['skuPics'][key] 55 | if key.startswith(';'): 56 | key = key[1:] 57 | if key.endswith(';'): 58 | key = key[:-1] 59 | key = 'https://detail.tmall.com/item.htm?id=%s&sku_properties=%s' % ( 60 | routine['_id'], key.replace(';', '&')) 61 | if value.startswith('//'): 62 | value = 'http:' + value 63 | elif value.startswith('/'): 64 | value = 'http:/' + value 65 | elif not value.startswith('http'): 66 | value = 'http://' + value 67 | img_routines.append({'_id': value, '商品链接': key, '商品标题': title}) 68 | except Exception, e: 69 | print e 70 | # 网页下拉,商品介绍时显示的照片 71 | if 'api' in data_detail_js.keys() and 'newWapDescJson' in data_detail_js['api'].keys(): 72 | for one in data_detail_js['api']['newWapDescJson']: 73 | if 'moduleName' in one.keys() and one['moduleName'] == '商品图片' and 'data' in one.keys(): 74 | for elem in one['data']: 75 | try: 76 | temp = {'_id': elem['img']} 77 | if 'width' in elem.keys(): 78 | temp['width'] = elem['width'] 79 | if 'height' in elem.keys(): 80 | temp['height'] = elem['height'] 81 | temp['商品链接'] = 'https://detail.tmall.com/item.htm?id=%s' % routine['_id'] 82 | temp['商品标题'] = title 83 | img_routines.append(temp) 84 | except Exception, e: 85 | print e 86 | for img in img_routines: 87 | try: 88 | collection_imgs.insert(img) 89 | except Exception, e: 90 | pass 91 | 92 | # 服务保障 93 | fuwu = [] 94 | if 'defaultModel' in data_mdskip_js.keys() and 'consumerProtection' in data_mdskip_js[ 95 | 'defaultModel'].keys() and 'items' in data_mdskip_js[ 96 | 'defaultModel']['consumerProtection'].keys(): 97 | for one in data_mdskip_js['defaultModel']['consumerProtection']['items']: 98 | if 'title' in one.keys(): 99 | fuwu.append(one['title']) 100 | fuwu = ';'.join(fuwu) 101 | 102 | # 优惠活动 103 | youhui = [] 104 | if 'defaultModel' in data_mdskip_js.keys() and 'couponDataDo' in data_mdskip_js[ 105 | 'defaultModel'].keys() and 'couponList' in data_mdskip_js[ 106 | 'defaultModel']['couponDataDo'].keys(): 107 | for one in data_mdskip_js['defaultModel']['couponDataDo']['couponList']: 108 | if 'title' in one.keys() and one['title'] != '领取优惠券': 109 | youhui.append(one['title']) 110 | youhui = ';'.join(youhui) 111 | youhui = youhui.replace('.', '点') 112 | elif 'defaultModel' in data_mdskip_js.keys() and 'itemPriceResultDO' in data_mdskip_js[ 113 | 'defaultModel'].keys() and 'tmallShopProm' in \ 114 | data_mdskip_js['defaultModel']['itemPriceResultDO'].keys(): 115 | for one in data_mdskip_js['defaultModel']['itemPriceResultDO']['tmallShopProm']: 116 | if 'promPlanMsg' in one.keys(): 117 | youhui = ';'.join(one['promPlanMsg']) 118 | youhui = youhui.replace('.', '点') 119 | 120 | # 卖家地址及快递费: 121 | maijiadizhi = '' 122 | kuaidifei = '' 123 | if 'defaultModel' in data_mdskip_js.keys() and 'deliveryDO' in data_mdskip_js[ 124 | 'defaultModel'].keys() and 'deliverySkuMap' in data_mdskip_js[ 125 | 
'defaultModel']['deliveryDO'].keys(): 126 | temp = data_mdskip_js['defaultModel']['deliveryDO']['deliverySkuMap'] 127 | if 'default' in temp.keys(): 128 | for one in temp['default']: 129 | if 'postage' in one.keys() and len(one['postage']) > 0: 130 | kuaidifei = one['postage'] 131 | if 'skuDeliveryAddress' in one.keys() and len(one['skuDeliveryAddress']) > 0: 132 | maijiadizhi = one['skuDeliveryAddress'] 133 | 134 | # 以上为不同颜色/型号商品共享的数据,以下求每个颜色/型号的商品信息 135 | if 'defaultModel' in data_mdskip_js.keys() and 'itemPriceResultDO' in data_mdskip_js[ 136 | 'defaultModel'].keys() and 'priceInfo' in data_mdskip_js[ 137 | 'defaultModel']['itemPriceResultDO'].keys(): 138 | for elem in data_mdskip_js['defaultModel']['itemPriceResultDO']['priceInfo'].keys(): 139 | value = data_mdskip_js['defaultModel']['itemPriceResultDO']['priceInfo'][elem] 140 | temp = {'_id': 'https://detail.tmall.com/item.htm?id=%s&skuId=%s' % (routine['_id'], elem)} 141 | if fuwu: 142 | temp['服务保障'] = fuwu 143 | if youhui: 144 | temp['优惠活动'] = youhui 145 | if maijiadizhi: 146 | temp['卖家地址'] = maijiadizhi 147 | if kuaidifei: 148 | temp['快递费'] = kuaidifei 149 | if 'tagPrice' in value.keys() and len(value['tagPrice']) > 0: 150 | temp['原价'] = value['tagPrice'] 151 | elif 'price' in value.keys() and len(value['price']) > 0: 152 | temp['原价'] = value['price'] 153 | if 'promotionList' in value.keys(): 154 | for one in value['promotionList']: 155 | if 'price' in one.keys() and len(one['price']) > 0: 156 | temp['现价'] = one['price'] 157 | if 'startTime' in one.keys(): 158 | temp['活动开始时间'] = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(one['startTime'] / 1000)) 159 | elif 'tradeResult' in data_mdskip_js['defaultModel'].keys() and 'startTime' in \ 160 | data_mdskip_js['defaultModel'][ 161 | 'tradeResult'].keys(): 162 | startTime = data_mdskip_js['defaultModel']['tradeResult']['startTime'] 163 | temp['活动开始时间'] = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(startTime / 1000)) 164 | if 'endTime' in one.keys(): 165 | temp['活动结束时间'] = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(one['endTime'] / 1000)) 166 | temp['标题'] = title 167 | if 'Type' in routine.keys(): 168 | temp['会场'] = routine['Type'] 169 | if 'ShopUrl' in routine.keys(): 170 | temp['商店链接'] = routine['ShopUrl'] 171 | temp['商品ID'] = routine['_id'] 172 | detail0.append(temp) 173 | for item in detail0: 174 | try: 175 | collection_result.insert(item) 176 | except Exception, e: 177 | print e 178 | 179 | try: 180 | data_detail_js['_id'] = routine['_id'] 181 | collection_result1.insert(data_detail_js) 182 | except Exception, e: 183 | print e 184 | try: 185 | data_mdskip_js['_id'] = routine['_id'] 186 | collection_result2.insert(data_mdskip_js) 187 | except Exception, e: 188 | print e 189 | with open('ids.txt', 'a') as f: 190 | f.write('%s\n' % routine['_id']) 191 | print 'Finish %s' % routine['_id'] 192 | except Exception, e: 193 | print e 194 | try: 195 | collection_failure.insert(routine) 196 | except Exception, e: 197 | print e 198 | 199 | 200 | if __name__ == '__main__': 201 | pool = Pool(cpu_count()) 202 | pool.map(run, collection_html.find()) 203 | pool.close() 204 | pool.join() 205 | --------------------------------------------------------------------------------
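
For reference, a small query sketch against the `Tmall_result` collection that parser.py fills. It is not part of the repository; the Chinese field names are the ones used in the code above, and the query itself is only a hypothetical usage example:

```python
# encoding=utf-8
# Sample query against parser.py's output; a usage sketch, not part of the repo.
import pymongo

results = pymongo.MongoClient('localhost', 27017)['1212']['Tmall_result']
# a few records that carry both the original price (原价) and the promotion price (现价)
for doc in results.find({u'原价': {'$exists': True}, u'现价': {'$exists': True}}).limit(10):
    print doc[u'商品ID'], doc[u'原价'], '->', doc[u'现价']
```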