├── README.md
├── crawl_detail.py
├── step4.py
├── crawl_img.py
├── crawl_property.py
├── step2.py
├── step1.py
├── step3.py
├── urldict.py
└── parser.py
/README.md:
--------------------------------------------------------------------------------
1 | # Tmall1212
2 | A Tmall Double 12 (December 12 sale) crawler, with 2.66 million promotional item records attached.
3 |
4 | A follow-up to: [Tmall Double 11 Crawler (bonus: 2.12 million item records, free download)](http://blog.csdn.net/bone_ace/article/details/53181015).
5 | For details, see: [Tmall Double 12 Crawler (bonus: 2.66 million item records, free download)](http://blog.csdn.net/bone_ace/article/details/53574126).
6 |
7 |
8 |
9 | ## Data download
10 | Tmall Double 12 raw item data: http://pan.baidu.com/s/1bPV2u6 (password: t803)
11 | Tmall Double 12 item promotion data: http://pan.baidu.com/s/1gf5IOlt (password: gs50)
12 | Tmall Double 12 item parameter data: http://pan.baidu.com/s/1qXWo9Zm (password: hfwt)
13 | Tmall Double 12 item image data: http://pan.baidu.com/s/1eS82C9c (password: r9me)
14 |
15 |
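16 | ## Pipeline
17 | All of the scripts read from and write to a local MongoDB database named `1212`. The rough run order is: step1.py (venue pages), step2.py (appIds), step3.py (shop pages), step4.py (widget JSON), crawl_detail.py (item pages), parser.py (parse the saved pages), then crawl_property.py (item parameters) and crawl_img.py (images).
18 |
19 | A minimal sketch for checking the results after a run (it assumes MongoDB is running on the default localhost:27017, which is what the scripts use):
20 |
21 | ```python
22 | import pymongo
23 |
24 | client = pymongo.MongoClient('localhost', 27017)
25 | db = client['1212']
26 | print db['Tmall_items'].count()    # item IDs gathered by step1.py through step4.py
27 | print db['Tmall_details'].count()  # raw item pages saved by crawl_detail.py
28 | print db['Tmall_result'].count()   # per-SKU records produced by parser.py
29 | print db['Tmall_result'].find_one()
30 | ```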
--------------------------------------------------------------------------------
/crawl_detail.py:
--------------------------------------------------------------------------------
1 | # encoding=utf-8
2 | # ----------------------------------------------------------------------
3 | # Purpose: fetch item pages by item ID
4 | # Date: 2016-12-12
5 | # Author: 九茶
6 | # ----------------------------------------------------------------------
7 |
8 | import requests
9 | import pymongo
10 | from multiprocessing import Pool, cpu_count
11 |
12 | client = pymongo.MongoClient('localhost', 27017)
13 | db = client['1212']
14 | collection_items = db['Tmall_items']
15 | collection_items_failure = db['Tmall_items_failure']
16 | collection_details = db['Tmall_details']
17 |
18 |
19 | def run(routine):
20 | url = 'https://detail.m.tmall.com/item.htm?id=%s' % routine['_id']
21 | failure = 0
22 | while failure < 10:
23 | try:
24 | r = requests.get(url, timeout=10)
25 | except Exception, e:
26 | print e
27 | failure += 1
28 | continue
29 | routine['Content'] = r.content.decode('gbk', 'ignore')
30 | if routine['Content'].startswith('\r\n<'):  # looks like a real item page; save it
31 | try:
32 | collection_details.insert(routine)
33 | except Exception, e:
34 | print e
35 | print 'Successful: %s' % routine['_id']
36 | break
37 | failure += 1
38 | continue
39 | if failure >= 10:
40 | print 'Failed: %s' % routine['_id']
41 | try:
42 | collection_items_failure.insert(routine)
43 | except Exception, e:
44 | pass
45 |
46 |
47 | if __name__ == '__main__':
48 | pool = Pool(cpu_count())
49 | pool.map(run, collection_items.find())
50 | pool.close()
51 | pool.join()
52 |
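53 | # Usage sketch for a single item (hypothetical item ID): run({'_id': '1234567890', 'Type': '女装会场'})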
--------------------------------------------------------------------------------
/step4.py:
--------------------------------------------------------------------------------
1 | # encoding=utf-8
2 | # ----------------------------------------------------------------------
3 | # Purpose: process the Tmall_items_temp records produced by step3.py (mostly JSON URLs); fetch the JSON and parse out the item IDs
4 | # Date: 2016-12-12
5 | # Author: 九茶
6 | # ----------------------------------------------------------------------
7 |
8 | import pymongo
9 | import requests
10 | import re
11 | from multiprocessing import Pool, cpu_count
12 |
13 | client = pymongo.MongoClient('localhost', 27017)
14 | db = client['1212']
15 | collection_items = db['Tmall_items']
16 | collection_items_temp = db['Tmall_items_temp']
17 |
18 |
19 | def parse(content, routine):
20 | try:
21 | items = re.findall('com/item\.htm[^"]*id=(\d+)', content)
22 | for elem in list(set(items)):
23 | try:
24 | collection_items.insert({'_id': elem, 'ShopURL': routine['ShopURL'], 'Type': routine['Type']})
25 | except Exception, e:
26 | pass
27 | return len(set(items))
28 | except Exception, e:
29 | print e
30 | return 0
31 |
32 |
33 | def run(routine):
34 | url = routine['_id']
35 | failure = 0
36 | while failure < 10:
37 | try:
38 | r = requests.get(url, timeout=10)
39 | except Exception, e:
40 | print e
41 | failure += 1
42 | continue
43 | items = parse(r.content.decode('gbk', 'ignore'), routine)
44 | print 'Successful: %s (Items:%s)' % (routine['_id'], items)
45 | break
46 | if failure >= 10:
47 | print 'Failed: %s' % url
48 |
49 |
50 | if __name__ == '__main__':
51 | pool = Pool(cpu_count())
52 | pool.map(run, collection_items_temp.find())
53 | pool.close()
54 | pool.join()
55 |
56 |
57 |
--------------------------------------------------------------------------------
/crawl_img.py:
--------------------------------------------------------------------------------
1 | # encoding=utf-8
2 | # ----------------------------------------------------------------------
3 | # Purpose: download images from the image records; create a folder named "IMG" next to this script first
4 | # Date: 2016-12-12
5 | # Author: 九茶
6 | # ----------------------------------------------------------------------
7 |
8 | import sys
9 |
10 | reload(sys)
11 | sys.setdefaultencoding('utf8')
12 | import urllib
13 | import pymongo
14 | from hashlib import md5
15 | from multiprocessing import Pool
16 |
17 | client = pymongo.MongoClient('localhost', 27017)
18 | db = client['1212']
19 | collection_img = db['Tmall_detail_imgs']
20 | collection_img_finished = db['Tmall_detail_imgs_finished']
21 |
22 |
23 | def run(_):
24 | try:
25 | routine = collection_img.find_one_and_delete({})
26 | url = routine['_id']
27 | m5 = md5()
28 | m5.update(url)
29 | routine['url_md5'] = m5.hexdigest()
30 | collection_img_finished.insert(routine)
31 | except Exception, e:
32 | print e
33 | return
34 | if url.endswith('jpg'):
35 | img_dir = './IMG/%s.jpg' % m5.hexdigest()
36 | else:
37 | img_dir = './IMG/%s.png' % m5.hexdigest()
38 | failure = 0
39 | while failure < 10:
40 | try:
41 | urllib.urlretrieve(url, img_dir)
42 | break
43 | except Exception, e:
44 | print e
45 | failure += 1
46 | continue
47 | if failure >= 10:
48 | print 'Failed: %s' % url
49 | with open('img_failure.txt', 'a') as f:
50 | f.write('%s\n' % url)
51 |
52 |
53 | if __name__ == '__main__':
54 | while collection_img.count() > 0:
55 | pool = Pool(8)
56 | pool.map(run, range(10000))
57 | pool.close()
58 | pool.join()
59 | print '10000 images processed'
60 |
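61 | # Images are saved as ./IMG/<md5 of the image URL>.jpg or .png; the URL-to-md5 mapping is kept in Tmall_detail_imgs_finished.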
--------------------------------------------------------------------------------
/crawl_property.py:
--------------------------------------------------------------------------------
1 | # encoding=utf-8
2 | # ----------------------------------------------------------------------
3 | # Purpose: crawl item parameter (specification) info
4 | # Date: 2016-12-12
5 | # Author: 九茶
6 | # ----------------------------------------------------------------------
7 |
8 | import requests
9 | import json
10 | import pymongo
11 | from multiprocessing import Pool, cpu_count
12 |
13 | client = pymongo.MongoClient('localhost', 27017)
14 | db = client['1212']
15 | collection_items = db['Tmall_items']
16 | collection_Tmall_others = db['Tmall_property']
17 |
18 |
19 | def run(routine):
20 | sid = routine['_id']
21 | url = 'https://mdetail.tmall.com/mobile/itemPackage.do?itemId=%s' % sid
22 | failure = 0
23 | while failure < 10:
24 | try:
25 | r = requests.get(url, timeout=10)
26 | js = json.loads(r.content.decode('gbk', 'ignore'))
27 | except Exception, e:
28 | print e
29 | failure += 1
30 | continue
31 | result = {'_id': sid}
32 | if 'model' in js.keys() and 'list' in js['model'].keys():
33 | for one in js['model']['list']:
34 | if 'v' in one.keys():
35 | for elem in one['v']:
36 | if 'k' in elem.keys() and 'v' in elem.keys():
37 | result[elem['k']] = elem['v']
38 | if len(result.keys()) == 1:
39 | print 'None: %s' % sid
40 | with open('failure.txt', 'a') as f:
41 | f.write('%s None\n' % sid)
42 | else:
43 | try:
44 | print 'Finish: %s' % sid
45 | collection_Tmall_others.insert(result)
46 | except Exception, e:
47 | print e
48 | break
49 | if failure >= 10:
50 | print 'Failed: %s' % sid
51 | with open('failure.txt', 'a') as f:
52 | f.write('%s error\n' % sid)
53 |
54 |
55 | if __name__ == '__main__':
56 | pool = Pool(cpu_count())
57 | pool.map(run, collection_items.find())
58 | pool.close()
59 | pool.join()
60 |
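61 | # Usage sketch for a single item (hypothetical item ID): run({'_id': '1234567890'})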
--------------------------------------------------------------------------------
/step2.py:
--------------------------------------------------------------------------------
1 | # encoding=utf-8
2 | # ----------------------------------------------------------------------
3 | # Purpose: process the appIds collected by step1.py
4 | # Date: 2016-12-12
5 | # Author: 九茶
6 | # ----------------------------------------------------------------------
7 |
8 | import requests
9 | import re
10 | import json
11 | import time
12 | import pymongo
13 | from multiprocessing import Pool, cpu_count
14 |
15 | client = pymongo.MongoClient('localhost', 27017)
16 | db = client['1212']
17 | collection_shops = db['Tmall_shops']
18 | collection_items = db['Tmall_items']
19 | collection_appid = db['Tmall_appIDs']
20 |
21 |
22 | def parse(content, routine):
23 | js = json.loads(content)
24 | s = 0
25 |
26 | # Parse the shop info in the response
27 | try:
28 | aa = js.values()[0]
29 | bb = aa['data']
30 | if len(bb) > 0 and 'extList' in bb[0].keys():
31 | bb = bb[0]['extList']
32 | for elem in bb:
33 | if 'shopUrl' in elem.keys():
34 | keyName = 'shopUrl'
35 | elif 'shopActUrl' in elem.keys():
36 | keyName = 'shopActUrl'
37 | elif 'mbannerUrl' in elem.keys():
38 | keyName = 'mbannerUrl'
39 | elif 'itemUrl' in elem.keys():
40 | if 'com/item.htm' in elem['itemUrl']:
41 | continue
42 | keyName = 'itemUrl'
43 | else:
44 | continue
45 | try:
46 | s += 1
47 | if elem[keyName].startswith('//'):
48 | collection_shops.insert({'_id': 'https:' + elem[keyName], 'Type': routine['Type']})
49 | else:
50 | collection_shops.insert({'_id': elem[keyName], 'Type': routine['Type']})
51 | except Exception, e:
52 | pass
53 | except Exception, e:
54 | print 'js error'
55 |
56 | # Parse the item IDs in the response
57 | items = re.findall('com/item\.htm[^"]*id=(\d+)', content)
58 | for elem in list(set(items)):
59 | try:
60 | collection_items.insert({'_id': elem, 'Type': routine['Type']})
61 | except Exception, e:
62 | pass
63 |
64 | return [len(set(items)), s]  # return the parsed counts (items, shops)
65 |
66 |
67 | def run(routine):
68 | url = 'https://ald.taobao.com/recommend2.htm?appId=%s&terminalType=1&_pvuuid=%s&source=huichang' % (routine['_id'], str(time.time()) + '000')
69 | failure = 0
70 | while failure < 10:
71 | try:
72 | r = requests.get(url, timeout=10)
73 | except Exception, e:
74 | print e
75 | failure += 1
76 | continue
77 | i, s = parse(r.content.decode('gbk', 'ignore'), routine)
78 | print 'Successful: %s(Items:%s; Shops:%s)' % (url, i, s)
79 | break
80 | if failure >= 10:
81 | print 'Failed: %s' % url
82 |
83 |
84 | if __name__ == '__main__':
85 | pool = Pool(cpu_count())
86 | pool.map(run, collection_appid.find())
87 | pool.close()
88 | pool.join()
89 |
90 | # run({'_id': 'lb-zebra-211303-1630287', 'Type': '21jfdiew'})
91 |
--------------------------------------------------------------------------------
/step1.py:
--------------------------------------------------------------------------------
1 | # encoding=utf-8
2 | # ----------------------------------------------------------------------
3 | # Purpose: crawl the key info (items, shops, appIds, etc.) from each venue page
4 | # Date: 2016-12-12
5 | # Author: 九茶
6 | # ----------------------------------------------------------------------
7 |
8 | import pymongo
9 | import requests
10 | import re
11 | from multiprocessing import Pool, cpu_count
12 | from urldict import urldict
13 |
14 | client = pymongo.MongoClient('localhost', 27017)
15 | db = client['1212']
16 | collection_shops = db['Tmall_shops']
17 | collection_items = db['Tmall_items']
18 | collection_appid = db['Tmall_appIDs']
19 | collection_tec = db['Tmall_tecs']
20 |
21 |
22 | def parse(content, sourceURL):
23 | try:
24 | text = content.replace('\\/', '/').replace('&quot;', '"').replace('&amp;', '&')
25 | shopURL = re.findall('shopActUrl":"(.*?)"', text)
26 | itemURL0 = re.findall('itemId":"(\d+)"', text)  # item IDs appear in two forms
27 | itemURL1 = re.findall('item\.htm\?id=(\d+)', text)
28 | appid = re.findall('"appId":"(.*?)","terminalType', text)
29 | tec = re.findall('"tce_sid":(\d+)', text)
30 | others = re.findall('"itemUrl":"(.*?)"', text)
31 | for one in others:
32 | temp = re.findall('[\?&]id=(\d+)', one)
33 | if temp:
34 | itemURL0.append(temp[0])
35 | else:
36 | shopURL.append(one)
37 |
38 | # store everything in MongoDB
39 | for elem in list(set(shopURL)):
40 | try:
41 | if elem.startswith('//'):
42 | elem = 'https:' + elem
43 | collection_shops.insert({'_id': elem, 'Type': urldict[sourceURL]})
44 | except Exception, e:
45 | # print 'shops: %s' % e
46 | pass
47 | for elem in list(set(itemURL0 + itemURL1)):
48 | try:
49 | collection_items.insert({'_id': elem, 'Type': urldict[sourceURL]})
50 | except Exception, e:
51 | # print 'items: %s' % e
52 | pass
53 | for elem in list(set(appid)):
54 | try:
55 | collection_appid.insert({'_id': elem, 'Type': urldict[sourceURL]})
56 | except Exception, e:
57 | # print 'appid: %s' % e
58 | pass
59 |
60 | for elem in list(set(tec)):
61 | try:
62 | collection_tec.insert({'_id': elem, 'Type': urldict[sourceURL]})
63 | except Exception, e:
64 | # print 'tec: %s' % e
65 | pass
66 | return [len(set(shopURL)), len(set(itemURL0 + itemURL1)), len(set(appid)), len(set(tec))]  # return the counts
67 | except Exception, e:
68 | print 'parse error: %s' % e
69 | return [0, 0, 0, 0]
70 |
71 | def run(url):
72 | failure = 0
73 | while failure < 10:
74 | try:
75 | r = requests.get(url, timeout=10)
76 | except Exception, e:
77 | print e
78 | failure += 1
79 | continue
80 | shops, items, appid, tec = parse(r.content, url)
81 | print 'Successful: %s (Shops:%s; Items:%s; AppID:%s; Tec:%s)' % (url, shops, items, appid, tec)
82 | break
83 | if failure >= 10:
84 | print 'Failed: %s' % url
85 |
86 |
87 | if __name__ == '__main__':
88 | pool = Pool(cpu_count())
89 | pool.map(run, urldict.keys())
90 | pool.close()
91 | pool.join()
92 |
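93 | # Single-venue example (URL taken from urldict.py): run('https://pages.tmall.com/wow/act/16814/industry_68bc_3437?acm=lb-zebra-24215-1516536.1003.4.1218744&wh_weex=true&scm=1003.4.lb-zebra-24215-1516536.OTHER_1_1218744')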
--------------------------------------------------------------------------------
/step3.py:
--------------------------------------------------------------------------------
1 | # encoding=utf-8
2 | # ----------------------------------------------------------------------
3 | # Purpose: crawl shop info from the shop pages
4 | # Date: 2016-12-12
5 | # Author: 九茶
6 | # ----------------------------------------------------------------------
7 |
8 | import pymongo
9 | import requests
10 | import re
11 | from lxml import etree
12 | from multiprocessing import Pool, cpu_count
13 |
14 | client = pymongo.MongoClient('localhost', 27017)
15 | db = client['1212']
16 | collection_shops = db['Tmall_shops']
17 | collection_items = db['Tmall_items']
18 | collection_items_temp = db['Tmall_items_temp']
19 |
20 |
21 | def parse(content, sourceURL, routine):
22 | try:
23 | text = content.replace('\\/', '/').replace('&quot;', '"').replace('&amp;', '&')
24 |
25 | # Extract the JSON (widgetAsync) URLs
26 | tree = etree.HTML(text)
27 | site_instance_id = re.findall('site_instance_id=(\d+)', text)
28 | data_widgetid = tree.xpath('//div[@class="J_TModule J_TAsyncModule"]/@data-widgetid')
29 | flag = 0
30 | if site_instance_id:
31 | if (site_instance_id[0] + '-/p/shj.htm') in text:  # the more complicated kind of shop page
32 | for elem in data_widgetid:
33 | if int(elem) % 2 == 0:
34 | continue
35 | host = re.findall('//([^/]*)', sourceURL)
36 | if host:
37 | url = 'https://' + host[0] + '/widgetAsync.htm?ids=' + elem + '%2C' + str(int(elem) + 1) + \
38 | '&path=%2Fp%2Fshj.htm&callback=callbackGetMods' + elem + '&site_instance_id=' + site_instance_id[0]
39 | try:
40 | flag += 1
41 | collection_items_temp.insert({'_id': url, 'ShopURL': sourceURL, 'Type': routine['Type']})
42 | except Exception, e:
43 | pass
44 | else:
45 | print 'No host'
46 | else:
47 | for elem in data_widgetid:
48 | host = re.findall('//([^/\?]*)', sourceURL)
49 | if host:
50 | url = 'https://' + host[0] + '/widgetAsync.htm?ids=' + elem + '&path=%2Fshop%2Fview_shop.htm&callback=callbackGetMods' + \
51 | elem + '&site_instance_id=' + site_instance_id[0]
52 | try:
53 | flag += 1
54 | collection_items_temp.insert({'_id': url, 'ShopURL': sourceURL, 'Type': routine['Type']})
55 | except Exception, e:
56 | pass
57 | else:
58 | print 'No host'
59 |
60 | # Parse item IDs
61 | items = re.findall('com/item\.htm[^"]*id=(\d+)', text)
62 | for elem in list(set(items)):
63 | try:
64 | collection_items.insert({'_id': elem, 'ShopURL': sourceURL, 'Type': routine['Type']})
65 | except Exception, e:
66 | pass
67 | return [flag, len(set(items))]
68 | except Exception, e:
69 | print e
70 | return [0, 0]
71 |
72 |
73 | def run(routine):
74 | url = routine['_id']
75 | if url.startswith('//'):
76 | url = 'https:' + url
77 | failure = 0
78 | while failure < 10:
79 | try:
80 | r = requests.get(url, timeout=10)
81 | except Exception, e:
82 | print e
83 | failure += 1
84 | continue
85 | temp, items = parse(r.content.decode('gbk', 'ignore'), url, routine)
86 | print 'Successful: %s (Temp:%s; Items:%s)' % (routine['_id'], temp, items)
87 | break
88 | if failure >= 10:
89 | print 'Failed: %s' % url
90 |
91 |
92 | if __name__ == '__main__':
93 | pool = Pool(cpu_count())
94 | pool.map(run, collection_shops.find())
95 | pool.close()
96 | pool.join()
97 |
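98 | # Usage sketch for a single shop (hypothetical shop URL): run({'_id': 'https://example.tmall.com', 'Type': '女装会场'})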
--------------------------------------------------------------------------------
/urldict.py:
--------------------------------------------------------------------------------
1 | # encoding=utf-8
2 |
3 | urldict = {
4 | 'https://pages.tmall.com/wow/act/16814/industry_68bc_3437?acm=lb-zebra-24215-1516536.1003.4.1218744&wh_weex=true&scm=1003.4.lb-zebra-24215-1516536.OTHER_1_1218744': '女装会场',
5 | 'https://pages.tmall.com/wow/act/16814/industry_4kq9_3440?acm=lb-zebra-24215-1516536.1003.4.1218744&wh_weex=true&scm=1003.4.lb-zebra-24215-1516536.OTHER_2_1218744': '男装会场',
6 | 'https://pages.tmall.com/wow/act/16814/industry_kzli_3404?acm=lb-zebra-24215-1516536.1003.4.1218744&wh_weex=true&scm=1003.4.lb-zebra-24215-1516536.OTHER_3_1218744': '运动户外',
7 | 'https://pages.tmall.com/wow/act/16814/industry_5jb0_3445?acm=lb-zebra-24215-1516536.1003.4.1218744&wh_weex=true&scm=1003.4.lb-zebra-24215-1516536.OTHER_4_1218744': '女鞋会场',
8 | 'https://pages.tmall.com/wow/act/16814/industry_yhcf_3403?acm=lb-zebra-24215-1516536.1003.4.1218744&wh_weex=true&scm=1003.4.lb-zebra-24215-1516536.OTHER_5_1218744': '手表眼镜',
9 | 'https://pages.tmall.com/wow/act/16814/industry_47c4_3423?acm=lb-zebra-24215-1516536.1003.4.1218744&wh_weex=true&scm=1003.4.lb-zebra-24215-1516536.OTHER_6_1218744': '女装商场同款',
10 | 'https://pages.tmall.com/wow/act/16814/industry_kvza_3419?acm=lb-zebra-24215-1516536.1003.4.1218744&wh_weex=true&scm=1003.4.lb-zebra-24215-1516536.OTHER_7_1218744': '女装风格好店',
11 | 'https://pages.tmall.com/wow/act/16814/industry_0ftf_3441?acm=lb-zebra-24215-1516536.1003.4.1218744&wh_weex=true&scm=1003.4.lb-zebra-24215-1516536.OTHER_8_1218744': '男装风格好店',
12 | 'https://pages.tmall.com/wow/act/16814/industry_ou2d_3340?acm=lb-zebra-24215-1516536.1003.4.1218744&wh_weex=true&scm=1003.4.lb-zebra-24215-1516536.OTHER_9_1218744': '电脑办公会场',
13 | 'https://pages.tmall.com/wow/act/16814/industry_oxeo_3341?acm=lb-zebra-24215-1516536.1003.4.1218744&wh_weex=true&scm=1003.4.lb-zebra-24215-1516536.OTHER_10_1218744': '潮酷数码会场',
14 | 'https://pages.tmall.com/wow/act/16814/industry_erpn_3342?acm=lb-zebra-24215-1516536.1003.4.1218744&wh_weex=true&scm=1003.4.lb-zebra-24215-1516536.OTHER_11_1218744': '图书乐器会场',
15 | 'https://pages.tmall.com/wow/act/16814/industry_7zl3_3382?acm=lb-zebra-24215-1516536.1003.4.1218744&wh_weex=true&scm=1003.4.lb-zebra-24215-1516536.OTHER_12_1218744': '手机会场',
16 | 'https://pages.tmall.com/wow/act/16814/industry_ibxt_3393?acm=lb-zebra-24215-1516536.1003.4.1218744&wh_weex=true&scm=1003.4.lb-zebra-24215-1516536.OTHER_13_1218744': '食品主会场',
17 | 'https://pages.tmall.com/wow/act/16814/industry_4ase_3380?acm=lb-zebra-24215-1516536.1003.4.1218744&wh_weex=true&scm=1003.4.lb-zebra-24215-1516536.OTHER_14_1218744': '零食会场',
18 | 'https://pages.tmall.com/wow/act/16814/industry_6xof_3452?acm=lb-zebra-24215-1516536.1003.4.1218744&wh_weex=true&scm=1003.4.lb-zebra-24215-1516536.OTHER_15_1218744': '母婴主会场',
19 | 'https://pages.tmall.com/wow/act/16814/industry_3o4w_3465?acm=lb-zebra-24215-1516536.1003.4.1218744&wh_weex=true&scm=1003.4.lb-zebra-24215-1516536.OTHER_16_1218744': '童装会场',
20 | 'https://pages.tmall.com/wow/act/16814/industry_l4qc_3350?acm=lb-zebra-24215-1516536.1003.4.1218744&wh_weex=true&scm=1003.4.lb-zebra-24215-1516536.OTHER_17_1218744': '精品家具',
21 | 'https://pages.tmall.com/wow/act/16814/industry_2jz2_3398?acm=lb-zebra-24215-1516536.1003.4.1218744&scm=1003.4.lb-zebra-24215-1516536.OTHER_18_1218744': '休闲家具',
22 | 'https://pages.tmall.com/wow/act/16814/industry_pmmk_3357?acm=lb-zebra-24215-1516536.1003.4.1218744&wh_weex=true&scm=1003.4.lb-zebra-24215-1516536.OTHER_19_1218744': '品质建材',
23 | 'https://pages.tmall.com/wow/act/16814/industry_vm8d_3351?acm=lb-zebra-24215-1516536.1003.4.1218744&scm=1003.4.lb-zebra-24215-1516536.OTHER_20_1218744': '灯具灯饰会场',
24 | 'https://pages.tmall.com/wow/act/16814/industry_1qao_3449?acm=lb-zebra-24215-1516540.1003.4.1218745&wh_weex=true&scm=1003.4.lb-zebra-24215-1516540.ITEM_1_1218745': '内衣会场',
25 | 'https://pages.tmall.com/wow/act/16814/industry_e6uu_3447?acm=lb-zebra-24215-1516540.1003.4.1218745&wh_weex=true&scm=1003.4.lb-zebra-24215-1516540.ITEM_2_1218745': '箱包配饰',
26 | 'https://pages.tmall.com/wow/act/16814/industry_2wi8_3489?acm=lb-zebra-24215-1516540.1003.4.1218745&wh_weex=true&scm=1003.4.lb-zebra-24215-1516540.ITEM_3_1218745': '珠宝饰品',
27 | 'https://pages.tmall.com/wow/act/16814/industry_cza1_3337?acm=lb-zebra-24215-1516540.1003.4.1218745&wh_weex=true&scm=1003.4.lb-zebra-24215-1516540.ITEM_4_1218745': '数码家电会场',
28 | 'https://pages.tmall.com/wow/act/16814/industry_h4h4_3339?acm=lb-zebra-24215-1516540.1003.4.1218745&wh_weex=true&scm=1003.4.lb-zebra-24215-1516540.ITEM_5_1218745': '大家电会场',
29 | 'https://pages.tmall.com/wow/act/16814/industry_xqgh_3435?acm=lb-zebra-24215-1516540.1003.4.1218745&wh_weex=true&scm=1003.4.lb-zebra-24215-1516540.ITEM_6_1218745': '小家电会场',
30 | 'https://pages.tmall.com/wow/act/16814/industry_uvly_3335?acm=lb-zebra-24215-1516540.1003.4.1218745&wh_weex=true&scm=1003.4.lb-zebra-24215-1516540.ITEM_7_1218745': '美妆主会场',
31 | 'https://pages.tmall.com/wow/act/16814/industry_4zs9_3400?acm=lb-zebra-24215-1516540.1003.4.1218745&wh_weex=true&scm=1003.4.lb-zebra-24215-1516540.ITEM_8_1218745': '洗护清洁会场',
32 | 'https://pages.tmall.com/wow/act/16814/industry_vrxy_3379?acm=lb-zebra-24215-1516540.1003.4.1218745&wh_weex=true&scm=1003.4.lb-zebra-24215-1516540.ITEM_9_1218745': '高端洗护会场',
33 | 'https://pages.tmall.com/wow/act/16814/industry_gc5i_3366?acm=lb-zebra-24215-1516540.1003.4.1218745&wh_weex=true&spm=a312d.7832047.0.0.xatBrC&scm=1003.4.lb-zebra-24215-1516540.ITEM_10_1218745': '生鲜会场',
34 | 'https://pages.tmall.com/wow/act/16814/industry_3bt1_3381?acm=lb-zebra-24215-1516540.1003.4.1218745&wh_weex=true&scm=1003.4.lb-zebra-24215-1516540.ITEM_11_1218745': '医药健康',
35 | 'https://pages.tmall.com/wow/act/16814/industry_8enm_3397?acm=lb-zebra-24215-1516540.1003.4.1218745&wh_weex=true&scm=1003.4.lb-zebra-24215-1516540.ITEM_12_1218745': '百货会场',
36 | 'https://pages.tmall.com/wow/act/16814/industry_yq0f_3352?acm=lb-zebra-24215-1516540.1003.4.1218745&wh_weex=true&scm=1003.4.lb-zebra-24215-1516540.ITEM_13_1218745': '家纺家饰会场',
37 | 'https://pages.tmall.com/wow/act/16814/industry_itzs_3359?acm=lb-zebra-24215-1516540.1003.4.1218745&wh_weex=true&scm=1003.4.lb-zebra-24215-1516540.ITEM_14_1218745': '车品配件',
38 | 'https://pages.tmall.com/wow/act/16814/industry_3bsz_3355?acm=lb-zebra-24215-1516540.1003.4.1218745&wh_weex=true&scm=1003.4.lb-zebra-24215-1516540.ITEM_15_1218745': '进口尖货',
39 | }
40 |
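41 | # Venue URL -> venue name; step1.py crawls every key above and tags whatever it finds with the venue name.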
--------------------------------------------------------------------------------
/parser.py:
--------------------------------------------------------------------------------
1 | # encoding=utf-8
2 | # ----------------------------------------------------------------------
3 | # Purpose: parse the pages fetched by crawl_detail.py
4 | # Date: 2016-12-12
5 | # Author: 九茶
6 | # ----------------------------------------------------------------------
7 |
8 | import sys
9 |
10 | reload(sys)
11 | sys.setdefaultencoding('utf8')
12 | import time
13 | import pymongo
14 | import re
15 | import json
16 | from lxml import etree
17 | from multiprocessing.dummy import Pool, cpu_count
18 |
19 | client = pymongo.MongoClient('localhost', 27017)
20 | db = client['1212']
21 | collection_html = db['Tmall_details']
22 | collection_imgs = db['Tmall_detail_imgs']
23 | collection_result = db['Tmall_result']
24 | collection_result1 = db['Tmall_source1']
25 | collection_result2 = db['Tmall_source2']
26 | collection_failure = db['Tmall_detail_failure']
27 |
28 |
29 | def run(routine):
30 | try:
31 | img_routines = []
32 | detail0 = []
33 | tree = etree.HTML(routine['Content'])
34 | data_detail = re.findall('_DATA_Detail = *?\n?(.*?\});? ?\n', routine['Content'])
35 | data_mdskip = re.findall('_DATA_Mdskip = *?\n?(.*?\});? ?\n', routine['Content'])
36 | data_detail_js = json.loads(data_detail[0])
37 | data_mdskip_js = json.loads(data_mdskip[0])
38 |
39 | # Item title
40 | title = tree.xpath('//section[@id="s-title"]/div[@class="main"]/h1/text()')
41 | if title:
42 | title = title[0]
43 | else:
44 | title = tree.xpath('//title/text()')
45 | if title:
46 | title = title[0].replace(' - 天猫Tmall.com', '')
47 | else:
48 | title = ''
49 |
50 | # An item may come in several colours/sizes, and each variant can have its own photos
51 | if 'valItemInfo' in data_detail_js.keys() and 'skuPics' in data_detail_js['valItemInfo'].keys():
52 | for key in data_detail_js['valItemInfo']['skuPics'].keys():
53 | try:
54 | value = data_detail_js['valItemInfo']['skuPics'][key]
55 | if key.startswith(';'):
56 | key = key[1:]
57 | if key.endswith(';'):
58 | key = key[:-1]
59 | key = 'https://detail.tmall.com/item.htm?id=%s&sku_properties=%s' % (
60 | routine['_id'], key.replace(';', '&'))
61 | if value.startswith('//'):
62 | value = 'http:' + value
63 | elif value.startswith('/'):
64 | value = 'http:/' + value
65 | elif not value.startswith('http'):
66 | value = 'http://' + value
67 | img_routines.append({'_id': value, '商品链接': key, '商品标题': title})
68 | except Exception, e:
69 | print e
70 | # Photos shown in the item-description section further down the page
71 | if 'api' in data_detail_js.keys() and 'newWapDescJson' in data_detail_js['api'].keys():
72 | for one in data_detail_js['api']['newWapDescJson']:
73 | if 'moduleName' in one.keys() and one['moduleName'] == '商品图片' and 'data' in one.keys():
74 | for elem in one['data']:
75 | try:
76 | temp = {'_id': elem['img']}
77 | if 'width' in elem.keys():
78 | temp['width'] = elem['width']
79 | if 'height' in elem.keys():
80 | temp['height'] = elem['height']
81 | temp['商品链接'] = 'https://detail.tmall.com/item.htm?id=%s' % routine['_id']
82 | temp['商品标题'] = title
83 | img_routines.append(temp)
84 | except Exception, e:
85 | print e
86 | for img in img_routines:
87 | try:
88 | collection_imgs.insert(img)
89 | except Exception, e:
90 | pass
91 |
92 | # Service guarantees
93 | fuwu = []
94 | if 'defaultModel' in data_mdskip_js.keys() and 'consumerProtection' in data_mdskip_js[
95 | 'defaultModel'].keys() and 'items' in data_mdskip_js[
96 | 'defaultModel']['consumerProtection'].keys():
97 | for one in data_mdskip_js['defaultModel']['consumerProtection']['items']:
98 | if 'title' in one.keys():
99 | fuwu.append(one['title'])
100 | fuwu = ';'.join(fuwu)
101 |
102 | # Promotions
103 | youhui = []
104 | if 'defaultModel' in data_mdskip_js.keys() and 'couponDataDo' in data_mdskip_js[
105 | 'defaultModel'].keys() and 'couponList' in data_mdskip_js[
106 | 'defaultModel']['couponDataDo'].keys():
107 | for one in data_mdskip_js['defaultModel']['couponDataDo']['couponList']:
108 | if 'title' in one.keys() and one['title'] != '领取优惠券':
109 | youhui.append(one['title'])
110 | youhui = ';'.join(youhui)
111 | youhui = youhui.replace('.', '点')
112 | elif 'defaultModel' in data_mdskip_js.keys() and 'itemPriceResultDO' in data_mdskip_js[
113 | 'defaultModel'].keys() and 'tmallShopProm' in \
114 | data_mdskip_js['defaultModel']['itemPriceResultDO'].keys():
115 | for one in data_mdskip_js['defaultModel']['itemPriceResultDO']['tmallShopProm']:
116 | if 'promPlanMsg' in one.keys():
117 | youhui = ';'.join(one['promPlanMsg'])
118 | youhui = youhui.replace('.', '点')
119 |
120 | # Seller location and shipping fee:
121 | maijiadizhi = ''
122 | kuaidifei = ''
123 | if 'defaultModel' in data_mdskip_js.keys() and 'deliveryDO' in data_mdskip_js[
124 | 'defaultModel'].keys() and 'deliverySkuMap' in data_mdskip_js[
125 | 'defaultModel']['deliveryDO'].keys():
126 | temp = data_mdskip_js['defaultModel']['deliveryDO']['deliverySkuMap']
127 | if 'default' in temp.keys():
128 | for one in temp['default']:
129 | if 'postage' in one.keys() and len(one['postage']) > 0:
130 | kuaidifei = one['postage']
131 | if 'skuDeliveryAddress' in one.keys() and len(one['skuDeliveryAddress']) > 0:
132 | maijiadizhi = one['skuDeliveryAddress']
133 |
134 | # The data above is shared by all variants; below, per-variant (SKU) info is extracted
135 | if 'defaultModel' in data_mdskip_js.keys() and 'itemPriceResultDO' in data_mdskip_js[
136 | 'defaultModel'].keys() and 'priceInfo' in data_mdskip_js[
137 | 'defaultModel']['itemPriceResultDO'].keys():
138 | for elem in data_mdskip_js['defaultModel']['itemPriceResultDO']['priceInfo'].keys():
139 | value = data_mdskip_js['defaultModel']['itemPriceResultDO']['priceInfo'][elem]
140 | temp = {'_id': 'https://detail.tmall.com/item.htm?id=%s&skuId=%s' % (routine['_id'], elem)}
141 | if fuwu:
142 | temp['服务保障'] = fuwu
143 | if youhui:
144 | temp['优惠活动'] = youhui
145 | if maijiadizhi:
146 | temp['卖家地址'] = maijiadizhi
147 | if kuaidifei:
148 | temp['快递费'] = kuaidifei
149 | if 'tagPrice' in value.keys() and len(value['tagPrice']) > 0:
150 | temp['原价'] = value['tagPrice']
151 | elif 'price' in value.keys() and len(value['price']) > 0:
152 | temp['原价'] = value['price']
153 | if 'promotionList' in value.keys():
154 | for one in value['promotionList']:
155 | if 'price' in one.keys() and len(one['price']) > 0:
156 | temp['现价'] = one['price']
157 | if 'startTime' in one.keys():
158 | temp['活动开始时间'] = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(one['startTime'] / 1000))
159 | elif 'tradeResult' in data_mdskip_js['defaultModel'].keys() and 'startTime' in \
160 | data_mdskip_js['defaultModel'][
161 | 'tradeResult'].keys():
162 | startTime = data_mdskip_js['defaultModel']['tradeResult']['startTime']
163 | temp['活动开始时间'] = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(startTime / 1000))
164 | if 'endTime' in one.keys():
165 | temp['活动结束时间'] = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(one['endTime'] / 1000))
166 | temp['标题'] = title
167 | if 'Type' in routine.keys():
168 | temp['会场'] = routine['Type']
169 | if 'ShopUrl' in routine.keys():
170 | temp['商店链接'] = routine['ShopUrl']
171 | temp['商品ID'] = routine['_id']
172 | detail0.append(temp)
173 | for item in detail0:
174 | try:
175 | collection_result.insert(item)
176 | except Exception, e:
177 | print e
178 |
179 | try:
180 | data_detail_js['_id'] = routine['_id']
181 | collection_result1.insert(data_detail_js)
182 | except Exception, e:
183 | print e
184 | try:
185 | data_mdskip_js['_id'] = routine['_id']
186 | collection_result2.insert(data_mdskip_js)
187 | except Exception, e:
188 | print e
189 | with open('ids.txt', 'a') as f:
190 | f.write('%s\n' % routine['_id'])
191 | print 'Finish %s' % routine['_id']
192 | except Exception, e:
193 | print e
194 | try:
195 | collection_failure.insert(routine)
196 | except Exception, e:
197 | print e
198 |
199 |
200 | if __name__ == '__main__':
201 | pool = Pool(cpu_count())
202 | pool.map(run, collection_html.find())
203 | pool.close()
204 | pool.join()
205 |
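206 | # Output: Tmall_result gets one record per SKU; Tmall_source1/Tmall_source2 keep the raw _DATA_Detail/_DATA_Mdskip JSON; image URLs go into Tmall_detail_imgs for crawl_img.py.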
--------------------------------------------------------------------------------