├── order_spider ├── src │ ├── conf │ │ ├── __init__.py │ │ └── project_paths.py │ ├── test │ │ ├── __init__.py │ │ ├── test_download_cookie.py │ │ ├── test_splite.py │ │ └── cookie.txt │ ├── common │ │ ├── __init__.py │ │ ├── chrome_cookie.py │ │ └── excel_utils.py │ ├── cookies │ │ ├── __init__.py │ │ └── jd_cookies.py │ ├── outputers │ │ ├── __init__.py │ │ └── generate_my_booklist.py │ ├── parsers │ │ ├── __init__.py │ │ ├── jd_book_html_parser.py │ │ └── jd_order_html_parser.py │ ├── downloaders │ │ ├── __init__.py │ │ └── jd_page_downloader.py │ └── spider_main │ │ ├── __init__.py │ │ └── jd_spider.py ├── datas │ ├── cookie_file │ │ └── jd_cookie_file.txt │ ├── my_books │ │ ├── input_my_booklist.xlsx │ │ └── output_my_booklist.html │ └── url_file │ │ └── jd_url_list.txt ├── .settings │ └── org.eclipse.core.resources.prefs ├── .project └── .pydevproject └── README.md /order_spider/src/conf/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /order_spider/src/test/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /order_spider/src/common/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /order_spider/src/cookies/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /order_spider/src/outputers/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /order_spider/src/parsers/__init__.py: 
-------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /order_spider/src/downloaders/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /order_spider/src/spider_main/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /order_spider/datas/cookie_file/jd_cookie_file.txt: -------------------------------------------------------------------------------- 1 | xxx -------------------------------------------------------------------------------- /order_spider/src/cookies/jd_cookies.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/peiss/order_spider/HEAD/order_spider/src/cookies/jd_cookies.py -------------------------------------------------------------------------------- /order_spider/src/common/chrome_cookie.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/peiss/order_spider/HEAD/order_spider/src/common/chrome_cookie.py -------------------------------------------------------------------------------- /order_spider/src/parsers/jd_book_html_parser.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/peiss/order_spider/HEAD/order_spider/src/parsers/jd_book_html_parser.py -------------------------------------------------------------------------------- /order_spider/src/test/test_download_cookie.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/peiss/order_spider/HEAD/order_spider/src/test/test_download_cookie.py 
-------------------------------------------------------------------------------- /order_spider/src/parsers/jd_order_html_parser.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/peiss/order_spider/HEAD/order_spider/src/parsers/jd_order_html_parser.py -------------------------------------------------------------------------------- /order_spider/datas/my_books/input_my_booklist.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/peiss/order_spider/HEAD/order_spider/datas/my_books/input_my_booklist.xlsx -------------------------------------------------------------------------------- /order_spider/datas/my_books/output_my_booklist.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/peiss/order_spider/HEAD/order_spider/datas/my_books/output_my_booklist.html -------------------------------------------------------------------------------- /order_spider/src/outputers/generate_my_booklist.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/peiss/order_spider/HEAD/order_spider/src/outputers/generate_my_booklist.py -------------------------------------------------------------------------------- /order_spider/.settings/org.eclipse.core.resources.prefs: -------------------------------------------------------------------------------- 1 | eclipse.preferences.version=1 2 | encoding//datas/my_books/output_my_booklist.html=gbk 3 | encoding//src/common/chrome_cookie.py=GB18030 4 | encoding//src/common/excel_utils.py=gbk 5 | encoding//src/conf/project_paths.py=GB18030 6 | encoding//src/cookies/jd_cookies.py=GB18030 7 | encoding//src/outputers/generate_my_booklist.py=gbk 8 | -------------------------------------------------------------------------------- /order_spider/src/common/excel_utils.py: 
# coding: gb18030
"""Excel helpers for the order spider."""
import xlrd


def read_excel_to_list(excel_file_path):
    """Read the first sheet of *excel_file_path* and return its rows.

    Each entry is the list produced by xlrd's ``row_values`` for one
    row, in sheet order.
    """
    workbook = xlrd.open_workbook(excel_file_path)
    first_sheet = workbook.sheets()[0]
    return [first_sheet.row_values(row_idx) for row_idx in range(first_sheet.nrows)]
# coding: GB18030
"""Central path configuration for the order_spider project.

Every other module derives its file locations from PROJECT_DIR, so a
fresh checkout only needs to edit that one constant.
"""

# project dir (edit this for your own checkout — see README)
PROJECT_DIR = "D:/workbench/python/order_spider/order_spider"

# project codes dir
PROJECT_CODES_DIR = "%s/src" % PROJECT_DIR

# project data dir
PROJECT_DATA_DIR = "%s/datas" % PROJECT_DIR

# cookie file dir
COOKIE_FILE_DIR = "%s/cookie_file" % PROJECT_DATA_DIR
# jd.com cookie file (pasted by the user, per chrome_cookie.py instructions)
JD_COOKIE_FILE = "%s/jd_cookie_file.txt" % COOKIE_FILE_DIR


# url file dir
URL_FILE_DIR = "%s/url_file" % PROJECT_DATA_DIR
JD_URL_LIST_FILE = "%s/jd_url_list.txt" % URL_FILE_DIR

# jd data dir — NOTE(review): the spider open()s output files here with
# mode 'w', which does not create the directory; it must exist beforehand.
JD_DATA_DIR = "%s/output/jd_data" % PROJECT_DATA_DIR

# my book list excel (input) and generated HTML (output)
MY_BOOK_LIST_EXCEL = "%s/my_books/input_my_booklist.xlsx" % PROJECT_DATA_DIR
MY_BOOK_LIST_OUTHTML = "%s/my_books/output_my_booklist.html" % PROJECT_DATA_DIR
7 | 2、进入京东的个人订单页面,把所有的订单页面的URL,添加到/order_spider/datas/url_file/jd_url_list.txt 8 |
9 | 3、修改/order_spider/src/conf/project_paths.py中的PROJECT_DIR为自己的代码路径 10 |
11 | 4、运行/order_spider/src/spider_main/jd_spider.py 12 |
13 | 5、生成结果在/order_spider/datas/output/jd_data,包括所有的商品、书籍商品、非书籍商品三个列表文件 14 | 15 | 16 | # 自动生成博客书单 17 | 步骤: 18 |
19 | 1、修改excel中的书单,地址在/order_spider/datas/my_books/input_my_booklist.xlsx 20 |
21 | 2、运行/order_spider/src/outputers/generate_my_booklist.py即可生成书单的HTML 22 |
23 | 3、输出HTML地址为:/order_spider/datas/my_books/output_my_booklist.html 24 |
25 | 4、复制HTML中的内容,到博客文章页面即可 26 |
27 | 28 |
29 | 生成的效果地址:
30 | http://www.crazyant.net/my_book_list 31 | 32 | -------------------------------------------------------------------------------- /order_spider/src/downloaders/jd_page_downloader.py: -------------------------------------------------------------------------------- 1 | # coding: GB18030 2 | ''' 3 | @author: crazyant 4 | ''' 5 | import urllib2 6 | 7 | from cookies import jd_cookies 8 | 9 | 10 | def get_jd_opener(): 11 | handler = urllib2.HTTPCookieProcessor(jd_cookies.get_cookiejar()) 12 | opener = urllib2.build_opener(handler) 13 | return opener 14 | 15 | def get_jd_request(url): 16 | request = urllib2.Request(url) 17 | request.add_header("User-Agent", "Mozilla/5.0 (Windows NT 6.1; WOW64)") 18 | request.add_header("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8") 19 | return request 20 | 21 | def down_jd_page(jd_url): 22 | print "download jd url:%s" % jd_url 23 | urllib2.install_opener(get_jd_opener()) 24 | response = urllib2.urlopen(get_jd_request(jd_url)) 25 | if response.getcode() != 200: 26 | print "url can not access" 27 | return None 28 | html_doc = response.read() 29 | return html_doc 30 | 31 | -------------------------------------------------------------------------------- /order_spider/datas/url_file/jd_url_list.txt: -------------------------------------------------------------------------------- 1 | http://order.jd.com/center/list.action?d=2&s=4096&t=&keyword=&search=0&page=1 2 | http://order.jd.com/center/list.action?d=2&s=4096&t=&keyword=&search=0&page=2 3 | http://order.jd.com/center/list.action?d=2&s=4096&t=&keyword=&search=0&page=3 4 | http://order.jd.com/center/list.action?d=2&s=4096&t=&keyword=&search=0&page=4 5 | http://order.jd.com/center/list.action?d=2&s=4096&t=&keyword=&search=0&page=5 6 | http://order.jd.com/center/list.action?d=2&s=4096&t=&keyword=&search=0&page=6 7 | http://order.jd.com/center/list.action?d=2&s=4096&t=&keyword=&search=0&page=7 8 | 
http://order.jd.com/center/list.action?d=2014&s=4096&t=&keyword=&search=0&page=1 9 | http://order.jd.com/center/list.action?d=2014&s=4096&t=&keyword=&search=0&page=2 10 | http://order.jd.com/center/list.action?d=2013&s=4096&t=&keyword=&search=0&page=1 11 | http://order.jd.com/center/list.action?d=2012&s=4096&t=&keyword=&search=0&page=1 12 | http://order.jd.com/center/list.action?search=0&d=3&s=4096&t= -------------------------------------------------------------------------------- /order_spider/src/spider_main/jd_spider.py: -------------------------------------------------------------------------------- 1 | # coding: GB18030 2 | from conf.project_paths import JD_URL_LIST_FILE, JD_DATA_DIR 3 | from parsers import jd_order_html_parser, jd_book_html_parser 4 | from downloaders import jd_page_downloader 5 | 6 | item_ids = [] 7 | for item_page_url in open(JD_URL_LIST_FILE): 8 | print "##############################" 9 | html_cont = jd_page_downloader.down_jd_page(item_page_url) 10 | item_ids.extend(jd_order_html_parser.get_item_ids(item_page_url, html_cont)) 11 | 12 | print item_ids 13 | 14 | fout_all_data = open("%s/my_all_data.txt" % JD_DATA_DIR, 'w') 15 | fout_book_data = open("%s/my_jd_books.txt" % JD_DATA_DIR, 'w') 16 | fout_not_book_data = open("%s/my_jd_not_books.txt" % JD_DATA_DIR, 'w') 17 | 18 | for item_id in set(item_ids): 19 | item_url = "http://item.jd.com/%s.html" % item_id 20 | print item_url 21 | 22 | item_html = jd_page_downloader.down_jd_page(item_url) 23 | if not item_html: 24 | continue 25 | 26 | item_data = jd_book_html_parser.get_jd_book_data(item_url, item_html) 27 | if item_data is not None: 28 | fout_all_data.write("\t".join((item_url, item_data["page_title"].encode('GB18030'))) + "\n") 29 | if item_data['is_book_page']: 30 | print item_data["title"], item_data["author"] 31 | fout_book_data.write("\t".join((item_data["title"], item_data["author"])) + "\n") 32 | else: 33 | print "not a book page:" + item_data["page_title"] 34 | 
fout_not_book_data.write(item_data["page_title"].encode('GB18030') + "\n") 35 | 36 | fout_all_data.flush() 37 | fout_all_data.close() 38 | fout_book_data.flush() 39 | fout_book_data.close() 40 | fout_not_book_data.flush() 41 | fout_not_book_data.close() 42 | --------------------------------------------------------------------------------