├── order_spider
├── src
│ ├── conf
│ │ ├── __init__.py
│ │ └── project_paths.py
│ ├── test
│ │ ├── __init__.py
│ │ ├── test_download_cookie.py
│ │ ├── test_splite.py
│ │ └── cookie.txt
│ ├── common
│ │ ├── __init__.py
│ │ ├── chrome_cookie.py
│ │ └── excel_utils.py
│ ├── cookies
│ │ ├── __init__.py
│ │ └── jd_cookies.py
│ ├── outputers
│ │ ├── __init__.py
│ │ └── generate_my_booklist.py
│ ├── parsers
│ │ ├── __init__.py
│ │ ├── jd_book_html_parser.py
│ │ └── jd_order_html_parser.py
│ ├── downloaders
│ │ ├── __init__.py
│ │ └── jd_page_downloader.py
│ └── spider_main
│ │ ├── __init__.py
│ │ └── jd_spider.py
├── datas
│ ├── cookie_file
│ │ └── jd_cookie_file.txt
│ ├── my_books
│ │ ├── input_my_booklist.xlsx
│ │ └── output_my_booklist.html
│ └── url_file
│ │ └── jd_url_list.txt
├── .settings
│ └── org.eclipse.core.resources.prefs
├── .project
└── .pydevproject
└── README.md
/order_spider/src/conf/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/order_spider/src/test/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/order_spider/src/common/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/order_spider/src/cookies/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/order_spider/src/outputers/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/order_spider/src/parsers/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/order_spider/src/downloaders/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/order_spider/src/spider_main/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/order_spider/datas/cookie_file/jd_cookie_file.txt:
--------------------------------------------------------------------------------
1 | xxx
--------------------------------------------------------------------------------
/order_spider/src/cookies/jd_cookies.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/peiss/order_spider/HEAD/order_spider/src/cookies/jd_cookies.py
--------------------------------------------------------------------------------
/order_spider/src/common/chrome_cookie.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/peiss/order_spider/HEAD/order_spider/src/common/chrome_cookie.py
--------------------------------------------------------------------------------
/order_spider/src/parsers/jd_book_html_parser.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/peiss/order_spider/HEAD/order_spider/src/parsers/jd_book_html_parser.py
--------------------------------------------------------------------------------
/order_spider/src/test/test_download_cookie.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/peiss/order_spider/HEAD/order_spider/src/test/test_download_cookie.py
--------------------------------------------------------------------------------
/order_spider/src/parsers/jd_order_html_parser.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/peiss/order_spider/HEAD/order_spider/src/parsers/jd_order_html_parser.py
--------------------------------------------------------------------------------
/order_spider/datas/my_books/input_my_booklist.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/peiss/order_spider/HEAD/order_spider/datas/my_books/input_my_booklist.xlsx
--------------------------------------------------------------------------------
/order_spider/datas/my_books/output_my_booklist.html:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/peiss/order_spider/HEAD/order_spider/datas/my_books/output_my_booklist.html
--------------------------------------------------------------------------------
/order_spider/src/outputers/generate_my_booklist.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/peiss/order_spider/HEAD/order_spider/src/outputers/generate_my_booklist.py
--------------------------------------------------------------------------------
/order_spider/.settings/org.eclipse.core.resources.prefs:
--------------------------------------------------------------------------------
1 | eclipse.preferences.version=1
2 | encoding//datas/my_books/output_my_booklist.html=gbk
3 | encoding//src/common/chrome_cookie.py=GB18030
4 | encoding//src/common/excel_utils.py=gbk
5 | encoding//src/conf/project_paths.py=GB18030
6 | encoding//src/cookies/jd_cookies.py=GB18030
7 | encoding//src/outputers/generate_my_booklist.py=gbk
8 |
--------------------------------------------------------------------------------
/order_spider/src/common/excel_utils.py:
--------------------------------------------------------------------------------
1 | # coding: gb18030
2 | import xlrd
3 |
def read_excel_to_list(excel_file_path):
    """Read the first sheet of an Excel workbook into a list of rows.

    Args:
        excel_file_path: path to an .xls/.xlsx file readable by xlrd.

    Returns:
        A list with one entry per row of the first sheet; each entry is
        the list of cell values produced by xlrd's ``Sheet.row_values``.
        An empty sheet yields an empty list.
    """
    workbook = xlrd.open_workbook(excel_file_path)
    sheet = workbook.sheets()[0]  # only the first sheet is consumed
    # Comprehension replaces the original manual while-loop with a
    # hand-maintained row counter.
    return [sheet.row_values(row) for row in range(sheet.nrows)]
13 |
--------------------------------------------------------------------------------
/order_spider/.project:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <projectDescription>
3 | 	<name>order_spider</name>
4 | 	<comment></comment>
5 | 	<projects>
6 | 	</projects>
7 | 	<buildSpec>
8 | 		<buildCommand>
9 | 			<name>org.python.pydev.PyDevBuilder</name>
10 | 			<arguments>
11 | 			</arguments>
12 | 		</buildCommand>
13 | 	</buildSpec>
14 | 	<natures>
15 | 		<nature>org.python.pydev.pythonNature</nature>
16 | 	</natures>
17 | </projectDescription>
18 |
--------------------------------------------------------------------------------
/order_spider/.pydevproject:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8" standalone="no"?>
2 | <?eclipse-pydev version="1.0"?><pydev_project>
3 | <pydev_pathproperty name="org.python.pydev.PROJECT_SOURCE_PATH">
4 | <path>/${PROJECT_DIR_NAME}/src</path>
5 | </pydev_pathproperty>
6 | <pydev_property name="org.python.pydev.PYTHON_PROJECT_VERSION">python 2.7</pydev_property>
7 | <pydev_property name="org.python.pydev.PYTHON_PROJECT_INTERPRETER">D:\software\Python27\python.exe</pydev_property>
8 | </pydev_project>
9 |
--------------------------------------------------------------------------------
/order_spider/src/test/test_splite.py:
--------------------------------------------------------------------------------
1 | # coding: utf8
2 |
3 |
4 | ll = ['1077341', '495791,1466722', '883195,883194,883191', '1308786', '854240', '1150545,1251186,1251185,1311029,1251179,1549769,1251317', '1592641', '1311494,828164,258238,1275478,1203725,755185,1077341,319867,258214,1362641,501085,1018779', '11671959,10096879', '198930', '1598371', '1538076', '1052836117', '188057,188056', '11253922', '188057,188056', '1205735,1205735,1205736,1205736', '1581748625', '1551208220']
5 |
6 | for e in ll:
7 | print "#########"
8 | print e
9 | print e.split(',')
--------------------------------------------------------------------------------
/order_spider/src/test/cookie.txt:
--------------------------------------------------------------------------------
1 | # Netscape HTTP Cookie File
2 | # http://curl.haxx.se/rfc/cookie_spec.html
3 | # This is a generated file! Do not edit.
4 |
5 | .baidu.com TRUE / FALSE 3594881356 BAIDUID 18F3BA256478FF6AED28301058DFEE4D:FG=1
6 | .baidu.com TRUE / FALSE 3594881356 BIDUPSID 18F3BA256478FF6AED28301058DFEE4D
7 | .baidu.com TRUE / FALSE H_PS_PSSID 17772_17520_1455_17619_17923_17900_17943_17783_17927_10212_17971_17001_17073_15851_12404
8 | .baidu.com TRUE / FALSE 3594881356 PSTM 1447397709
9 | www.baidu.com FALSE / FALSE BDSVRTM 0
10 | www.baidu.com FALSE / FALSE BD_HOME 0
11 |
--------------------------------------------------------------------------------
/order_spider/src/conf/project_paths.py:
--------------------------------------------------------------------------------
# coding: GB18030
# Central path configuration for the order_spider project.

# Root of the checked-out project; adjust this to the local checkout.
PROJECT_DIR = "D:/workbench/python/order_spider/order_spider"

# Source tree and data tree underneath the project root.
PROJECT_CODES_DIR = PROJECT_DIR + "/src"
PROJECT_DATA_DIR = PROJECT_DIR + "/datas"

# Cookie storage for jd.com sessions.
COOKIE_FILE_DIR = PROJECT_DATA_DIR + "/cookie_file"
JD_COOKIE_FILE = COOKIE_FILE_DIR + "/jd_cookie_file.txt"

# Order-page URL list for jd.com.
URL_FILE_DIR = PROJECT_DATA_DIR + "/url_file"
JD_URL_LIST_FILE = URL_FILE_DIR + "/jd_url_list.txt"

# Crawl output directory.
JD_DATA_DIR = PROJECT_DATA_DIR + "/output/jd_data"

# Book-list input spreadsheet and the generated HTML output.
MY_BOOK_LIST_EXCEL = PROJECT_DATA_DIR + "/my_books/input_my_booklist.xlsx"
MY_BOOK_LIST_OUTHTML = PROJECT_DATA_DIR + "/my_books/output_my_booklist.html"
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # order_spider
2 | 抓取购物网站(京东、淘宝、亚马逊)的个人订单页面的物品列表以及物品详情
3 |
4 | # 自动抓取京东的个人订单,并提取出书籍列表
5 |
6 | 1、按照/order_spider/src/common/chrome_cookie.py的提示,将获取的cookie,粘贴到/order_spider/datas/cookie_file/jd_cookie_file.txt
7 | 2、进入京东的个人订单页面,把所有的订单页面的URL,添加到/order_spider/datas/url_file/jd_url_list.txt
8 |
9 | 3、修改/order_spider/src/conf/project_paths.py中的PROJECT_DIR为自己的代码路径
10 |
11 | 4、运行/order_spider/src/spider_main/jd_spider.py
12 |
13 | 5、生成结果在/order_spider/datas/output/jd_data,包括所有的商品、书籍商品、非书籍商品三个列表文件
14 |
15 |
16 | # 自动生成博客书单
17 | 步骤:
18 |
19 | 1、修改excel中的书单,地址在/order_spider/datas/my_books/input_my_booklist.xlsx
20 |
21 | 2、运行/order_spider/src/outputers/generate_my_booklist.py即可生成书单的HTML
22 |
23 | 3、输出HTML地址为:/order_spider/datas/my_books/output_my_booklist.html
24 |
25 | 4、复制HTML中的内容,到博客文章页面即可
26 |
27 |
28 |
29 | 生成的效果地址:
30 | http://www.crazyant.net/my_book_list
31 |
32 |
--------------------------------------------------------------------------------
/order_spider/src/downloaders/jd_page_downloader.py:
--------------------------------------------------------------------------------
1 | # coding: GB18030
2 | '''
3 | @author: crazyant
4 | '''
5 | import urllib2
6 |
7 | from cookies import jd_cookies
8 |
9 |
def get_jd_opener():
    """Build a urllib2 opener whose requests carry the stored JD cookies."""
    cookie_handler = urllib2.HTTPCookieProcessor(jd_cookies.get_cookiejar())
    return urllib2.build_opener(cookie_handler)
14 |
def get_jd_request(url):
    """Wrap *url* in a Request carrying browser-like headers.

    The User-Agent/Accept headers mimic a desktop browser so jd.com
    serves its normal HTML pages.
    """
    request = urllib2.Request(url)
    browser_headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64)",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
    }
    for header_name, header_value in browser_headers.items():
        request.add_header(header_name, header_value)
    return request
20 |
21 | def down_jd_page(jd_url):
22 | print "download jd url:%s" % jd_url
23 | urllib2.install_opener(get_jd_opener())
24 | response = urllib2.urlopen(get_jd_request(jd_url))
25 | if response.getcode() != 200:
26 | print "url can not access"
27 | return None
28 | html_doc = response.read()
29 | return html_doc
30 |
31 |
--------------------------------------------------------------------------------
/order_spider/datas/url_file/jd_url_list.txt:
--------------------------------------------------------------------------------
1 | http://order.jd.com/center/list.action?d=2&s=4096&t=&keyword=&search=0&page=1
2 | http://order.jd.com/center/list.action?d=2&s=4096&t=&keyword=&search=0&page=2
3 | http://order.jd.com/center/list.action?d=2&s=4096&t=&keyword=&search=0&page=3
4 | http://order.jd.com/center/list.action?d=2&s=4096&t=&keyword=&search=0&page=4
5 | http://order.jd.com/center/list.action?d=2&s=4096&t=&keyword=&search=0&page=5
6 | http://order.jd.com/center/list.action?d=2&s=4096&t=&keyword=&search=0&page=6
7 | http://order.jd.com/center/list.action?d=2&s=4096&t=&keyword=&search=0&page=7
8 | http://order.jd.com/center/list.action?d=2014&s=4096&t=&keyword=&search=0&page=1
9 | http://order.jd.com/center/list.action?d=2014&s=4096&t=&keyword=&search=0&page=2
10 | http://order.jd.com/center/list.action?d=2013&s=4096&t=&keyword=&search=0&page=1
11 | http://order.jd.com/center/list.action?d=2012&s=4096&t=&keyword=&search=0&page=1
12 | http://order.jd.com/center/list.action?search=0&d=3&s=4096&t=
--------------------------------------------------------------------------------
/order_spider/src/spider_main/jd_spider.py:
--------------------------------------------------------------------------------
# coding: GB18030
# Entry-point script: crawl the JD order-list pages named in
# JD_URL_LIST_FILE, collect the item ids found on them, then download
# every item page and split the results into book / non-book lists
# written under JD_DATA_DIR.
from conf.project_paths import JD_URL_LIST_FILE, JD_DATA_DIR
from parsers import jd_order_html_parser, jd_book_html_parser
from downloaders import jd_page_downloader

# Phase 1: walk every order-list URL and gather the item ids it references.
item_ids = []
for item_page_url in open(JD_URL_LIST_FILE):
    print "##############################"
    html_cont = jd_page_downloader.down_jd_page(item_page_url)
    item_ids.extend(jd_order_html_parser.get_item_ids(item_page_url, html_cont))

print item_ids

# Phase 2 output files.
# NOTE(review): opened without try/finally or `with`; an exception in the
# loop below would leave them unclosed.
fout_all_data = open("%s/my_all_data.txt" % JD_DATA_DIR, 'w')
fout_book_data = open("%s/my_jd_books.txt" % JD_DATA_DIR, 'w')
fout_not_book_data = open("%s/my_jd_not_books.txt" % JD_DATA_DIR, 'w')

# set() drops duplicate ids so each item page is fetched only once.
for item_id in set(item_ids):
    item_url = "http://item.jd.com/%s.html" % item_id
    print item_url

    item_html = jd_page_downloader.down_jd_page(item_url)
    if not item_html:
        # download failed (downloader returns None on non-200) -- skip item
        continue

    item_data = jd_book_html_parser.get_jd_book_data(item_url, item_html)
    if item_data is not None:
        # page_title is encoded to GB18030 before writing; presumably the
        # parser returns unicode strings -- TODO confirm against the parser
        fout_all_data.write("\t".join((item_url, item_data["page_title"].encode('GB18030'))) + "\n")
        if item_data['is_book_page']:
            print item_data["title"], item_data["author"]
            # NOTE(review): title/author are written WITHOUT .encode('GB18030'),
            # unlike page_title above -- verify they are already byte strings,
            # otherwise this can raise UnicodeEncodeError for non-ASCII titles
            fout_book_data.write("\t".join((item_data["title"], item_data["author"])) + "\n")
        else:
            print "not a book page:" + item_data["page_title"]
            fout_not_book_data.write(item_data["page_title"].encode('GB18030') + "\n")

# Flush and close all three result files.
fout_all_data.flush()
fout_all_data.close()
fout_book_data.flush()
fout_book_data.close()
fout_not_book_data.flush()
fout_not_book_data.close()
42 |
--------------------------------------------------------------------------------