├── results
    └── cars.txt
├── .gitignore
├── requirements
├── setting.py
├── app.py
├── readme.md
└── lib.py


/results/cars.txt:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | *.pyc
2 | 


--------------------------------------------------------------------------------
/requirements:
--------------------------------------------------------------------------------
1 | beautifulsoup4==4.4.0
2 | requests==2.7.0
3 | 


--------------------------------------------------------------------------------
/setting.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | # encoding: utf-8
 3 | 
 4 | #  伪装配置
 5 | headers = {
 6 |     "Accept-Encoding":"gzip",
 7 |     "Cache-Control": "max-age=0",
 8 |     'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/43.0.2357.134 Safari/537.36',
 9 |     "Accept-Language":  "zh-CN,zh;q=0.8,en;q=0.6,en-US;q=0.4,zh-TW;q=0.2",
10 |     "Connection" :  "keep-alive",
11 |     "Accept-Encoding" :  "gzip, deflate",
12 |     "Accept":"text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8"
13 | }
14 | 
15 | # 输出文件路径
16 | file_output = './results/cars.txt'
17 | 
18 | # 主域名,用于拼接完整URL
19 | domain = "http://car.autohome.com.cn"
20 | 
21 | # 最开始的品牌入口
22 | start_url = 'http://car.autohome.com.cn/AsLeftMenu/As_LeftListNew.ashx?typeId=1%20&brandId=0%20&fctId=0%20&seriesId=0'
23 | 
24 | # 每一页下载等待时间
25 | wait_sec = 1
26 | 


--------------------------------------------------------------------------------
/app.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | # encoding: utf-8
 3 | 
 4 | import requests
 5 | from bs4 import BeautifulSoup
 6 | from lib import get_cars
 7 | from setting import headers, domain, start_url, file_output
 8 | import json
 9 | 
10 | output_file = open(file_output, 'a')
11 | 
12 | # 第一步提取 品牌列表
13 | # 第二部通过品牌列表提取 车辆详细列表(下一页)
14 | 
15 | 
16 | #print url
17 | # 设置隐藏爬虫痕迹
18 | result = requests.get(start_url, headers=headers)
19 | #result.encoding = 'gbk'  ## @TODO 这一行干嘛用的
20 | # @TODO 验证是否下载成功
21 | html_content = result.content
22 | html_content = html_content.decode('gbk').encode('utf-8')
23 | # @TODO 保存原始文档
24 | # beautifulsoup 设置解析器（不然迁移可能出错）
25 | html_content_soup = BeautifulSoup(html_content,'html.parser')
26 | brands_tag =  html_content_soup.find_all('li')
27 | 
28 | 
29 | for brand_tag in brands_tag:
30 |     cars = []
31 |     brand_name  = brand_tag.get_text(',').split(',')[0]
32 |     brand_href = domain + brand_tag.a['href']
33 |     cars =  get_cars(brand_name, brand_href)
34 |     # 输出中文问题
35 |     for car in cars:
36 |         line = json.dumps(car, encoding="UTF-8", ensure_ascii=False)
37 |         print type(line)
38 |         # 输出 Unicode 到文件
39 |         line = line.encode('UTF-8')
40 |         output_file.write(line)
41 |         output_file.write('\n')
42 | 


--------------------------------------------------------------------------------
/readme.md:
--------------------------------------------------------------------------------
 1 | ###项目简介
 2 | 
 3 | 
 4 | 本项目主要用于介绍使用 requests 和 BeautifulSoup 进行爬虫开发，最后采集到的条目格式如下:
 5 | 
 6 | 
 7 | ```
 8 | {
 9 |     "外观颜色": "晨露白,布里奇沃特青铜,马达加斯加橙,鲜绿,塞勒涅青铜,深蓝色,栗子黑", 
10 |     "name": "Vanquish", 
11 |     "url": "http://car.autohome.com.cn/price/brand-35.html", 
12 |     "brand": "阿斯顿·马丁", 
13 |     "车身结构": "硬顶跑车", 
14 |     "变速箱": "自动", 
15 |     "发动机": "6.0L", 
16 |     "级别": "跑车", 
17 |     "price": "526.88-628.00万"
18 | }
19 | ```
20 | 
21 | 
22 | * requests 文档: [http://docs.python-requests.org/en/latest/](http://docs.python-requests.org/en/latest/)
23 | * BeautifulSoup 文档: [http://www.crummy.com/software/BeautifulSoup/bs4/doc/](http://www.crummy.com/software/BeautifulSoup/bs4/doc/)
24 | * Chrome DevTools 文档: [https://developer.chrome.com/devtools](https://developer.chrome.com/devtools)
25 | 
26 | ### 使用须知
27 | 
28 | 1. clone 本项目
29 | 
30 | ```
31 | # git clone https://github.com/William-Sang/autohome_crawler.git
32 | ```
33 | 2. 配置依赖
34 | 
35 |     ```
36 | 	# cd autohome_crawler
37 | 	# pip install -r requirements
38 | 	```
39 | 	
40 | 3. 修改配置(如果有需要)
41 | 
42 | 
43 |     ```
44 | 	# vim setting.py
45 | 	```
46 | 	
47 | 4. 执行爬取任务，默认结果会下载到 requests 目录下
48 | 
49 | 
50 |     ```
51 | 	# python app.py
52 | 	```
53 | 
54 | ### 需要加强功能
55 | 
56 | 1.  下载重试功能 http://www.coglib.com/~icordasc/blog/2014/12/retries-in-requests.html
57 | 
58 | ### 可能出现的问题
59 | 
60 | 1. 抓取具体车型信息的时候，会出现颜色无法抓取成功的情况。（有时）
61 | 


--------------------------------------------------------------------------------
/lib.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | # encoding: utf-8
 3 | 
 4 | def get_cars(brand_name, start_url):
 5 |     print 'start_url', start_url
 6 |     from setting import headers, wait_sec, domain
 7 |     from bs4 import BeautifulSoup
 8 |     import requests
 9 |     import time
10 |     cars = []
11 |     # 设置referer
12 |     headers['referer'] = start_url
13 | 
14 |     # 设置起始抓取页面
15 |     now_url = start_url
16 |     # next_url 为空是结束抓取，返回数据的条件
17 |     next_url = ''
18 |     while True:
19 |         result = requests.get(now_url, headers=headers)
20 |         print result.request.headers
21 |         html_content = result.content
22 |         html_content = result.content.decode('gbk').encode('utf-8')
23 |         html_content_soup = BeautifulSoup(html_content, 'html.parser')
24 |         cars_tag = html_content_soup.find_all(class_='list-cont-bg')
25 |         # 结束逻辑
26 |         # 1. 一开始就没有翻页
27 |         # 2. 唯一获取 page-item-next
28 |         # 3. 循环
29 | 
30 |         if html_content_soup.find(class_="price-page") is None:
31 |             next_url = ''
32 |         else:
33 |             next_url_tag = html_content_soup.find(class_="price-page").find(class_="page-item-next")
34 |             # 结束翻页
35 |             if next_url_tag['href'] == 'javascript:void(0)':
36 |                 next_url = ''
37 |             else:
38 |                 next_url = domain + next_url_tag['href']
39 |         print 'next_url is ', next_url
40 |         for car_tag in cars_tag:
41 |             car = {}
42 |             car['brand'] = brand_name
43 |             car['url'] = now_url
44 |             car['name'] = car_tag.find(class_='main-title').get_text(strip=True)
45 |             car['price'] = car_tag.find(class_='font-arial').get_text(strip=True)
46 |             # @TODO 颜色还有问题
47 |             for car_attr_tag  in car_tag.find('ul', class_='lever-ul').find_all('li'):
48 |                 car_attr = car_attr_tag.get_text(',',strip=True)
49 |                 if len(car_attr.split(u'：')) < 2 :
50 |                     continue
51 |                 car_attr_key = car_attr.split(u'：')[0]
52 |                 car_attr_value = car_attr.split(u'：')[1]
53 |                 # 直接空格无效，因为gbk无法转换'\xa0'字符(http://www.educity.cn/wenda/350839.html)
54 |                 car_attr_key = car_attr_key.replace(u'\xa0', '')
55 |                 car[car_attr_key] = car_attr_value.strip(',')
56 |             cars.append(car)
57 |         time.sleep(wait_sec)
58 |         # 抓取结束，返回数据
59 |         if next_url == '':
60 |             return cars
61 | 
62 |         # 更换页面
63 |         now_url = next_url
64 | 


--------------------------------------------------------------------------------