├── results
│   └── cars.txt
├── .gitignore
├── requirements
├── setting.py
├── app.py
├── readme.md
└── lib.py

/results/cars.txt:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
*.pyc
--------------------------------------------------------------------------------
/requirements:
--------------------------------------------------------------------------------
beautifulsoup4==4.4.0
requests==2.7.0
--------------------------------------------------------------------------------
/setting.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
# encoding: utf-8

# Request headers that disguise the crawler as a regular browser
headers = {
    "Cache-Control": "max-age=0",
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/43.0.2357.134 Safari/537.36",
    "Accept-Language": "zh-CN,zh;q=0.8,en;q=0.6,en-US;q=0.4,zh-TW;q=0.2",
    "Connection": "keep-alive",
    "Accept-Encoding": "gzip, deflate",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8"
}

# Output file path
file_output = './results/cars.txt'

# Main domain, used to build full URLs
domain = "http://car.autohome.com.cn"

# Entry URL for the brand list
start_url = 'http://car.autohome.com.cn/AsLeftMenu/As_LeftListNew.ashx?typeId=1%20&brandId=0%20&fctId=0%20&seriesId=0'

# Seconds to wait after downloading each page
wait_sec = 1
--------------------------------------------------------------------------------
/app.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
# encoding: utf-8

import requests
from bs4 import BeautifulSoup
from lib import get_cars
from setting import headers, domain, start_url, file_output
import json

output_file = open(file_output, 'a')

# Step 1: fetch the brand list
# Step 2: for each brand, fetch its detailed car list (following pagination)


# Send the request with the disguised headers to hide the crawler
result = requests.get(start_url, headers=headers)
# result.encoding = 'gbk'  # @TODO what is this line for?
# @TODO verify that the download succeeded
html_content = result.content
html_content = html_content.decode('gbk').encode('utf-8')
# @TODO save the raw document
# Set the BeautifulSoup parser explicitly (otherwise behaviour may differ between environments)
html_content_soup = BeautifulSoup(html_content, 'html.parser')
brands_tag = html_content_soup.find_all('li')


for brand_tag in brands_tag:
    brand_name = brand_tag.get_text(',').split(',')[0]
    brand_href = domain + brand_tag.a['href']
    cars = get_cars(brand_name, brand_href)
    # Make sure Chinese characters are written out correctly
    for car in cars:
        line = json.dumps(car, encoding="UTF-8", ensure_ascii=False)
        # Encode the Unicode line before writing it to the file
        line = line.encode('UTF-8')
        output_file.write(line)
        output_file.write('\n')

output_file.close()
--------------------------------------------------------------------------------
/readme.md:
--------------------------------------------------------------------------------
### Project overview


This project demonstrates how to build a crawler with requests and BeautifulSoup. Each collected entry has the following format:


```
{
    "外观颜色": "晨露白,布里奇沃特青铜,马达加斯加橙,鲜绿,塞勒涅青铜,深蓝色,栗子黑",
    "name": "Vanquish",
    "url": "http://car.autohome.com.cn/price/brand-35.html",
    "brand": "阿斯顿·马丁",
    "车身结构": "硬顶跑车",
    "变速箱": "自动",
    "发动机": "6.0L",
    "级别": "跑车",
    "price": "526.88-628.00万"
}
```


* requests documentation: [http://docs.python-requests.org/en/latest/](http://docs.python-requests.org/en/latest/)
* BeautifulSoup documentation: [http://www.crummy.com/software/BeautifulSoup/bs4/doc/](http://www.crummy.com/software/BeautifulSoup/bs4/doc/)
* Chrome DevTools documentation: [https://developer.chrome.com/devtools](https://developer.chrome.com/devtools)

### Usage

1. Clone this project

    ```
    # git clone https://github.com/William-Sang/autohome_crawler.git
    ```
2. Install the dependencies

    ```
    # cd autohome_crawler
    # pip install -r requirements
    ```

3. Adjust the configuration (if needed)

    ```
    # vim setting.py
    ```

4. Run the crawl; by default the results are written to the `results` directory

    ```
    # python app.py
    ```

### Features to add

1. Download retry support (see the sketch below): http://www.coglib.com/~icordasc/blog/2014/12/retries-in-requests.html
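
A minimal sketch of what this could look like, following the approach described in the linked post and assuming the `Retry` helper bundled with requests; the retry counts and backoff value are illustrative only, and none of this is wired into `app.py` yet:

```
import requests
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry

from setting import headers, start_url

# Retry failed downloads a few times with a growing delay (illustrative values)
retry = Retry(total=3, backoff_factor=1, status_forcelist=[500, 502, 503, 504])

session = requests.Session()
session.mount('http://', HTTPAdapter(max_retries=retry))

# Drop-in replacement for requests.get(start_url, headers=headers)
result = session.get(start_url, headers=headers)
```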
"price": "526.88-628.00万" 18 | } 19 | ``` 20 | 21 | 22 | * requests 文档: [http://docs.python-requests.org/en/latest/](http://docs.python-requests.org/en/latest/) 23 | * BeautifulSoup 文档: [http://www.crummy.com/software/BeautifulSoup/bs4/doc/](http://www.crummy.com/software/BeautifulSoup/bs4/doc/) 24 | * Chrome DevTools 文档: [https://developer.chrome.com/devtools](https://developer.chrome.com/devtools) 25 | 26 | ### 使用须知 27 | 28 | 1. clone 本项目 29 | 30 | ``` 31 | # git clone https://github.com/William-Sang/autohome_crawler.git 32 | ``` 33 | 2. 配置依赖 34 | 35 | ``` 36 | # cd autohome_crawler 37 | # pip install -r requirements 38 | ``` 39 | 40 | 3. 修改配置(如果有需要) 41 | 42 | 43 | ``` 44 | # vim setting.py 45 | ``` 46 | 47 | 4. 执行爬取任务,默认结果会下载到 requests 目录下 48 | 49 | 50 | ``` 51 | # python app.py 52 | ``` 53 | 54 | ### 需要加强功能 55 | 56 | 1. 下载重试功能 http://www.coglib.com/~icordasc/blog/2014/12/retries-in-requests.html 57 | 58 | ### 可能出现的问题 59 | 60 | 1. 抓取具体车型信息的时候,会出现颜色无法抓取成功的情况。(有时) 61 | -------------------------------------------------------------------------------- /lib.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encoding: utf-8 3 | 4 | def get_cars(brand_name, start_url): 5 | print 'start_url', start_url 6 | from setting import headers, wait_sec, domain 7 | from bs4 import BeautifulSoup 8 | import requests 9 | import time 10 | cars = [] 11 | # 设置referer 12 | headers['referer'] = start_url 13 | 14 | # 设置起始抓取页面 15 | now_url = start_url 16 | # next_url 为空是结束抓取,返回数据的条件 17 | next_url = '' 18 | while True: 19 | result = requests.get(now_url, headers=headers) 20 | print result.request.headers 21 | html_content = result.content 22 | html_content = result.content.decode('gbk').encode('utf-8') 23 | html_content_soup = BeautifulSoup(html_content, 'html.parser') 24 | cars_tag = html_content_soup.find_all(class_='list-cont-bg') 25 | # 结束逻辑 26 | # 1. 一开始就没有翻页 27 | # 2. 唯一获取 page-item-next 28 | # 3. 循环 29 | 30 | if html_content_soup.find(class_="price-page") is None: 31 | next_url = '' 32 | else: 33 | next_url_tag = html_content_soup.find(class_="price-page").find(class_="page-item-next") 34 | # 结束翻页 35 | if next_url_tag['href'] == 'javascript:void(0)': 36 | next_url = '' 37 | else: 38 | next_url = domain + next_url_tag['href'] 39 | print 'next_url is ', next_url 40 | for car_tag in cars_tag: 41 | car = {} 42 | car['brand'] = brand_name 43 | car['url'] = now_url 44 | car['name'] = car_tag.find(class_='main-title').get_text(strip=True) 45 | car['price'] = car_tag.find(class_='font-arial').get_text(strip=True) 46 | # @TODO 颜色还有问题 47 | for car_attr_tag in car_tag.find('ul', class_='lever-ul').find_all('li'): 48 | car_attr = car_attr_tag.get_text(',',strip=True) 49 | if len(car_attr.split(u':')) < 2 : 50 | continue 51 | car_attr_key = car_attr.split(u':')[0] 52 | car_attr_value = car_attr.split(u':')[1] 53 | # 直接空格无效,因为gbk无法转换'\xa0'字符(http://www.educity.cn/wenda/350839.html) 54 | car_attr_key = car_attr_key.replace(u'\xa0', '') 55 | car[car_attr_key] = car_attr_value.strip(',') 56 | cars.append(car) 57 | time.sleep(wait_sec) 58 | # 抓取结束,返回数据 59 | if next_url == '': 60 | return cars 61 | 62 | # 更换页面 63 | now_url = next_url 64 | --------------------------------------------------------------------------------