├── 2.png
├── 3.png
├── README.md
└── aiohttp_lianjia.py

/2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Crypto-KK/aiohttp-spider/HEAD/2.png
--------------------------------------------------------------------------------

/3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Crypto-KK/aiohttp-spider/HEAD/3.png
--------------------------------------------------------------------------------

/README.md:
--------------------------------------------------------------------------------
# aiohttp-spider
An asynchronous Python spider for the Lianjia (链家) real-estate site, built with asyncio, aiohttp and aiomysql.
# Screenshots


--------------------------------------------------------------------------------

/aiohttp_lianjia.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""
:author: KK
:url: http://github.com/PythonerKK
:copyright: © 2019 KK <705555262@qq.com>
"""
import asyncio
import re
import aiohttp
from pyquery import PyQuery
import aiomysql
from lxml import etree

pool = ''  # aiomysql connection pool, created later
# sem = asyncio.Semaphore(4)  # limits concurrency; without it the spider runs at full speed
stop = False
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'
}
MAX_PAGE = 10
TABLE_NAME = 'data'  # database table name
city = 'zh'  # city abbreviation used in the subdomain
url = 'https://{}.lianjia.com/ershoufang/pg{}/'  # URL template for listing pages
urls = []  # URLs of all listing pages
links_detail = set()  # detail-page links still to be crawled
crawled_links_detail = set()  # links already crawled, kept for de-duplication


async def fetch(url, session):
    '''
    Fetch a page's source with aiohttp.
    '''
    # async with sem:
    try:
        async with session.get(url, headers=headers, verify_ssl=False) as resp:
            if resp.status in [200, 201]:
                data = await resp.text()
                return data
    except Exception as e:
        print(e)


def extract_links(source):
    '''
    Extract detail-page links from a listing page.
    '''
    pq = PyQuery(source)
    for link in pq.items("a"):
        _url = link.attr("href")
        # str.find() returns -1 (truthy) when not found, so test substring membership instead
        if _url and re.match(r'https://.*?/\d+\.html', _url) \
                and '{}.lianjia.com'.format(city) in _url:
            links_detail.add(_url)

    print(links_detail)


def extract_elements(source):
    '''
    Extract the listing details from a detail page.
    '''
    try:
        dom = etree.HTML(source)
        id = dom.xpath('//link[@rel="canonical"]/@href')[0]
        title = dom.xpath('//title/text()')[0]
        price = dom.xpath('//span[@class="unitPriceValue"]/text()')[0]
        information = dict(re.compile('