├── 2.png
├── 3.png
├── README.md
└── aiohttp_lianjia.py

/2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Crypto-KK/aiohttp-spider/HEAD/2.png
--------------------------------------------------------------------------------
/3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Crypto-KK/aiohttp-spider/HEAD/3.png
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# aiohttp-spider
An asynchronous Python spider for Lianjia (链家网) listings, built with asyncio, aiohttp and aiomysql.
# Screenshots
![Image text](https://github.com/PythonerKK/aiohttp-spider/blob/master/2.png)
![Image text](https://github.com/PythonerKK/aiohttp-spider/blob/master/3.png)
--------------------------------------------------------------------------------
/aiohttp_lianjia.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""
:author: KK
:url: http://github.com/PythonerKK
:copyright: © 2019 KK <705555262@qq.com.com>
"""
import asyncio
import re
import aiohttp
from pyquery import PyQuery
import aiomysql
from lxml import etree

pool = None
# sem = asyncio.Semaphore(4)  # caps the number of concurrent requests; left unset, the spider runs at full speed
stop = False
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'
}
MAX_PAGE = 10
TABLE_NAME = 'data'  # MySQL table name
city = 'zh'  # city abbreviation used in the lianjia subdomain
url = 'https://{}.lianjia.com/ershoufang/pg{}/'  # URL template for the listing pages
urls = []  # URLs of all listing pages
links_detail = set()  # detail-page links waiting to be crawled
crawled_links_detail = set()  # detail-page links already crawled, used for de-duplication


async def fetch(url, session):
    '''
    Fetch the page source with aiohttp.
    '''
    # async with sem:
    try:
        async with session.get(url, headers=headers, verify_ssl=False) as resp:
            if resp.status in [200, 201]:
                data = await resp.text()
                return data
    except Exception as e:
        print(e)
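
# A minimal sketch of how the commented-out semaphore above could be wired in
# to cap concurrency. `fetch_limited` is a hypothetical helper name used only
# for illustration; uncommenting `sem` and the `async with sem:` line inside
# fetch() achieves the same effect.
sem = asyncio.Semaphore(4)

async def fetch_limited(url, session):
    '''Like fetch(), but never more than 4 requests in flight at once.'''
    async with sem:
        return await fetch(url, session)
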
def extract_links(source):
    '''
    Extract the detail-page links from a listing page.
    '''
    pq = PyQuery(source)
    for link in pq.items("a"):
        _url = link.attr("href")
        # keep only detail-page URLs on the chosen city's lianjia subdomain
        if _url and re.match(r'https://.*?/\d+.html', _url) and '{}.lianjia.com'.format(city) in _url:
            links_detail.add(_url)

    print(links_detail)

def extract_elements(source):
    '''
    Extract the listing details from a detail page.
    '''
    try:
        dom = etree.HTML(source)
        id = dom.xpath('//link[@rel="canonical"]/@href')[0]
        title = dom.xpath('//title/text()')[0]
        price = dom.xpath('//span[@class="unitPriceValue"]/text()')[0]
        information = dict(re.compile('<li><span class="label">(.*?)</span>(.*?)</li>').findall(source))
        information.update(title=title, price=price, url=id)
        print(information)
        asyncio.ensure_future(save_to_database(information, pool=pool))

    except Exception:
        print('Failed to parse the detail page!')

async def save_to_database(information, pool):
    '''
    Save the data to MySQL using asynchronous IO.
    Note: if the table does not exist yet, it is created on the fly.
    '''
    COLstr = ''  # column definitions
    ROWstr = ''  # row values
    ColumnStyle = ' VARCHAR(255)'
    for key in information.keys():
        COLstr = COLstr + ' ' + key + ColumnStyle + ','
        ROWstr = (ROWstr + '"%s"' + ',') % (information[key])
    # insert the row asynchronously
    async with pool.acquire() as conn:
        async with conn.cursor() as cur:
            try:
                await cur.execute("SELECT * FROM %s" % (TABLE_NAME))  # probe: raises if the table is missing
                await cur.execute("INSERT INTO %s VALUES (%s)" % (TABLE_NAME, ROWstr[:-1]))
                print('Row inserted')
            except aiomysql.Error:
                try:
                    await cur.execute("CREATE TABLE %s (%s)" % (TABLE_NAME, COLstr[:-1]))
                    await cur.execute("INSERT INTO %s VALUES (%s)" % (TABLE_NAME, ROWstr[:-1]))
                except aiomysql.Error as e:
                    print('mysql error %d: %s' % (e.args[0], e.args[1]))
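
# Hedged sketch: the INSERT above builds its VALUES list with string
# formatting, which breaks as soon as a field contains a double quote and is
# open to SQL injection. aiomysql cursors also accept parameterized queries
# via cur.execute(sql, args), so a safer variant could look like the
# hypothetical helper below (`save_row_parameterized` is an illustrative name,
# not part of the original flow). Column names still come from the scraped
# keys, so they are quoted as identifiers but must be trusted.
async def save_row_parameterized(information, pool):
    columns = ', '.join('`{}`'.format(key) for key in information)
    placeholders = ', '.join(['%s'] * len(information))
    sql = 'INSERT INTO {} ({}) VALUES ({})'.format(TABLE_NAME, columns, placeholders)
    async with pool.acquire() as conn:
        async with conn.cursor() as cur:
            await cur.execute(sql, list(information.values()))  # values are escaped by the driver
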
async def handle_elements(link, session):
    '''
    Fetch a detail page and parse it.
    '''
    print('Fetching: {}'.format(link))
    source = await fetch(link, session)
    # mark the link as crawled
    crawled_links_detail.add(link)
    extract_elements(source)

async def consumer():
    '''
    Consume the links that have not been crawled yet.
    '''
    async with aiohttp.ClientSession() as session:
        while not stop:
            if len(urls) != 0:
                _url = urls.pop()
                source = await fetch(_url, session)
                print(_url)
                extract_links(source)

            if len(links_detail) == 0:
                print('No detail links to crawl at the moment')
                await asyncio.sleep(2)
                continue

            link = links_detail.pop()
            if link not in crawled_links_detail:
                asyncio.ensure_future(handle_elements(link, session))

async def main(loop):
    global pool
    pool = await aiomysql.create_pool(host='127.0.0.1', port=3306,
                                      user='root', password='998219',
                                      db='aiomysql_lianjia', loop=loop, charset='utf8',
                                      autocommit=True)

    for i in range(1, MAX_PAGE + 1):
        urls.append(url.format(city, str(i)))
    print('Pages to crawl: {}, starting...'.format(str(MAX_PAGE)))
    asyncio.ensure_future(consumer())

if __name__ == '__main__':
    loop = asyncio.get_event_loop()
    asyncio.ensure_future(main(loop))
    loop.run_forever()
--------------------------------------------------------------------------------
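
The `stop` flag above is never set to `True`, so `loop.run_forever()` keeps the process alive even after every listing and detail page has been handled. A minimal sketch of one possible shutdown rule follows; `stop_when_idle` and its idle threshold are illustrative assumptions, not part of the original script, and it would have to be scheduled from `main()` alongside `consumer()`.

async def stop_when_idle(idle_rounds=10):
    '''Illustrative watcher: flips the global stop flag and stops the loop
    once both the listing queue and the detail-link set stay empty.'''
    global stop
    idle = 0
    while not stop:
        await asyncio.sleep(1)
        if not urls and not links_detail:
            idle += 1
        else:
            idle = 0
        if idle >= idle_rounds:
            stop = True
            asyncio.get_running_loop().stop()  # ends loop.run_forever()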