├── 2.png
├── 3.png
├── README.md
└── aiohttp_lianjia.py

/2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Crypto-KK/aiohttp-spider/HEAD/2.png
--------------------------------------------------------------------------------
/3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Crypto-KK/aiohttp-spider/HEAD/3.png
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# aiohttp-spider
An asynchronous Python spider for Lianjia (链家网) listings, built with asyncio, aiohttp and aiomysql.
# Screenshots
![Image text](https://github.com/PythonerKK/aiohttp-spider/blob/master/2.png)
![Image text](https://github.com/PythonerKK/aiohttp-spider/blob/master/3.png)
--------------------------------------------------------------------------------
/aiohttp_lianjia.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""
:author: KK
:url: http://github.com/PythonerKK
:copyright: © 2019 KK <705555262@qq.com.com>
"""
import asyncio
import re
import aiohttp
from pyquery import PyQuery
import aiomysql
from lxml import etree

pool = None
# sem = asyncio.Semaphore(4)  # caps the number of concurrent requests; left unset, the spider runs at full speed
stop = False
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'
}
MAX_PAGE = 10
TABLE_NAME = 'data'  # MySQL table name
city = 'zh'  # city abbreviation used in the lianjia subdomain
url = 'https://{}.lianjia.com/ershoufang/pg{}/'  # URL template for the listing pages
urls = []  # URLs of all listing pages
links_detail = set()  # detail-page links waiting to be crawled
crawled_links_detail = set()  # detail-page links already crawled, used for de-duplication


async def fetch(url, session):
    '''
    Fetch the page source with aiohttp.
    '''
    # async with sem:
    try:
        async with session.get(url, headers=headers, verify_ssl=False) as resp:
            if resp.status in [200, 201]:
                data = await resp.text()
                return data
    except Exception as e:
        print(e)
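
# A minimal sketch of how the commented-out semaphore above could be wired in
# to cap concurrency. `fetch_limited` is a hypothetical helper name used only
# for illustration; uncommenting `sem` and the `async with sem:` line inside
# fetch() achieves the same effect.
sem = asyncio.Semaphore(4)

async def fetch_limited(url, session):
    '''Like fetch(), but never more than 4 requests in flight at once.'''
    async with sem:
        return await fetch(url, session)
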
def extract_links(source):
    '''
    Extract the detail-page links from a listing page.
    '''
    pq = PyQuery(source)
    for link in pq.items("a"):
        _url = link.attr("href")
        # keep only detail-page URLs on the chosen city's lianjia subdomain
        if _url and re.match(r'https://.*?/\d+.html', _url) and '{}.lianjia.com'.format(city) in _url:
            links_detail.add(_url)

    print(links_detail)

def extract_elements(source):
    '''
    Extract the listing details from a detail page.
    '''
    try:
        dom = etree.HTML(source)
        id = dom.xpath('//link[@rel="canonical"]/@href')[0]
        title = dom.xpath('//title/text()')[0]
        price = dom.xpath('//span[@class="unitPriceValue"]/text()')[0]
        information = dict(re.compile('<li><span class="label">(.*?)</span>(.*?)</li>').findall(source))
        information.update(title=title, price=price, url=id)
        print(information)
        asyncio.ensure_future(save_to_database(information, pool=pool))

    except Exception:
        print('Failed to parse the detail page!')

async def save_to_database(information, pool):
    '''
    Save the data to MySQL using asynchronous IO.
    Note: if the table does not exist yet, it is created on the fly.
    '''
    COLstr = ''  # column definitions
    ROWstr = ''  # row values
    ColumnStyle = ' VARCHAR(255)'
    for key in information.keys():
        COLstr = COLstr + ' ' + key + ColumnStyle + ','
        ROWstr = (ROWstr + '"%s"' + ',') % (information[key])
    # insert the row asynchronously
    async with pool.acquire() as conn:
        async with conn.cursor() as cur:
            try:
                await cur.execute("SELECT * FROM %s" % (TABLE_NAME))  # probe: raises if the table is missing
                await cur.execute("INSERT INTO %s VALUES (%s)" % (TABLE_NAME, ROWstr[:-1]))
                print('Row inserted')
            except aiomysql.Error:
                try:
                    await cur.execute("CREATE TABLE %s (%s)" % (TABLE_NAME, COLstr[:-1]))
                    await cur.execute("INSERT INTO %s VALUES (%s)" % (TABLE_NAME, ROWstr[:-1]))
                except aiomysql.Error as e:
                    print('mysql error %d: %s' % (e.args[0], e.args[1]))
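
# Hedged sketch: the INSERT above builds its VALUES list with string
# formatting, which breaks as soon as a field contains a double quote and is
# open to SQL injection. aiomysql cursors also accept parameterized queries
# via cur.execute(sql, args), so a safer variant could look like the
# hypothetical helper below (`save_row_parameterized` is an illustrative name,
# not part of the original flow). Column names still come from the scraped
# keys, so they are quoted as identifiers but must be trusted.
async def save_row_parameterized(information, pool):
    columns = ', '.join('`{}`'.format(key) for key in information)
    placeholders = ', '.join(['%s'] * len(information))
    sql = 'INSERT INTO {} ({}) VALUES ({})'.format(TABLE_NAME, columns, placeholders)
    async with pool.acquire() as conn:
        async with conn.cursor() as cur:
            await cur.execute(sql, list(information.values()))  # values are escaped by the driver
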
async def handle_elements(link, session):
    '''
    Fetch a detail page and parse it.
    '''
    print('Fetching: {}'.format(link))
    source = await fetch(link, session)
    # mark the link as crawled
    crawled_links_detail.add(link)
    extract_elements(source)

async def consumer():
    '''
    Consume the links that have not been crawled yet.
    '''
    async with aiohttp.ClientSession() as session:
        while not stop:
            if len(urls) != 0:
                _url = urls.pop()
                source = await fetch(_url, session)
                print(_url)
                extract_links(source)

            if len(links_detail) == 0:
                print('No detail links to crawl at the moment')
                await asyncio.sleep(2)
                continue

            link = links_detail.pop()
            if link not in crawled_links_detail:
                asyncio.ensure_future(handle_elements(link, session))

async def main(loop):
    global pool
    pool = await aiomysql.create_pool(host='127.0.0.1', port=3306,
                                      user='root', password='998219',
                                      db='aiomysql_lianjia', loop=loop, charset='utf8',
                                      autocommit=True)

    for i in range(1, MAX_PAGE + 1):
        urls.append(url.format(city, str(i)))
    print('Pages to crawl: {}, starting...'.format(str(MAX_PAGE)))
    asyncio.ensure_future(consumer())

if __name__ == '__main__':
    loop = asyncio.get_event_loop()
    asyncio.ensure_future(main(loop))
    loop.run_forever()
--------------------------------------------------------------------------------
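
The `stop` flag above is never set to `True`, so `loop.run_forever()` keeps the process alive even after every listing and detail page has been handled. A minimal sketch of one possible shutdown rule follows; `stop_when_idle` and its idle threshold are illustrative assumptions, not part of the original script, and it would have to be scheduled from `main()` alongside `consumer()`.

async def stop_when_idle(idle_rounds=10):
    '''Illustrative watcher: flips the global stop flag and stops the loop
    once both the listing queue and the detail-link set stay empty.'''
    global stop
    idle = 0
    while not stop:
        await asyncio.sleep(1)
        if not urls and not links_detail:
            idle += 1
        else:
            idle = 0
        if idle >= idle_rounds:
            stop = True
            asyncio.get_running_loop().stop()  # ends loop.run_forever()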