├── 58同城
│   └── 58同城网址获取.py
├── 99藏书网
│   ├── 99藏书网小说_1.py
│   ├── 99藏书网小说_2.py
│   └── 99藏书网小说_3.py
├── LICENSE
├── README.md
├── 丁香医生
│   └── 疾病标签.py
├── 京东
│   └── 京东商品爬虫_1.py
├── 从零开始的网络爬虫
│   ├── 01_基础网站爬虫.py
│   ├── 02_基础网站爬虫.py
│   ├── 03_基础网站爬虫.py
│   ├── 04_无HTTPS证书网站爬虫.py
│   ├── 05_HTTP认证网站爬虫.py
│   ├── 06_慢速网站爬虫.py
│   ├── 07_异步加载网站爬虫.py
│   ├── 08_动态页面渲染网站爬虫.js
│   ├── 08_动态页面渲染网站爬虫_1.py
│   ├── 08_动态页面渲染网站爬虫_2.py
│   ├── 09_无页码翻页网站爬虫_1.py
│   ├── 09_无页码翻页网站爬虫_2.py
│   ├── 10_异步智能页面网站爬虫.py
│   ├── 11_大批量动态页面网站爬虫.py
│   ├── 12_数据接口参数加密网站爬虫.py
│   ├── 13_模拟登陆网站爬虫_1.py
│   ├── 13_模拟登陆网站爬虫_2.py
│   ├── 15_CSS样式偏移网站爬虫.py
│   └── 21_模拟登陆网站爬虫.py
├── 代码示例
│   ├── JA3指纹.py
│   ├── JA3指纹破解.py
│   ├── URL参数修改.py
│   ├── 分布式爬虫示例
│   │   ├── 加入任务队列.py
│   │   └── 读取任务队列.py
│   ├── 加密解密.py
│   └── 重试装饰器示例.py
├── 其他网络爬虫
│   ├── GXNAS壁纸爬虫_1.py
│   ├── GXNAS壁纸爬虫_2.py
│   ├── myCancerGenome.py
│   ├── 下厨房网爬虫.py
│   ├── 北京市政务数据资源网爬虫.py
│   ├── 小米步数.py
│   ├── 彼岸图网爬虫.py
│   ├── 抖音无水印视频爬虫.py
│   ├── 有道翻译.py
│   ├── 百度百科爬虫项目.py
│   ├── 糗事百科爬虫.py
│   ├── 豆瓣Top250爬虫.py
│   └── 铅笔小说网爬虫.py
├── 前程无忧
│   ├── 51job爬虫_1.py
│   └── 51job爬虫_2.py
├── 哔哩哔哩
│   ├── B站弹幕爬虫.py
│   ├── B站评论爬虫.py
│   └── msyh.ttc
├── 好豆网
│   └── 好豆网爬虫.py
├── 淘宝
│   └── 淘宝数据爬虫.py
├── 美食天下
│   └── 美食天下.py
├── 超能网
│   └── 超能网文章内容爬虫.py
└── 链家
    └── 链家二手房爬虫_1.py

/58同城/58同城网址获取.py:
--------------------------------------------------------------------------------
from requests_html import HTMLSession
import json


def get_html():
    session = HTMLSession()
    html = session.get('https://www.58.com/changecity.html?catepath=chuzu')
    html.html.render()
    data_1 = html.html.find('.hot-city')
    data_2 = html.html.find('.content-city')
    url = {item.text: 'https:' + item.attrs['href'] for item in data_1}
    for item in data_2:
        url[item.text] = 'https:' + item.attrs['href']
    return url


def save(data):
    with open('area.txt', 'w+') as file:
        data = json.dumps(data)
        file.write(data)


def main():
    data = get_html()
    save(data)


if __name__ == '__main__':
    main()

--------------------------------------------------------------------------------
/99藏书网/99藏书网小说_1.py:
--------------------------------------------------------------------------------
from CharacterFilter import clean
from pyppeteer import launch
import asyncio
import re
import time


async def get_html(url):
    """Scrape a single chapter page."""
    browser = await launch({'headless': True})
    page = await browser.newPage()
    await page.setViewport({'width': 1920, 'height': 1080})
    await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chro'
                            'me/89.0.4389.90 Safari/537.36')
    await page.evaluateOnNewDocument('''() =>{
        Object.defineProperties(navigator,{ webdriver:{ get: () => false } });
        window.navigator.chrome = { runtime: {}, };
        Object.defineProperty(navigator, 'languages', { get: () => ['en-US', 'en'] });
        Object.defineProperty(navigator, 'plugins', { get: () => [1, 2, 3, 4, 5,6], });
    }''')
    await page.goto(url)
    title = await page.content()
    title = re.findall(re.compile(r'(.*?)_在线阅读_九九藏书网'), title)
    if not title:
        await browser.close()
        print('出错了')
        return
    title = clean(title[0])
    for _ in range(20):  # Increase the number of iterations if the captured content is incomplete
        await page.keyboard.press('End')
        await page.waitFor(500)
    await page.waitFor(3000)
    png = await page.querySelector('div#content')
    await png.screenshot(path=f'{title}.png')
    await browser.close()


def main():
    url = input('输入网址:')
    start = time.time()
    if re.findall(
            re.compile(r'^http://www.99csw.com/book/[0-9]+?/[0-9]+?.htm$'),
            url):
        asyncio.get_event_loop().run_until_complete(get_html(url))
    print('程序运行时间:{:.6f}'.format(time.time() - start))


if __name__ == '__main__':
    """Scrape the chapter body and save it as an image."""
    main()

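Note: 99藏书网小说_1.py imports `clean` from a local `CharacterFilter` module that does not appear in the directory tree above. The script only uses it to turn the chapter title into a safe file name for the screenshot, so a minimal stand-in along the following lines should be enough to run it. This sketch is an assumption for illustration, not the repository's actual implementation.

import re


def clean(name):
    """Hypothetical stand-in for CharacterFilter.clean: strip characters that are illegal in file names."""
    return re.sub(r'[\\/:*?"<>|\r\n]', '', name).strip()


if __name__ == '__main__':
    print(clean('第1章: 开端?'))  # -> '第1章 开端'
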
--------------------------------------------------------------------------------
/99藏书网/99藏书网小说_2.py:
--------------------------------------------------------------------------------
import requests
from lxml import etree
from fake_useragent import FakeUserAgent
import base64
import time
import re


class Spider:
    def __init__(self):
        self.headers = {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,'
                      'application/signed-exchange;v=b3;q=0.9',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
            'Cache-Control': 'no-cache',
            'Connection': 'keep-alive',
            'DNT': '1',
            'Host': 'www.99csw.com',
            'Pragma': 'no-cache',
            'Upgrade-Insecure-Requests': '1',
            'User-Agent': FakeUserAgent().chrome}

    def get_html(self, url):
        response = requests.get(url, headers=self.headers)
        if response.status_code == 200:
            return response.content.decode('utf-8')
        return None

    def get_data(self, tree):
        if tree is None:
            return None
        meta = [int(i) for i in self.get_meta(tree)]
        text = self.get_text(tree)
        if len(meta) != len(text) - 1:
            return None
        result = ['' for _ in meta]
        x = 0
        for i, j in enumerate(meta):
            if j < 3:
                result[j] = text[i + 1]
                x += 1
            else:
                result[j - x] = text[i + 1]
                x += 2
        return result

    @staticmethod
    def get_meta(tree):
        meta = tree.xpath(".//meta[5]/@content")[0]
        if meta:
            data = base64.b64decode(meta).decode("ascii")
            return re.split(r'[A-Z]+%', data)
        return None

    @staticmethod
    def get_text(tree):
        if tree is None:
            return None
        content = [tree.xpath(".//div[@id='content']/h2/text()")[0]]
        content[1:1] = [i.text for i in tree.xpath(
            ".//div[@id='content']/div")]
        return content

    def run(self, url):
        html = self.get_html(url)
        tree = etree.HTML(html) if html else None
        data = self.get_data(tree)
        for i in data:
            print(i)


def main():
    url = input('输入网址:')
    if not re.findall(r'http://www.99csw.com/book/[0-9]+/[0-9]+.htm', url):
        print('网址格式错误!')
    else:
        start = time.time()
        spider = Spider()
        spider.run(url)
        print('程序运行时间:{:.6f}'.format(time.time() - start))


if __name__ == '__main__':
    main()

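Note: the trickiest part of 99藏书网小说_2.py above (and of 99藏书网小说_3.py below) is the index arithmetic that undoes the site's paragraph shuffling: the fifth `<meta>` tag carries a base64 string which, once decoded and split on `[A-Z]+%`, tells the script where each scraped `<div>` belongs in reading order. The following self-contained sketch replays that logic; the offsets and paragraph strings are invented purely to illustrate the rule and are not taken from a real page.

import base64
import re


def decrypt(meta_content):
    """Mirror of the scripts' decode step: base64 -> ASCII -> split on [A-Z]+%."""
    data = base64.b64decode(meta_content).decode('ascii')
    return [int(i) for i in re.split(r'[A-Z]+%', data)]


def reorder(meta, paragraphs):
    """Place each scraped paragraph at the reading-order index encoded in meta."""
    result = [''] * len(meta)
    x = 0
    for i, j in enumerate(meta):
        if j < 3:
            result[j] = paragraphs[i]
            x += 1
        else:
            result[j - x] = paragraphs[i]
            x += 2
    return result


if __name__ == '__main__':
    # Invented offsets that form a valid permutation under the rule above.
    raw = '3AB%0CD%8EF%1GH%2IJ%11'
    meta_content = base64.b64encode(raw.encode('ascii')).decode('ascii')
    shuffled = ['p1', 'p2', 'p3', 'p4', 'p5', 'p6']
    print(reorder(decrypt(meta_content), shuffled))
    # -> ['p2', 'p4', 'p5', 'p1', 'p6', 'p3']
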
--------------------------------------------------------------------------------
/99藏书网/99藏书网小说_3.py:
--------------------------------------------------------------------------------
import requests
from lxml import etree
from fake_useragent import FakeUserAgent
import time
import re
import base64


def get_meta(url):
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,'
                  'application/signed-exchange;v=b3;q=0.9',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
        'Cache-Control': 'no-cache',
        'Connection': 'keep-alive',
        'DNT': '1',
        'Host': 'www.99csw.com',
        'Pragma': 'no-cache',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': FakeUserAgent().chrome}
    response = requests.get(url, headers=headers)
    tree = etree.HTML(response.content)
    return tree.xpath('//meta[5]/@content')[0]


def decrypt(text):
    data = base64.b64decode(text).decode("ascii")
    return [int(i) for i in re.split(r'[A-Z]+%', data)]


def get_data(text):
    data = []
    i = 0
    for j in text:
        if j < 3:
            data.append(j)
            i += 1
        else:
            data.append(j - i)
            i += 2
    return data


def main():
    url = input('输入网址:')
    start = time.time()
    meta = get_meta(url)
    data = get_data(decrypt(meta))
    print(data)
    print('程序运行时间:{:.6f}'.format(time.time() - start))


if __name__ == '__main__':
    main()

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Python Web Scraping Examples

**A collection of data-scraping examples for a wide range of websites. All spider code is open source and may be used for study and personal use; it must not be used for illegal purposes.**

_**2021-12-19: the project will continue to be updated from time to time.**_

## Case Studies

1. Basic website spider (1)
2. Basic website spider (2)
3. Basic website spider (3)
4. Site without a valid HTTPS certificate
5. Site behind HTTP authentication
6. Slow-responding site
7. Asynchronously loaded site
8. Dynamically rendered pages
9. Pagination without page numbers
10. Asynchronous "smart" pages
11. Dynamic pages in large batches
12. Data API with encrypted request parameters
13. Simulated login
14. CSS style-offset obfuscation

## Spider List

* Douban Top 250 movies
* JD.com product data
* Taobao product data
* Qiushibaike jokes
* Netbian (彼岸图网) wallpapers
* GXNAS wallpapers
* Douyin watermark-free videos
* Bilibili video danmaku
* Bilibili video comments
* Pencil Novel (铅笔小说网) fiction
* Bcy (半次元) watermark-free images
* Xiachufang (下厨房) recipes
* Baidu Baike project
* Beijing municipal open-data portal
* 51job job listings
* My Cancer Genome database
* Youdao online translation
* Lianjia second-hand housing data
* Meishi Tianxia (美食天下) recipes
* DXY (丁香医生) medical data
* 99csw (九九藏书网) novels
* Haodou (好豆网) recipes

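Note: the 丁香医生 spider below relies on a common pattern: the data is not in the visible HTML but in a JSON literal assigned to `window.$$data` inside a `<script>` tag, which the script pulls out with a regular expression and parses with `json.loads`. Here is a minimal, self-contained sketch of that extraction step; the embedded HTML snippet and field values are invented for illustration and do not reproduce the real dxy.com payload.

import json
import re

SAMPLE_HTML = '''
<script>
window.$$data={"diseases": [{"tag_list": [{"tag_id": 1, "tag_name": "感冒"}]}]}
</script>
'''


def extract_state(html):
    """Pull the JSON literal assigned to window.$$data out of a page."""
    match = re.search(r'window\.\$\$data=(\{.*\})', html)
    if not match:
        raise ValueError('window.$$data not found')
    return json.loads(match.group(1))


if __name__ == '__main__':
    state = extract_state(SAMPLE_HTML)
    print(state['diseases'][0]['tag_list'][0]['tag_name'])  # -> 感冒
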
--------------------------------------------------------------------------------
/丁香医生/疾病标签.py:
--------------------------------------------------------------------------------
import json
import re

from fake_useragent import FakeUserAgent
from requests_html import HTMLSession


def get_link(session, headers):
    url = 'https://dxy.com/diseases'
    response = session.get(url, headers=headers)
    link = [
        i.attrs['href']
        for i in response.html.find('div.section-filter-box-wrap.normal > a')
    ]
    return session, link


def get_disease_link(session, link, headers):
    json_re = re.compile(r'window.\$\$data=(.*)')
    disease = []
    tag_name = []
    for i in link:
        response = session.get(i, headers=headers)
        script = response.html.find('script')
        if not script[3]:
            print('匹配 script 失败!')
            break
        json_ = re.findall(json_re, script[3].text)
        if not json_:
            print('JSON 格式化失败!')
            break
        data = json.loads(json_[0])
        for j in data['diseases'][1:]:
            for x in j['tag_list']:
                tag_id = x['tag_id']
                tag_name.append(x['tag_name'])
                disease.append(tag_id)
        break  # debug code
    return session, disease, tag_name


def get_tag(session, disease, headers):
    url = 'https://dxy.com/disease/{}/detail'
    data = []
    for i in disease:
        response = session.get(url.format(i), headers=headers)
        data.append([j.text for j in response.html.find(
            'div.tag-content-tags > a')])
        break  # debug code
    return data


def main():
    session = HTMLSession()
    headers = {
        'user-agent': FakeUserAgent().chrome,
        'referer': 'https://dxy.com/',
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,applic'
                  'ation/signed-exchange;v=b3;q=0.9',
        'accept-encoding': 'gzip, deflate, br',
        'accept-language': 'zh-CN,zh;q=0.9,en;q=0.8',
        'upgrade-insecure-requests': '1'}
    session, link = get_link(session, headers)
    session, disease, tag_name = get_disease_link(session, link, headers)
    data = get_tag(session, disease, headers)
    # if len(tag_name) != len(data):
    #     print('爬取发生异常!')
    #     exit()
    print(data)


if __name__ == '__main__':
    main()

--------------------------------------------------------------------------------
/京东/京东商品爬虫_1.py:
--------------------------------------------------------------------------------
from bs4 import BeautifulSoup
from pyppeteer.launcher import launch
import asyncio
import time
import random
from fake_useragent import FakeUserAgent
import csv


def get_url(keyword, pages):
    url_list = []
    while 1 <= pages <= 100:
        if pages == 1:
            url = 'https://search.jd.com/Search?keyword={}'.format(
                keyword)
        else:
            url = 'https://search.jd.com/Search?keyword={}&page={}'.format(
                keyword, pages * 2 - 1)
        pages -= 1
        url_list.append(url)
    if not url_list:
        raise ValueError('页数输入错误')
    url_list.reverse()
    return url_list


def deal_data(html):
    soup = BeautifulSoup(html, 'lxml')
    data = []
    for item in soup.select('ul.gl-warp.clearfix > li.gl-item'):
        _ = []
        cache = item.select('div.p-price > strong > i')
        price = cache[0].text.strip() if cache else None
        cache = item.select('div.p-name.p-name-type-2 > a')
        link = 'https:' + cache[0]['href'] if cache else None
        cache = item.select('div.p-name.p-name-type-2 > a > em')
        if cache and len(cache[0].text) <= 4:
            cache = item.select('div.p-name.p-name-type-2 > a > i')
        describe = cache[0].text.replace('\n', '').strip() if cache else None
        cache = item.select('div.p-commit > strong > a')
        evaluate = cache[0].text.strip() if cache else None
        cache = item.select('div.p-shop > span > a')
        shop_name = cache[0].text.strip() if cache else None
        cache = item.select('div.p-icons > i')
        label = [i.text.strip() for i in cache] if cache else None
        _.append(link)
        _.append(price)
        _.append(describe)
        _.append(evaluate)
        _.append(shop_name)
        _.append(label)
        data.append(_)
    return data


def save_data(data):
    with open(f'{time.time()}.csv', 'w', encoding='utf-8', newline='') as f:
        f = csv.writer(f, delimiter=',')
        f.writerows(data)


async def get_html(url):
    browser = await launch()
    page = await browser.newPage()
    await page.setUserAgent(FakeUserAgent().chrome)
    await page.evaluateOnNewDocument('Object.defineProperty(navigator, "webdriver", {get: () => undefined})')
    for i in url:
        await page.goto(i)
        await page.waitFor(random.randrange(2, 5, 1))
        for _ in range(12):
            await page.keyboard.press('PageDown')
            time.sleep(random.randrange(1, 4, 1))
        html = await page.content()
        data = deal_data(html)
        save_data(data)
    await browser.close()


def main():
    kw = input('搜索关键字:')
    pages = int(input('爬取页数:'))
    urls = get_url(kw, pages)
    asyncio.get_event_loop().run_until_complete(get_html(urls))


if __name__ == '__main__':
    """Scrape with a headless browser."""
    main()

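Note: `save_data()` in the JD spider above writes one CSV file per results page with no header row, so the columns are documented only by the order in which `deal_data()` appends them. If named columns are wanted, a small variant such as the sketch below could be used instead; the English header names and the sample row are assumptions chosen to match the order link, price, describe, evaluate, shop_name, label.

import csv
import time


def save_data_with_header(rows):
    """Write scraped rows to a timestamped CSV file, with a header row first."""
    header = ['link', 'price', 'describe', 'evaluate', 'shop_name', 'label']
    with open(f'{int(time.time())}.csv', 'w', encoding='utf-8', newline='') as f:
        writer = csv.writer(f, delimiter=',')
        writer.writerow(header)
        writer.writerows(rows)


if __name__ == '__main__':
    # Made-up sample row, in the same field order as deal_data() produces.
    save_data_with_header([
        ['https://item.jd.com/100.html', '199.00', '示例商品', '1万+', '示例旗舰店', ['自营']],
    ])
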
--------------------------------------------------------------------------------
/从零开始的网络爬虫/01_基础网站爬虫.py:
--------------------------------------------------------------------------------
import urllib.request
from bs4 import BeautifulSoup
import re
import sqlite3
import time


def get_html(url):
    template = url + 'page/'
    html = ''
    for page in range(1, 11):
        url = template + str(page)
        response = open_url(url)
        html += response
    return html


def open_url(url):
    response = urllib.request.urlopen(url)
    return response.read().decode('utf-8')


def get_data(html):
    findname = re.compile(
        r'(.*?) - (.*?)')
    findtype = re.compile(
        r'