├── LICENSE ├── README.md ├── blob ├── crawler.py ├── main.py └── templates │ └── index.html ├── c01.py ├── c02.py ├── c03.py ├── c04.py ├── e01.py ├── e02.py ├── e03.py ├── e04.py ├── h01.py ├── h02.py ├── h03.py ├── h04.py ├── h05.py ├── h06.py ├── n01.py ├── n02.py ├── n03.py ├── n04.py ├── n05.py ├── n06.py ├── n07.py ├── python_begin.py ├── requirements.txt ├── s01.py ├── s02.py ├── s03.py ├── s04.py ├── s05.py ├── s06.py ├── s07.py ├── s08.py └── s4-1.py /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 hhuayuan 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Spiderbuf 爬虫练习网站 2 | Spiderbuf 爬虫实战案例示例代码 3 | 4 | ## 官方网站 5 | https://spiderbuf.cn 6 | 7 | [爬虫实战练习列表](https://spiderbuf.cn/web-scraping-practices) 8 | 9 | 专注于 Python 爬虫练习的网站. 10 | 11 | 提供丰富的爬虫教程、爬虫案例解析和爬虫练习题. 
12 | 13 | Python爬虫开发强化练习,在矛与盾的攻防中不断提高技术水平,通过大量的爬虫实战掌握常见的爬虫与反爬套路。 14 | 15 | 引导式爬虫案例 + 免费爬虫视频教程,以闯关的形式挑战各个爬虫任务,培养爬虫开发的直觉及经验,验证自身爬虫开发与反爬虫实力的时候到了。 16 | 17 | ## 代码运行环境 18 | Ubuntu 20.04.6 LTS 19 | macOS 15+ 20 | 21 | Python3.8+ 22 | 23 | ## 更新日志 24 | | 编号 | 名称 | 更新日期 | 25 | | ---- | ---- | ---- | 26 | | [C08](https://spiderbuf.cn/web-scraping-practice/scraper-practice-c08 "JS逆向爬虫实战练习(金融数据)") | JS逆向爬虫实战练习(金融数据) | 2025-05-31 | 27 | | [C07](https://spiderbuf.cn/web-scraping-practice/scraper-practice-c07 "JavaScript 逆向爬虫实战案例") | JavaScript 逆向爬虫实战案例 | 2025-05-14 | 28 | | [C06](https://spiderbuf.cn/web-scraping-practice/scraper-practice-c06 "JavaScript 逆向爬虫实战案例") | JavaScript 逆向爬虫实战案例 | 2025-04-15 | 29 | | [C05](https://spiderbuf.cn/web-scraping-practice/scraper-practice-c05 "爬虫实战练习") | 爬虫实战练习 | 2025-02-26 | 30 | | [C04](https://spiderbuf.cn/web-scraping-practice/scraper-practice-c04 "爬虫实战练习") | 爬虫实战练习 | 2025-02-11 | 31 | | [C03](https://spiderbuf.cn/web-scraping-practice/scraper-practice-c03 "爬虫实战练习") | 爬虫实战练习 | 2025-01-15 | 32 | | [C02](https://spiderbuf.cn/web-scraping-practice/scraper-practice-c02 "爬虫实战练习") | 爬虫实战练习 | 2024-12-16 | 33 | | [C01](https://spiderbuf.cn/web-scraping-practice/scraper-practice-c01 "爬虫实战练习") | 爬虫实战练习 | 2024-11-17 | 34 | | [N07](https://spiderbuf.cn/web-scraping-practice/random-css-classname "随机CSS样式类名,无Element ID") | 随机CSS样式类名,无Element ID | 2024-09-08 | 35 | | [E04](https://spiderbuf.cn/web-scraping-practice/block-ip-proxy "被屏蔽IP后使用代理服务器爬取页面") | 被屏蔽IP后使用代理服务器爬取页面 | 2024-07-23 | 36 | | [N06](https://spiderbuf.cn/web-scraping-practice/scraping-form-rpa "网页表单爬取(RPA初阶)") | 网页表单爬取(RPA初阶) | 2024-03-26 | 37 | | [N05](https://spiderbuf.cn/web-scraping-practice/css-sprites "CSS Sprites (雪碧图)反爬") | CSS Sprites (雪碧图)反爬 | 2024-02-18 | 38 | | [N04](https://spiderbuf.cn/web-scraping-practice/css-pseudo-elements "CSS伪元素反爬") | CSS伪元素反爬 | 2024-01-11 | 39 | | [H06](https://spiderbuf.cn/web-scraping-practice/selenium-fingerprint-anti-scraper "初识浏览器指纹:Selenium是如何被反爬的") | 初识浏览器指纹:Selenium是如何被反爬的 | 2023-12-22 | 40 | | [H05](https://spiderbuf.cn/web-scraping-practice/javascript-reverse-timestamp "js逆向破解时间戳反爬") | js逆向破解时间戳反爬 | 2023-11-26 | 41 | | [H04](https://spiderbuf.cn/web-scraping-practice/javascript-confuse-encrypt-reverse "js加密混淆及简单反调试") | js加密混淆及简单反调试 | 2023-11-11 | 42 | | [H03](https://spiderbuf.cn/web-scraping-practice/scraping-scroll-load "网页滚动加载的原理及爬取(JavaScript加密混淆逆向基础)") | 网页滚动加载的原理及爬取(JavaScript加密混淆逆向基础) | 2023-10-20 | 43 | | [H02](https://spiderbuf.cn/web-scraping-practice/scraping-douban-movies-xpath-advanced "高分电影列表复杂页面的解析(仿豆瓣电影)- xpath高级用法") | 高分电影列表复杂页面的解析(仿豆瓣电影)- xpath高级用法 | 2023-10-10 | 44 | | [N03](https://spiderbuf.cn/web-scraping-practice/scraper-bypass-request-limit "限制访问频率不低于1秒") | 限制访问频率不低于1秒 | 2023-07-02 | 45 | | [N02](https://spiderbuf.cn/web-scraping-practice/scraping-images-base64 "使用Base64编码的图片爬取与解码还原") | 使用Base64编码的图片爬取与解码还原 | 2023-06-30 | 46 | | [H01](https://spiderbuf.cn/web-scraping-practice/scraping-css-confuse-offset "CSS样式偏移混淆文本内容的解析与爬取") | CSS样式偏移混淆文本内容的解析与爬取 | 2023-06-25 | 47 | | [N01](https://spiderbuf.cn/web-scraping-practice/user-agent-referrer "User-Agent与Referer校验反爬") | User-Agent与Referer校验反爬 | 2022-11-05 | 48 | | [E03](https://spiderbuf.cn/web-scraping-practice/scraping-random-pagination "无序号翻页") | 无序号翻页 | 2022-11-01 | 49 | | [E02](https://spiderbuf.cn/web-scraping-practice/web-scraping-with-captcha "带验证码的登录爬取") | 带验证码的登录爬取 | 2022-09-17 | 50 | | [E01](https://spiderbuf.cn/web-scraping-practice/scraper-login-username-password "用户名密码登录爬取后台数据") | 
用户名密码登录爬取后台数据 | 2022-08-21 | 51 | | [S08](https://spiderbuf.cn/web-scraping-practice/scraper-via-http-post "http post请求的数据爬取") | http post请求的数据爬取 | 2021-06-21 | 52 | | [S07](https://spiderbuf.cn/web-scraping-practice/scraping-ajax-api "ajax动态加载数据的爬取") | ajax动态加载数据的爬取 | 2021-06-21 | 53 | | [S06](https://spiderbuf.cn/web-scraping-practice/scraping-iframe "带iframe的页面源码分析及数据爬取") | 带iframe的页面源码分析及数据爬取 | 2021-06-21 | 54 | | [S05](https://spiderbuf.cn/web-scraping-practice/scraping-images-from-web "网页图片的爬取及本地保存") | 网页图片的爬取及本地保存 | 2021-06-21 | 55 | | [S04](https://spiderbuf.cn/web-scraping-practice/web-pagination-scraper "分页参数分析及翻页爬取") | 分页参数分析及翻页爬取 | 2021-06-21 | 56 | | [S03](https://spiderbuf.cn/web-scraping-practice/lxml-xpath-advanced "lxml库进阶语法及解析练习") | lxml库进阶语法及解析练习 | 2021-06-21 | 57 | | [S02](https://spiderbuf.cn/web-scraping-practice/scraper-http-header "http请求分析及头构造使用") | http请求分析及头构造使用 | 2021-06-21 | 58 | | [S01](https://spiderbuf.cn/web-scraping-practice/requests-lxml-for-scraping-beginner "requests库及lxml库入门") | requests库及lxml库入门 | 2021-06-21 | 59 | 60 | # 课程 61 | [《深入了解Python爬虫攻防》](https://www.udemy.com/course/python-spiderbuf/?referralCode=77D640F3DB5A310151DB "深入了解Python爬虫攻防") 62 | 63 | [《Axure RP 9 从入门到精通:打造高保真交互原型》](https://www.udemy.com/course/axure-rp-9/?referralCode=3374A9C2D8B735FC54A1 "Axure RP 9 从入门到精通:打造高保真交互原型") 64 | -------------------------------------------------------------------------------- /blob/crawler.py: -------------------------------------------------------------------------------- 1 | import requests 2 | 3 | def download_video(video_url, output_path): 4 | try: 5 | # 发送 GET 请求 6 | response = requests.get(video_url, stream=True) 7 | 8 | # 如果请求成功 9 | if response.status_code == 200: 10 | # 打开文件并写入下载的数据 11 | with open(output_path, 'wb') as f: 12 | for chunk in response.iter_content(chunk_size=1024): 13 | if chunk: # 防止下载过程中产生空数据 14 | f.write(chunk) 15 | print(f"Video downloaded successfully: {output_path}") 16 | else: 17 | print(f"Failed to retrieve video, HTTP status code: {response.status_code}") 18 | 19 | except requests.exceptions.RequestException as e: 20 | print(f"An error occurred: {e}") 21 | 22 | if __name__ == "__main__": 23 | # 视频文件的 URL 24 | video_url = 'http://localhost:5000/video' # 替换成目标视频 URL 25 | 26 | # 本地保存的路径 27 | output_path = 'downloaded_video.mp4' 28 | 29 | # 下载视频 30 | download_video(video_url, output_path) 31 | -------------------------------------------------------------------------------- /blob/main.py: -------------------------------------------------------------------------------- 1 | from flask import Flask, Response, send_file, render_template 2 | import os 3 | 4 | app = Flask(__name__) 5 | 6 | @app.route('/') 7 | def index(): 8 | return render_template('index.html') 9 | 10 | @app.route('/video') 11 | def stream_video(): 12 | # 请自行找一个mp4视频文件,放在当前目录下(即与main.py同一个目录) 13 | video_path = os.path.join(os.path.dirname(os.path.abspath(__file__)),'spiderbuf.mp4') # 这里替换为你的视频文件路径 14 | print(video_path) 15 | if not os.path.exists(video_path): 16 | return "Video not found", 404 17 | 18 | # 打开视频文件,以二进制流的形式发送 19 | def generate_video(): 20 | with open(video_path, 'rb') as f: 21 | while chunk := f.read(1024 * 1024): # 每次读取 1MB 22 | yield chunk 23 | 24 | return Response(generate_video(), content_type='video/mp4') 25 | 26 | 27 | if __name__ == '__main__': 28 | app.run(debug=True) 29 | -------------------------------------------------------------------------------- /blob/templates/index.html: 
--------------------------------------------------------------------------------
(注:index.html 的 HTML 标签在导出时全部丢失,以下仅保留可恢复的内容)
  - 页面 <title> 及页面内标题文本均为「视频流爬虫 - Spiderbuf」
  - 页面中有一个元素(应为 <video>)的 src 指向 blob: 地址,形如 src="blob:https://spiderbuf.cn....."
  - 其余为加载视频流的 <script> 及闭合标签,内容未能从导出文本中恢复
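补充示例:blob/crawler.py 是直接请求 Flask 暴露的 /video 接口来下载视频,而页面里 <video> 的 src 是 blob: 地址,requests 无法直接抓取这种浏览器内部地址。如果遇到找不到底层接口、只看得到 blob: 地址的页面,下面是一个极简思路示例(非本仓库代码,仅作演示):假设页面中存在一个 <video> 元素,用 Selenium 在页面上下文中把 blob 读成 base64,再交给 Python 解码保存;示例中的访问地址、等待时间和保存文件名都是假设值。

```python
# coding=utf-8
# 思路示例(非本仓库代码):当页面只暴露 blob: 视频地址时,
# 借助 Selenium 在页面上下文中把 blob 读成 base64,再由 Python 解码保存。
import base64
import time

from selenium import webdriver

# 在页面里执行的脚本:blob: 地址只能在创建它的页面上下文中用 fetch 访问
JS_READ_BLOB = """
var done = arguments[arguments.length - 1];      // execute_async_script 提供的回调
var video = document.querySelector('video');     // 假设页面中存在一个 <video> 元素
fetch(video.src)
    .then(function (r) { return r.blob(); })
    .then(function (b) {
        var reader = new FileReader();
        reader.onload = function () {
            done(reader.result.split(',')[1]);   // 去掉 data:...;base64, 前缀
        };
        reader.readAsDataURL(b);
    });
"""

if __name__ == '__main__':
    client = webdriver.Chrome()
    client.set_script_timeout(60)                # 视频较大时读取与编码需要时间
    client.get('http://localhost:5000/')         # 即 blob/main.py 启动的演示页(地址仅为示例)
    time.sleep(5)                                # 粗略等待页面脚本把视频装载为 blob:
    b64_data = client.execute_async_script(JS_READ_BLOB)
    client.quit()

    with open('blob_video.mp4', 'wb') as f:      # 保存文件名仅为示例
        f.write(base64.b64decode(b64_data))
```

相比 crawler.py 直接请求 /video 接口的做法,这种方式不需要定位底层接口,但多了一次 base64 编解码、内存占用更高,只适合演示或小文件场景。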
17 | 18 | 39 | 40 | 41 | -------------------------------------------------------------------------------- /c01.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | 3 | import requests 4 | from lxml import etree 5 | import numpy as np 6 | 7 | base_url = 'https://spiderbuf.cn/web-scraping-practice/scraper-practice-c01/mnist' 8 | 9 | my_headers = { 10 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.164 Safari/537.36', 11 | 'Referer': 'https://spiderbuf.cn/web-scraping-practice/c01', 12 | 'Cookie': '__cgf3t=G0gzgFKDRlLtmZH7NrzqOb1x4pek1xNQk12KKc4g21Y-1731624199;'} 13 | 14 | 15 | html_bytes = requests.get(base_url, headers=my_headers).content 16 | html = html_bytes.decode() 17 | root = etree.HTML(html) 18 | with open('./data/c01/c01.html', 'w', encoding='utf-8') as f: 19 | f.write(html) 20 | # print(html) 21 | 22 | trs = root.xpath('//tbody/tr') 23 | 24 | 25 | pix1_arry = [] 26 | for tr in trs: 27 | tds = tr.xpath('td') 28 | # 把 pix1 列的值添加到数组 29 | pix1_arry.append([int(tds[1].text) if len(tds) > 1 else 0]) 30 | # 计算 pix1 列的平均值并四舍五入至两位小数 31 | print(round(np.mean(pix1_arry),2)) -------------------------------------------------------------------------------- /c02.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | 3 | import requests 4 | from lxml import etree 5 | from selenium import webdriver 6 | from selenium.webdriver import ChromeOptions, ActionChains 7 | from selenium.webdriver.common.by import By 8 | import time 9 | import base64 10 | import json 11 | import numpy as np 12 | 13 | 14 | base_url = 'https://spiderbuf.cn/web-scraping-practice/scraper-practice-c02' 15 | 16 | myheaders = { 17 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.164 Safari/537.36'} 18 | 19 | def getHTML(url,file_name=''): 20 | client = webdriver.Chrome() 21 | client.get(url) 22 | time.sleep(10) 23 | 24 | # 事件参数对象 25 | actionChains = ActionChains(client) 26 | 27 | # 捕捉滑块元素 28 | slide_btn = client.find_element(By.ID, 'slider') 29 | # 观察网站滑块移动的长度和位置 30 | actionChains.click_and_hold(slide_btn) 31 | actionChains.move_by_offset(220,0) 32 | # 这里要注意: 33 | # 以下三个是以上面的坐标(220,0)为起点来计算的 34 | # 所以最终移动的距离是220加上以下的累计 35 | actionChains.move_by_offset(11,0) 36 | actionChains.move_by_offset(13,0) 37 | actionChains.move_by_offset(10,0) 38 | 39 | actionChains.release() 40 | actionChains.perform() 41 | 42 | html = client.page_source 43 | print(html) 44 | client.quit() 45 | 46 | if file_name != '': 47 | with open(file_name, 'w', encoding='utf-8') as f: 48 | f.write(html) 49 | return html 50 | 51 | 52 | def parseHTML(html): 53 | root = etree.HTML(html) 54 | trs = root.xpath('//tr') 55 | 56 | prices = [] 57 | for tr in trs: 58 | tds = tr.xpath('./td') 59 | if len(tds) > 2: 60 | prices.append(int(tds[2].text)) 61 | print(prices) 62 | print(np.mean(prices)) 63 | 64 | 65 | if __name__ == '__main__': 66 | # example: 1 67 | html = getHTML(base_url, './data/c02/c02.html') 68 | parseHTML(html) 69 | 70 | # example: 2 71 | # html = requests.get(base_url, headers=myheaders).text 72 | # a = html.index('encryptedData = "') + 17 73 | # html = html[a:] 74 | # b = html.index('";') 75 | # html = html[:b] 76 | # print(html) 77 | # dic = eval(base64.b64decode(html.encode('utf-8'))) 78 | # objs = dic['flights'] 79 | # prices = [] 80 | # for obj in objs: 81 | # print(obj) 82 | # prices.append(obj['price']) 83 | 84 | # 
print(prices) 85 | # print(np.mean(prices)) 86 | 87 | -------------------------------------------------------------------------------- /c03.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | 3 | import requests 4 | from lxml import etree 5 | from selenium import webdriver 6 | from selenium.webdriver.common.by import By 7 | import time 8 | import json 9 | import hashlib 10 | import random 11 | import numpy as np 12 | 13 | 14 | base_url = 'https://spiderbuf.cn/web-scraping-practice/scraper-practice-c03' 15 | 16 | myheaders = { 17 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.164 Safari/537.36'} 18 | 19 | def getHTML(url,file_name=''): 20 | sepal_width_arr = [] 21 | client = webdriver.Chrome() 22 | client.get(url) 23 | time.sleep(5) 24 | html = client.page_source 25 | # print(html) 26 | parseHTML(html,sepal_width_arr) 27 | if file_name != '': 28 | with open(file_name + '_1.html', 'w', encoding='utf-8') as f: 29 | f.write(html) 30 | for i in range(1,5): 31 | client.find_elements(By.XPATH, '//ul/li/a')[i].click() 32 | time.sleep(5) 33 | html = client.page_source 34 | # print(html) 35 | parseHTML(html,sepal_width_arr) 36 | if file_name != '': 37 | with open(file_name + f'_{i+1}.html', 'w', encoding='utf-8') as f: 38 | f.write(html) 39 | 40 | client.quit() 41 | print(sepal_width_arr) 42 | print(np.sum(sepal_width_arr)) 43 | return html 44 | 45 | 46 | def parseHTML(html,sepal_width_arr): 47 | root = etree.HTML(html) 48 | trs = root.xpath('//tr') 49 | for tr in trs: 50 | tds = tr.xpath('./td') 51 | if len(tds) > 2: 52 | sepal_width_arr.append(float(tds[2].text)) 53 | 54 | 55 | 56 | 57 | if __name__ == '__main__': 58 | # example: 1 59 | # html = getHTML(base_url, './data/c03/c03') 60 | 61 | # example: 2 62 | sepal_width_arr = [] 63 | for i in range(1, 6): 64 | random_value = random.randint(2000, 10000) 65 | timestamp = int(time.time()) 66 | xorResult = i ^ timestamp 67 | md5_hash = hashlib.md5() 68 | md5_hash.update(f'{xorResult}{timestamp}'.encode('utf-8')) 69 | hash = md5_hash.hexdigest() 70 | payload = { 71 | 'random': random_value, 72 | 'timestamp': timestamp, 73 | 'hash': hash, 74 | 'xorResult': xorResult 75 | } 76 | # print(payload) 77 | json_response = requests.post(base_url, headers=myheaders,json=payload).text 78 | 79 | print(json_response) 80 | json_data = json.loads(json_response) 81 | for item in json_data: 82 | # print(item) 83 | sepal_width_arr.append(item['sepal_width']) 84 | 85 | print(sepal_width_arr) 86 | print(np.sum(sepal_width_arr)) 87 | 88 | -------------------------------------------------------------------------------- /c04.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # @Author: spiderbuf 3 | from lxml import etree 4 | from selenium import webdriver 5 | from selenium.webdriver import ChromeOptions, ActionChains 6 | from selenium.webdriver.common.by import By 7 | import time 8 | import random 9 | import numpy as np 10 | import re 11 | 12 | 13 | base_url = 'https://spiderbuf.cn/web-scraping-practice/scraper-practice-c04' 14 | 15 | myheaders = { 16 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.164 Safari/537.36'} 17 | 18 | 19 | if __name__ == '__main__': 20 | options = webdriver.ChromeOptions() 21 | options.add_argument('disable-infobars') 22 | options.set_capability('goog:loggingPrefs', {'browser': 'ALL'}) 23 | 24 | 
options.add_argument('--disable-blink-features=AutomationControlled') # 改变navigator.webdriver 属性值 25 | 26 | client = webdriver.Chrome(options=options) 27 | print('Getting page...') 28 | client.get(base_url) 29 | time.sleep(3) 30 | 31 | # 模拟用户在页面上滑动光标 32 | actionChains = ActionChains(client) 33 | actionChains.move_by_offset(430,330) 34 | for i in range(20): 35 | step = random.randint(1, 10) 36 | actionChains.move_by_offset(step,step).perform() 37 | 38 | checkbox = client.find_element(By.ID, 'captcha') 39 | checkbox.click() 40 | print('Checkbox clicked...') 41 | time.sleep(3) 42 | html = client.page_source 43 | # print(html) 44 | client.quit() 45 | 46 | with open('./data/c04/c04.html', 'w', encoding='utf-8') as f: 47 | f.write(html) 48 | 49 | root = etree.HTML(html) 50 | items = root.xpath('//div[@class="stats"]') 51 | results = [] 52 | for item in items: 53 | spans = item.xpath('.//span') 54 | s = ''.join(spans[3].xpath('string(.)')) 55 | results.append(int(re.findall('\d+',spans[0].text)[0]) + int(''.join(re.findall('\d+',s)))) 56 | 57 | print(np.average(results)) -------------------------------------------------------------------------------- /e01.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | 3 | import requests 4 | from lxml import etree 5 | 6 | url = 'https://spiderbuf.cn/web-scraping-practice/scraper-login-username-password/login' 7 | 8 | myheaders = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.164 Safari/537.36'} 9 | 10 | payload = {'username':'admin','password':'123456'} 11 | 12 | html = requests.post(url, headers=myheaders, data=payload).text 13 | print(html) 14 | 15 | f = open('./data/e01/e01.html', 'w', encoding='utf-8') 16 | f.write(html) 17 | f.close() 18 | 19 | root = etree.HTML(html) 20 | trs = root.xpath('//tr') 21 | 22 | f = open('./data/e01/data_e01.txt', 'w', encoding='utf-8') 23 | for tr in trs: 24 | tds = tr.xpath('./td') 25 | s = '' 26 | for td in tds: 27 | # print(td.text) 28 | s = s + str(td.text) + '|' 29 | print(s) 30 | if s != '': 31 | f.write(s + '\n') 32 | 33 | f.close() 34 | 35 | -------------------------------------------------------------------------------- /e02.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | 3 | import requests 4 | from lxml import etree 5 | 6 | url = 'https://spiderbuf.cn/web-scraping-practice/web-scraping-with-captcha/list' 7 | 8 | # 注意:要把Cookie改成自己的 9 | myheaders = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.164 Safari/537.36', 10 | 'Cookie':'admin=a66abb5684c45962d887564f08346e8d'} 11 | 12 | payload = {'username':'admin','password':'123456'} 13 | 14 | html = requests.get(url, headers=myheaders, data=payload).text 15 | print(html) 16 | # exit(); 17 | f = open('./data/e02/e02.html', 'w', encoding='utf-8') 18 | f.write(html) 19 | f.close() 20 | 21 | root = etree.HTML(html) 22 | trs = root.xpath('//tr') 23 | 24 | f = open('./data/e02/data_e02.txt', 'w', encoding='utf-8') 25 | for tr in trs: 26 | tds = tr.xpath('./td') 27 | s = '' 28 | for td in tds: 29 | # print(td.text) 30 | s = s + str(td.text) + '|' 31 | print(s) 32 | if s != '': 33 | f.write(s + '\n') 34 | 35 | f.close() 36 | 37 | -------------------------------------------------------------------------------- /e03.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | 3 | import 
requests 4 | from lxml import etree 5 | import re 6 | 7 | base_url = 'https://spiderbuf.cn/web-scraping-practice/scraping-random-pagination' 8 | # https://spiderbuf.cn/e03/5f685274073b 9 | 10 | myheaders = { 11 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.164 Safari/537.36'} 12 | 13 | # 取页数 14 | html = requests.get(base_url, headers=myheaders).text 15 | root = etree.HTML(html) 16 | print(html) 17 | 18 | lis = root.xpath('//ul[@class="pagination"]/li/a/@href') 19 | print(lis) 20 | 21 | i = 1 22 | for item in lis: 23 | print(item) 24 | s = item.replace('/web-scraping-practice/scraping-random-pagination','') 25 | print(base_url + s) 26 | url = base_url + s 27 | # print(url) 28 | html = requests.get(url, headers=myheaders).text 29 | # print(html) 30 | # 31 | f = open('./data/e03/e03_%d.html' % i, 'w', encoding='utf-8') 32 | f.write(html) 33 | f.close() 34 | # 35 | root = etree.HTML(html) 36 | trs = root.xpath('//tr') 37 | 38 | f = open('./data/e03/e03_%d.txt' % i, 'w', encoding='utf-8') 39 | for tr in trs: 40 | tds = tr.xpath('./td') 41 | s = '' 42 | for td in tds: 43 | s = s + str(td.xpath('string(.)')) + '|' 44 | # s = s + str(td.text) + '|' 45 | print(s) 46 | if s != '': 47 | f.write(s + '\n') 48 | 49 | f.close() 50 | i += 1 -------------------------------------------------------------------------------- /e04.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | 3 | import requests 4 | from lxml import etree 5 | import re 6 | 7 | base_url = 'https://spiderbuf.cn/web-scraping-practice/block-ip-proxy' 8 | 9 | myheaders = { 10 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.164 Safari/537.36'} 11 | 12 | proxies = {'http':'47.122.65.254:8080'} 13 | # 取页数 14 | html = requests.get(base_url, headers=myheaders,proxies=proxies).text 15 | root = etree.HTML(html) 16 | # print(html) 17 | lis = root.xpath('//ul[@class="pagination"]/li/a') 18 | pages = [] 19 | for item in lis: 20 | print(item.attrib['href']) 21 | if item.attrib['class'] != 'item trap': 22 | pages.append(item.attrib['href']) 23 | print(pages) 24 | i = 1 25 | for item in pages: 26 | print(item) 27 | s = item.replace('/web-scraping-practice/block-ip-proxy','') 28 | print(base_url + s) 29 | url = base_url + s 30 | # print(url) 31 | html = requests.get(url, headers=myheaders).text 32 | # print(html) 33 | # 34 | f = open('./data/e04/e04_%d.html' % i, 'w', encoding='utf-8') 35 | f.write(html) 36 | f.close() 37 | # 38 | root = etree.HTML(html) 39 | trs = root.xpath('//tr') 40 | 41 | f = open('./data/e04/e04_%d.txt' % i, 'w', encoding='utf-8') 42 | for tr in trs: 43 | tds = tr.xpath('./td') 44 | s = '' 45 | for td in tds: 46 | s = s + str(td.xpath('string(.)')) + '|' 47 | # s = s + str(td.text) + '|' 48 | print(s) 49 | if s != '': 50 | f.write(s + '\n') 51 | 52 | f.close() 53 | i += 1 54 | -------------------------------------------------------------------------------- /h01.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | 3 | import requests 4 | from lxml import etree 5 | 6 | url = 'https://spiderbuf.cn/web-scraping-practice/scraping-css-confuse-offset' 7 | 8 | myheaders = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.164 Safari/537.36', 9 | 'Referer':'https://spiderbuf.cn/list'} 10 | 11 | html = requests.get(url, 
headers=myheaders).text 12 | print(html) 13 | 14 | f = open('./data/h01/h01.html', 'w', encoding='utf-8') 15 | f.write(html) 16 | f.close() 17 | 18 | root = etree.HTML(html) 19 | ls = root.xpath('//div[@class ="container"]/div/div') 20 | # page_text = ls[0].xpath('string(.)') 21 | # print(page_text) 22 | 23 | f = open('./data/h01/h01.txt', 'w', encoding='utf-8') 24 | for item in ls: 25 | hnodes = item.xpath('./h2') 26 | temp = hnodes[0].xpath('string(.)') 27 | s0 = temp[1:2] + temp[0:1] + temp[2:] 28 | print(s0) 29 | 30 | pnodes = item.xpath('./p') 31 | s1 = pnodes[0].text 32 | print(s1) 33 | temp = pnodes[1].xpath('string(.)').replace('企业估值(亿元):','') 34 | s2 = temp[1:2] + temp[0:1] + temp[2:] 35 | print(s2) 36 | s3 = pnodes[2].text 37 | print(s3) 38 | s4 = pnodes[3].text 39 | print(s4) 40 | # 富邦金融控股排名:50企业估值(亿元):2135CEO:蔡明兴行业:金融服务 41 | s = s0 + '|' + s1.replace('排名:','') + '|' + s2.replace('企业估值(亿元):','') + '|' \ 42 | + s3.replace('CEO:','') + '|' + s4.replace('行业:','') + '\n' 43 | print(s) 44 | f.write(s) 45 | 46 | f.close() -------------------------------------------------------------------------------- /h02.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | import os.path 3 | 4 | import requests 5 | from lxml import etree 6 | import time 7 | 8 | base_url = 'https://spiderbuf.cn/web-scraping-practice/scraping-douban-movies-xpath-advanced' 9 | 10 | myheaders = { 11 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.164 Safari/537.36'} 12 | 13 | def getHTML(url,file_name=''): 14 | html = requests.get(url, headers=myheaders).text 15 | if file_name != '': 16 | with open(file_name, 'w', encoding='utf-8') as f: 17 | f.write(html) 18 | return html 19 | 20 | 21 | def downloadImage(url, path=''): 22 | img_data = requests.get(url, headers=myheaders).content 23 | # get image file name 24 | file_name = url.split('/').pop() 25 | 26 | with open(os.path.join(path, file_name), 'wb') as img: 27 | img.write(img_data) 28 | 29 | 30 | def parseHTML(html): 31 | # parse html source code here 32 | root = etree.HTML(html) 33 | divs = root.xpath('/html/body/div/div[@style="margin-top: 10px;"]') 34 | i = 1 35 | for div in divs: 36 | #
37 | # (此处原为一段注释掉的示例 HTML 片段,标签在导出时丢失,仅保留其文本内容)
38 | # 肖申克的救赎 The Shawshank Redemption
39 | #
40 | # 肖申克的救赎 The Shawshank Redemption
41 | #
42 | #
43 | # 豆瓣电影评分: 9.7
44 | # 导演 : 弗兰克·德拉邦特
45 | # 编剧 : 弗兰克·德拉邦特 / 弗兰克·德拉邦特/ 斯蒂芬·金
46 | # 主演 : 蒂姆·罗宾斯 / 蒂姆·罗宾斯/ 摩根·弗里曼/ 鲍勃·冈顿/ 威廉姆·赛德勒/ 克兰西·布朗/ 吉尔·贝罗斯/ 马克·罗斯顿/ 詹姆斯·惠特摩/ 杰弗里·德曼/ 拉里·布兰登伯格/ 尼尔·吉恩托利/ 布赖恩·利比/ 大卫·普罗瓦尔/ 约瑟夫·劳格诺/ 祖德·塞克利拉/ 保罗·麦克兰尼/ 芮妮·布莱恩/ 阿方索·弗里曼/ V·J·福斯特/ 弗兰克·梅德拉诺/ 马克·迈尔斯/ 尼尔·萨默斯/ 耐德·巴拉米/ 布赖恩·戴拉特/ 唐·麦克马纳斯
47 | # 类型: 剧情 / 剧情/ 犯罪
48 | # 制片国家/ 地区: 美国
49 | # 语言: 英语
50 | # 上映日期: 1994 - 09 - 10(多伦多电影节) / 1994 - 09 - 10(多伦多电影节)/ 1994 - 10 - 14(美国)
51 | # 片长: 142分钟
52 | # 又名: 月黑高飞(港) / 月黑高飞(港)/ 刺激1995(台)/ 地狱诺言/ 铁窗岁月/ 消香克的救赎
53 | # IMDb: tt0111161
54 | #
55 | #
56 | #
57 | if i % 2 == 0: 58 | # 简介 /html/body/div[2] /div[3]/div 59 | summarys = div.xpath('./div/text()') 60 | summary = '' 61 | if len(summarys) > 0: 62 | summary = summarys[0].strip() 63 | print(summary) 64 | else: 65 | titles = div.xpath('./div/h2') 66 | title = '' 67 | if len(titles) > 0: 68 | title = titles[0].text 69 | print(title) 70 | #haibao 71 | img_urls = div.xpath('./div/div/img/@src') 72 | img_url = '' 73 | if len(img_urls) > 0: 74 | img_url = 'https://spiderbuf.cn/' + img_urls[0] 75 | print(img_url) 76 | downloadImage(img_url, './data/h02') 77 | # 评分 /html/body/div[2]/div[2] /div/div[2]/span[1] 78 | ratings = div.xpath('./div/div/span[contains(text(),"豆瓣电影评分:")]/following::text()[1]') 79 | rating = '' 80 | if len(ratings) > 0: 81 | rating = ratings[0].strip() 82 | print(rating) 83 | # 导演 /html/body/div[2]/div[2] /div/div[2]/span[2]/span[2] 84 | directors = div.xpath('./div/div/span/span[contains(text(),"导演")]/following::text()') 85 | director = '' 86 | if len(directors) > 1: 87 | director = directors[1].strip() 88 | if len(directors) > 3: 89 | director += '/' + directors[2].strip() 90 | # for item in directors: 91 | # if director != '': 92 | # director += ' / ' 93 | # director += item.text 94 | print(director) 95 | # 编剧 /html/body/div[2]/div[2] /div/div[2]/span[3]/span[2] 96 | scriptwriters = div.xpath('./div/div/span/span[contains(text(),"编剧")]/following::text()') 97 | scriptwriter = '' 98 | if len(scriptwriters) > 0: 99 | scriptwriter = scriptwriters[1].strip() 100 | 101 | if len(scriptwriters) > 3: 102 | scriptwriter += scriptwriters[2].strip() 103 | print(scriptwriter) 104 | # 主演 105 | performers = div.xpath('./div/div/span/span[contains(text(),"主演")]/following::text()') 106 | performer = '' 107 | if len(performers) > 0: 108 | performer = performers[1].strip() 109 | 110 | if len(performers) > 3: 111 | performer += performers[2].strip() 112 | print(performer) 113 | # 类型 114 | genres = div.xpath('./div/div/span/span[contains(text(),"类型:")]/following::text()') 115 | genre = '' 116 | if len(genres) > 0: 117 | genre = genres[0].strip() 118 | 119 | if len(performers) > 1: 120 | genre += genres[1].strip() 121 | print(genre) 122 | # 制片国家/地区 123 | areas = div.xpath('./div/div/span/span[contains(text(),"制片国家/地区:")]/following::text()') 124 | area = '' 125 | if len(areas) > 0: 126 | area = areas[0].strip() 127 | print(area) 128 | # 语言 129 | langs = div.xpath('./div/div/span/span[contains(text(),"语言:")]/following::text()') 130 | lang = '' 131 | if len(langs) > 0: 132 | lang = langs[0].strip().replace('\n', '') 133 | if len(langs) > 1: 134 | lang += langs[1].strip().replace('\n', '') 135 | print(lang) 136 | # 又名 137 | aliases = div.xpath('./div/div/span/span[contains(text(),"又名:")]/following::text()') 138 | alias = '' 139 | if len(aliases) > 0: 140 | alias = aliases[0].strip().replace('\n', '').replace('|', '') 141 | if len(aliases) > 1: 142 | alias += aliases[1].strip().replace('\n', '').replace('|', '') 143 | print(alias) 144 | # IMDb 145 | imdbs = div.xpath('./div/div/span[contains(text(),"IMDb:")]/following::text()') 146 | imdb = '' 147 | if len(imdbs) > 0: 148 | imdb = imdbs[0].strip().replace('\n', '') 149 | print(imdb) 150 | # 上映日期 151 | release_dates = div.xpath('./div/div/span/span[contains(text(),"上映日期:")]/following::text()') 152 | release_date = '' 153 | if len(release_dates) > 0: 154 | release_date = release_dates[0].strip().replace('\n', '') 155 | if len(release_dates) > 1: 156 | release_date += release_dates[1].strip().replace('\n', '') 157 | print(release_date) 158 | # 片长 
159 | runtimes = div.xpath('./div/div/span/span[contains(text(),"片长:")]/following::text()') 160 | runtime = '' 161 | if len(runtimes) > 0: 162 | runtime = runtimes[0].strip().replace('\n', '') 163 | if len(runtimes) > 1: 164 | runtime += runtimes[1].strip().replace('\n', '') 165 | print(runtime) 166 | i += 1 167 | 168 | 169 | if __name__ == '__main__': 170 | html = getHTML(base_url, './data/h02/h02.html') 171 | # with open('./data/h02/h02.html', 'r', encoding='utf-8') as f: 172 | # html = f.read() 173 | parseHTML(html) 174 | -------------------------------------------------------------------------------- /h03.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | import os.path 3 | 4 | import requests 5 | from lxml import etree 6 | import time 7 | 8 | base_url = 'https://spiderbuf.cn/web-scraping-practice/scraping-scroll-load' 9 | 10 | myheaders = { 11 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.164 Safari/537.36'} 12 | 13 | def getHTML(url,file_name=''): 14 | html = requests.get(url, headers=myheaders).text 15 | if file_name != '': 16 | with open(file_name, 'w', encoding='utf-8') as f: 17 | f.write(html) 18 | return html 19 | 20 | 21 | def downloadImage(url, path=''): 22 | img_data = requests.get(url, headers=myheaders).content 23 | # get image file name 24 | file_name = url.split('/').pop() 25 | 26 | with open(os.path.join(path, file_name), 'wb') as img: 27 | img.write(img_data) 28 | 29 | 30 | def parseHTML(html): 31 | # parse html source code here 32 | root = etree.HTML(html) 33 | divs = root.xpath('/html/body/div/div/div[@style="margin-top: 10px;"]') 34 | i = 1 35 | for div in divs: 36 | if i % 2 == 0: 37 | # 简介 /html/body/div[2] /div[3]/div 38 | summarys = div.xpath('./div/text()') 39 | summary = '' 40 | if len(summarys) > 0: 41 | summary = summarys[0].strip() 42 | print(summary) 43 | else: 44 | titles = div.xpath('./div/h2') 45 | title = '' 46 | if len(titles) > 0: 47 | title = titles[0].text 48 | print(title) 49 | #haibao 50 | img_urls = div.xpath('./div/div/img/@src') 51 | img_url = '' 52 | if len(img_urls) > 0: 53 | img_url = 'https://spiderbuf.cn/' + img_urls[0] 54 | print(img_url) 55 | downloadImage(img_url, './data/h02') 56 | # 评分 /html/body/div[2]/div[2] /div/div[2]/span[1] 57 | ratings = div.xpath('./div/div/span[contains(text(),"豆瓣电影评分:")]/following::text()[1]') 58 | rating = '' 59 | if len(ratings) > 0: 60 | rating = ratings[0].strip() 61 | print(rating) 62 | # 导演 /html/body/div[2]/div[2] /div/div[2]/span[2]/span[2] 63 | directors = div.xpath('./div/div/span/span[contains(text(),"导演")]/following::text()') 64 | director = '' 65 | if len(directors) > 1: 66 | director = directors[1].strip() 67 | if len(directors) > 3: 68 | director += '/' + directors[2].strip() 69 | # for item in directors: 70 | # if director != '': 71 | # director += ' / ' 72 | # director += item.text 73 | print(director) 74 | # 编剧 /html/body/div[2]/div[2] /div/div[2]/span[3]/span[2] 75 | scriptwriters = div.xpath('./div/div/span/span[contains(text(),"编剧")]/following::text()') 76 | scriptwriter = '' 77 | if len(scriptwriters) > 0: 78 | scriptwriter = scriptwriters[1].strip() 79 | 80 | if len(scriptwriters) > 3: 81 | scriptwriter += scriptwriters[2].strip() 82 | print(scriptwriter) 83 | # 主演 84 | performers = div.xpath('./div/div/span/span[contains(text(),"主演")]/following::text()') 85 | performer = '' 86 | if len(performers) > 0: 87 | performer = performers[1].strip() 88 | 89 | if 
len(performers) > 3: 90 | performer += performers[2].strip() 91 | print(performer) 92 | # 类型 93 | genres = div.xpath('./div/div/span/span[contains(text(),"类型:")]/following::text()') 94 | genre = '' 95 | if len(genres) > 0: 96 | genre = genres[0].strip() 97 | 98 | if len(performers) > 1: 99 | genre += genres[1].strip() 100 | print(genre) 101 | # 制片国家/地区 102 | areas = div.xpath('./div/div/span/span[contains(text(),"制片国家/地区:")]/following::text()') 103 | area = '' 104 | if len(areas) > 0: 105 | area = areas[0].strip() 106 | print(area) 107 | # 语言 108 | langs = div.xpath('./div/div/span/span[contains(text(),"语言:")]/following::text()') 109 | lang = '' 110 | if len(langs) > 0: 111 | lang = langs[0].strip().replace('\n', '') 112 | if len(langs) > 1: 113 | lang += langs[1].strip().replace('\n', '') 114 | print(lang) 115 | # 又名 116 | aliases = div.xpath('./div/div/span/span[contains(text(),"又名:")]/following::text()') 117 | alias = '' 118 | if len(aliases) > 0: 119 | alias = aliases[0].strip().replace('\n', '').replace('|', '') 120 | if len(aliases) > 1: 121 | alias += aliases[1].strip().replace('\n', '').replace('|', '') 122 | print(alias) 123 | # IMDb 124 | imdbs = div.xpath('./div/div/span[contains(text(),"IMDb:")]/following::text()') 125 | imdb = '' 126 | if len(imdbs) > 0: 127 | imdb = imdbs[0].strip().replace('\n', '') 128 | print(imdb) 129 | # 上映日期 130 | release_dates = div.xpath('./div/div/span/span[contains(text(),"上映日期:")]/following::text()') 131 | release_date = '' 132 | if len(release_dates) > 0: 133 | release_date = release_dates[0].strip().replace('\n', '') 134 | if len(release_dates) > 1: 135 | release_date += release_dates[1].strip().replace('\n', '') 136 | print(release_date) 137 | # 片长 138 | runtimes = div.xpath('./div/div/span/span[contains(text(),"片长:")]/following::text()') 139 | runtime = '' 140 | if len(runtimes) > 0: 141 | runtime = runtimes[0].strip().replace('\n', '') 142 | if len(runtimes) > 1: 143 | runtime += runtimes[1].strip().replace('\n', '') 144 | print(runtime) 145 | i += 1 146 | 147 | 148 | if __name__ == '__main__': 149 | 150 | html = getHTML(base_url, './data/h03/h03.html') 151 | # get next page uri 152 | uri = '' 153 | root = etree.HTML(html) 154 | divs = root.xpath('//div[@id="sLaOuol2SM0iFj4d"]/text()') 155 | if len(divs) > 0: 156 | uri = divs[0] 157 | 158 | i = 1 159 | while (uri != '') & (i < 10): 160 | print(uri) 161 | html = getHTML(base_url + '/' + uri, f'./data/h03/h03_{uri}.html') 162 | uri = '' # *** 163 | root = etree.HTML(html) 164 | divs = root.xpath('//div[@id="sLaOuol2SM0iFj4d"]/text()') 165 | if len(divs) > 0: 166 | uri = divs[0] 167 | i += 1 168 | 169 | # parseHTML(html) 170 | -------------------------------------------------------------------------------- /h04.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | 3 | import requests 4 | from lxml import etree 5 | from selenium import webdriver 6 | import time 7 | 8 | 9 | base_url = 'https://spiderbuf.cn/web-scraping-practice/javascript-confuse-encrypt-reverse' 10 | 11 | myheaders = { 12 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.164 Safari/537.36'} 13 | 14 | def getHTML(url,file_name=''): 15 | client = webdriver.Chrome() 16 | client.get(url) 17 | time.sleep(3) 18 | html = client.page_source 19 | print(html) 20 | client.quit() 21 | 22 | if file_name != '': 23 | with open(file_name, 'w', encoding='utf-8') as f: 24 | f.write(html) 25 | return html 26 | 27 | 28 | def 
parseHTML(html,file_name=''): 29 | root = etree.HTML(html) 30 | trs = root.xpath('//tr') 31 | 32 | if file_name != '': 33 | f = open(file_name, 'w', encoding='utf-8') 34 | 35 | for tr in trs: 36 | tds = tr.xpath('./td') 37 | s = '' 38 | for td in tds: 39 | s = s + str(td.xpath('string(.)')) + '|' 40 | # s = s + str(td.text) + '|' 41 | print(s) 42 | if (s != '') & (file_name != ''): 43 | f.write(s + '\n') 44 | f.close() 45 | 46 | 47 | if __name__ == '__main__': 48 | # example: 1 49 | html = getHTML(base_url, './data/h04/h04.html') 50 | # parseHTML(html, './data/h04/h04.txt') 51 | 52 | # example: 2 53 | # url = 'https://spiderbuf.cn/static/js/h04/udSL29.js' 54 | # js_code = requests.get(url, headers=myheaders).text 55 | # # js_code = js_code.encode('utf-8').decode('unicode-escape') 56 | # a = js_code.index('=') + 1 57 | # b = js_code.index(';') 58 | # js_code = js_code[a:b] 59 | 60 | # # 将字符串转换为字典 61 | # dict_data = eval(js_code) 62 | # print(dict_data) 63 | # for item in dict_data: 64 | # print(item) 65 | -------------------------------------------------------------------------------- /h05.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | import base64 3 | import hashlib 4 | import time 5 | 6 | import requests 7 | from lxml import etree 8 | from selenium import webdriver 9 | 10 | 11 | base_url = 'https://spiderbuf.cn/web-scraping-practice/javascript-reverse-timestamp' 12 | 13 | myheaders = { 14 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.164 Safari/537.36'} 15 | 16 | def getHTML(url,file_name=''): 17 | client = webdriver.Chrome() 18 | client.get(url) 19 | time.sleep(5) 20 | html = client.page_source 21 | print(html) 22 | client.quit() 23 | 24 | if file_name != '': 25 | with open(file_name, 'w', encoding='utf-8') as f: 26 | f.write(html) 27 | return html 28 | 29 | 30 | def parseHTML(html,file_name=''): 31 | root = etree.HTML(html) 32 | trs = root.xpath('//tr') 33 | 34 | if file_name != '': 35 | f = open(file_name, 'w', encoding='utf-8') 36 | 37 | for tr in trs: 38 | tds = tr.xpath('./td') 39 | s = '' 40 | for td in tds: 41 | s = s + str(td.xpath('string(.)')) + '|' 42 | # s = s + str(td.text) + '|' 43 | print(s) 44 | if (s != '') & (file_name != ''): 45 | f.write(s + '\n') 46 | f.close() 47 | 48 | 49 | if __name__ == '__main__': 50 | # example: 1 51 | html = getHTML(base_url, './data/h05/h05.html') 52 | # parseHTML(html, './data/h04/h04.txt') 53 | 54 | # example: 2 55 | # url = 'https://spiderbuf.cn/web-scraping-practice/javascript-reverse-timestamp/api/' 56 | # timestamp = str(int(time.time())) 57 | # md5_hash = hashlib.md5() 58 | # md5_hash.update(timestamp.encode('utf-8')) 59 | # md5 = md5_hash.hexdigest() 60 | # s = ('%s,%s' % (timestamp, md5)) 61 | # print(s) 62 | # payload = str(base64.b64encode(s.encode('utf-8')), 'utf-8') 63 | # print(payload) 64 | # html = requests.get(url + payload, headers=myheaders).text 65 | # print(html) 66 | -------------------------------------------------------------------------------- /h06.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | import base64 3 | import hashlib 4 | import time 5 | 6 | import requests 7 | from lxml import etree 8 | from selenium import webdriver 9 | 10 | 11 | base_url = 'https://spiderbuf.cn/web-scraping-practice/selenium-fingerprint-anti-scraper' 12 | 13 | myheaders = { 14 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) 
AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.164 Safari/537.36'} 15 | 16 | def getHTML(url,file_name=''): 17 | # client = webdriver.Chrome() 18 | # client.get(url) 19 | # html = client.page_source 20 | # print(html) 21 | # client.quit() 22 | options = webdriver.ChromeOptions() 23 | options.add_argument('disable-infobars') 24 | # options.add_argument('headless') 25 | options.set_capability('goog:loggingPrefs', {'browser': 'ALL'}) # 输出浏览器console 日志:console.log 26 | 27 | options.add_argument('--disable-blink-features=AutomationControlled') # 改变navigator.webdriver 属性值 28 | 29 | client = webdriver.Chrome(options=options) 30 | client.get(url) 31 | time.sleep(5) 32 | print(client.page_source) 33 | html = client.page_source 34 | 35 | # client.quit() 36 | 37 | if file_name != '': 38 | with open(file_name, 'w', encoding='utf-8') as f: 39 | f.write(html) 40 | return html 41 | 42 | 43 | def parseHTML(html,file_name=''): 44 | root = etree.HTML(html) 45 | trs = root.xpath('//tr') 46 | 47 | if file_name != '': 48 | f = open(file_name, 'w', encoding='utf-8') 49 | 50 | for tr in trs: 51 | tds = tr.xpath('./td') 52 | s = '' 53 | for td in tds: 54 | s = s + str(td.xpath('string(.)')) + '|' 55 | # s = s + str(td.text) + '|' 56 | print(s) 57 | if (s != '') & (file_name != ''): 58 | f.write(s + '\n') 59 | f.close() 60 | 61 | 62 | if __name__ == '__main__': 63 | # example: 1 64 | html = getHTML(base_url, './data/h06/h06.html') 65 | # print(html) 66 | # parseHTML(html, './data/h06/h06.txt') 67 | 68 | # example: 2 69 | # url = 'https://spiderbuf.cn/web-scraping-practice/selenium-fingerprint-anti-scraper/api/' 70 | # timestamp = str(int(time.time())) 71 | # md5_hash = hashlib.md5() 72 | # md5_hash.update(timestamp.encode('utf-8')) 73 | # md5 = md5_hash.hexdigest() 74 | # s = ('%s,%s' % (timestamp, md5)) 75 | # print(s) 76 | # payload = str(base64.b64encode(s.encode('utf-8')), 'utf-8') 77 | # print(payload) 78 | # html = requests.get(url + payload, headers=myheaders).text 79 | # print(html) 80 | # # 将字符串转换为字典 81 | # dict_data = eval(html) 82 | # print(dict_data) 83 | # for item in dict_data: 84 | # print(item) 85 | -------------------------------------------------------------------------------- /n01.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | 3 | import requests 4 | from lxml import etree 5 | 6 | url = 'https://spiderbuf.cn/web-scraping-practice/user-agent-referrer' 7 | 8 | myheaders = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.164 Safari/537.36', 9 | 'Referer':'https://spiderbuf.cn/list'} 10 | 11 | html = requests.get(url, headers=myheaders).text 12 | print(html) 13 | 14 | f = open('./data/n01/n01.html', 'w', encoding='utf-8') 15 | f.write(html) 16 | f.close() 17 | 18 | root = etree.HTML(html) 19 | ls = root.xpath('//div[@class ="container"]/div/div') 20 | # page_text = ls[0].xpath('string(.)') 21 | # print(page_text) 22 | 23 | f = open('./data/n01/n01.txt', 'w', encoding='utf-8') 24 | for item in ls: 25 | hnodes = item.xpath('./h2') 26 | s0 = hnodes[0].text 27 | 28 | pnodes = item.xpath('./p') 29 | s1 = pnodes[0].text 30 | s2 = pnodes[1].text 31 | s3 = pnodes[2].text 32 | s4 = pnodes[3].text 33 | # 富邦金融控股排名:50企业估值(亿元):2135CEO:蔡明兴行业:金融服务 34 | s = s0 + '|' + s1.replace('排名:','') + '|' + s2.replace('企业估值(亿元):','') + '|' \ 35 | + s3.replace('CEO:','') + '|' + s4.replace('行业:','') + '\n' 36 | print(s) 37 | f.write(s) 38 | # s = '' 39 | # for td in tds: 40 | # s = s + 
str(td.xpath('string(.)')) + '|' 41 | # # s = s + str(td.text) + '|' 42 | # print(s) 43 | # if s != '': 44 | # f.write(s + '\n') 45 | 46 | f.close() -------------------------------------------------------------------------------- /n02.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | 3 | import requests 4 | from lxml import etree 5 | 6 | import base64 7 | 8 | url = 'https://spiderbuf.cn/web-scraping-practice/scraping-images-base64' 9 | 10 | myheaders = { 11 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.164 Safari/537.36'} 12 | 13 | 14 | html = requests.get(url, headers=myheaders).text 15 | print(html) 16 | 17 | f = open('./data/n02/n02.html', 'w', encoding='utf-8') 18 | f.write(html) 19 | f.close() 20 | 21 | root = etree.HTML(html) 22 | imgs = root.xpath('//img/@src') 23 | print(imgs) 24 | for item in imgs: 25 | print(item) 26 | # item 是获取到的base64字符串 27 | item = item.replace('data:image/png;base64,','') 28 | str_bytes = item.encode('raw_unicode_escape') # str 转 bytes 29 | decoded = base64.b64decode(str_bytes) 30 | 31 | img = open('./data/n02/n02.png', 'wb') 32 | img.write(decoded) 33 | img.close() 34 | 35 | -------------------------------------------------------------------------------- /n03.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | 3 | import requests 4 | from lxml import etree 5 | import time 6 | 7 | base_url = 'https://spiderbuf.cn/web-scraping-practice/scraper-bypass-request-limit/%d' 8 | 9 | myheaders = { 10 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.164 Safari/537.36'} 11 | 12 | max_no = 20 13 | # exit() 14 | 15 | for i in range(1, max_no + 1): 16 | print(i) 17 | url = base_url % i 18 | print(url) 19 | html = requests.get(url, headers=myheaders).text 20 | print(html) 21 | 22 | f = open('./data/n03/n03_%d.html' % i, 'w', encoding='utf-8') 23 | f.write(html) 24 | f.close() 25 | 26 | root = etree.HTML(html) 27 | trs = root.xpath('//tr') 28 | 29 | f = open('./data/n03/datan03_%d.txt' % i, 'w', encoding='utf-8') 30 | for tr in trs: 31 | tds = tr.xpath('./td') 32 | s = '' 33 | for td in tds: 34 | s = s + str(td.xpath('string(.)')) + '|' 35 | # s = s + str(td.text) + '|' 36 | print(s) 37 | if s != '': 38 | f.write(s + '\n') 39 | time.sleep(2) 40 | f.close() 41 | -------------------------------------------------------------------------------- /n04.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | import os.path 3 | 4 | import requests 5 | from lxml import etree 6 | import time 7 | 8 | base_url = 'https://spiderbuf.cn/web-scraping-practice/css-pseudo-elements' 9 | 10 | myheaders = { 11 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.164 Safari/537.36'} 12 | 13 | def getHTML(url,file_name=''): 14 | html = requests.get(url, headers=myheaders).text 15 | if file_name != '': 16 | with open(file_name, 'w', encoding='utf-8') as f: 17 | f.write(html) 18 | return html 19 | 20 | 21 | 22 | def parseHTML(html): 23 | class_map = {'abcdef::before':'7', 24 | 'abcdef::after':'5', 25 | 'ghijkl::before':'8', 26 | 'ghijkl::after':'9', 27 | 'mnopqr::before':'9', 28 | 'mnopqr::after':'1', 29 | 'uvwxyz::before':'1', 30 | 'uvwxyz::after':'4', 31 | 'yzabcd::before':'2', 32 | 'yzabcd::after':'6', 33 | 'efghij::before':'3', 34 | 
'efghij::after':'2', 35 | 'klmnop::before':'5', 36 | 'klmnop::after':'7', 37 | 'qrstuv::before':'4', 38 | 'qrstuv::after':'3', 39 | 'wxyzab::before':'6', 40 | 'wxyzab::after':'0', 41 | 'cdefgh::before':'0', 42 | 'cdefgh::after':'8', 43 | 'hijklm::after':'6', 44 | 'opqrst::after':'0', 45 | 'uvwxab::after':'3', 46 | 'cdijkl::after':'8', 47 | 'pqrmno::after':'1', 48 | 'stuvwx::after':'4', 49 | 'pkenmc::after':'7', 50 | 'tcwdsk::after':'9', 51 | 'mkrtyu::after':'5', 52 | 'umdrtk::after':'2'} 53 | # parse html source code here 54 | root = etree.HTML(html) 55 | divs = root.xpath('/html/body/div/div[@style="margin-top: 10px;"]') 56 | 57 | for div in divs: 58 | titles = div.xpath('./div/h2') 59 | title = '' 60 | if len(titles) > 0: 61 | title = titles[0].text 62 | print(title) 63 | # 评分 64 | ranking_spans = div.xpath('./div/div[2]/span[@class]') 65 | 66 | if len(ranking_spans) > 0: 67 | span = ranking_spans[0] 68 | attr_class = span.attrib["class"] if "class" in span.attrib else "" 69 | # print(f"{span} - {attr_class}") 70 | # print(span.text) 71 | 72 | classes = attr_class.split(" ") 73 | if len(classes) > 0: 74 | s1 = class_map[classes[0] + '::before'] 75 | s2 = class_map[classes[1] + '::after'] 76 | print(f'{s1}.{s2}') 77 | 78 | 79 | if __name__ == '__main__': 80 | html = getHTML(base_url, './data/n04/n04.html') 81 | parseHTML(html) 82 | -------------------------------------------------------------------------------- /n05.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | import os.path 3 | 4 | import requests 5 | from lxml import etree 6 | import time 7 | 8 | base_url = 'https://spiderbuf.cn/web-scraping-practice/css-sprites' 9 | 10 | myheaders = { 11 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.164 Safari/537.36'} 12 | 13 | def getHTML(url,file_name=''): 14 | html_bytes = requests.get(url, headers=myheaders).content 15 | html = html_bytes.decode() 16 | if file_name != '': 17 | with open(file_name, 'w', encoding='utf-8') as f: 18 | f.write(html) 19 | return html 20 | 21 | 22 | 23 | def parseHTML(html): 24 | class_map = {'sprite abcdef':'0', 25 | 'sprite ghijkl':'1', 26 | 'sprite mnopqr':'2', 27 | 'sprite uvwxyz':'3', 28 | 'sprite yzabcd':'4', 29 | 'sprite efghij':'5', 30 | 'sprite klmnop':'6', 31 | 'sprite qrstuv':'7', 32 | 'sprite wxyzab':'8', 33 | 'sprite cdefgh':'9'} 34 | # parse html source code here 35 | root = etree.HTML(html) 36 | divs = root.xpath('//div[@style="margin-bottom: 30px;"]') 37 | 38 | for div in divs: 39 | titles = div.xpath('./h2') 40 | title = '' 41 | if len(titles) > 0: 42 | title = titles[0].text 43 | print(title) 44 | 45 | amount_spans = div.xpath('./p/span[@class]') 46 | amount_str = '' 47 | for span in amount_spans: 48 | attr_class = span.attrib["class"] if "class" in span.attrib else "" 49 | # print(f"{span} - {attr_class}") 50 | # print(span.text) 51 | amount_str += class_map[attr_class] 52 | print(amount_str) 53 | 54 | 55 | 56 | if __name__ == '__main__': 57 | html = getHTML(base_url, './data/n05/n05.html') 58 | parseHTML(html) 59 | -------------------------------------------------------------------------------- /n06.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | import os.path 3 | 4 | import requests 5 | from lxml import etree 6 | import time 7 | 8 | base_url = 'https://spiderbuf.cn/web-scraping-practice/scraping-form-rpa' 9 | 10 | myheaders = { 11 | 'User-Agent': 'Mozilla/5.0 
(Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.164 Safari/537.36'} 12 | 13 | def getHTML(url,file_name=''): 14 | html_bytes = requests.get(url, headers=myheaders).content 15 | html = html_bytes.decode() 16 | if file_name != '': 17 | with open(file_name, 'w', encoding='utf-8') as f: 18 | f.write(html) 19 | return html 20 | 21 | 22 | 23 | def parseHTML(html): 24 | # parse html source code here 25 | root = etree.HTML(html) 26 | inputs = root.xpath('//input') 27 | 28 | for input in inputs: 29 | attr_name = input.attrib['name'] if 'name' in input.attrib else '' 30 | input_value = input.attrib['value'] if 'value' in input.attrib else '' 31 | if attr_name == 'username': 32 | print(f'用户名:{input_value}') 33 | 34 | if attr_name == 'password': 35 | print(f'密码:{input_value}') 36 | 37 | if attr_name == 'email': 38 | print(f'邮箱:{input_value}') 39 | 40 | if attr_name == 'website': 41 | print(f'网站:{input_value}') 42 | 43 | if attr_name == 'date': 44 | print(f'生日:{input_value}') 45 | 46 | if attr_name == 'time': 47 | print(f'时间:{input_value}') 48 | 49 | if attr_name == 'number': 50 | print(f'数量:{input_value}') 51 | 52 | if attr_name == 'range': 53 | print(f'滑块:{input_value}') 54 | 55 | if attr_name == 'color': 56 | print(f'颜色:{input_value}') 57 | 58 | if attr_name == 'search': 59 | print(f'搜索:{input_value}') 60 | 61 | if attr_name == 'gender': 62 | temp = input.attrib['checked'] if 'checked' in input.attrib else '' 63 | if temp != '': 64 | print(f'性别:{input_value}') 65 | 66 | if attr_name == 'interest': 67 | temp = input.attrib['checked'] if 'checked' in input.attrib else '' 68 | if temp != '': 69 | print(f'开发语言:{input_value}') 70 | 71 | options = root.xpath('//select[@name="country"]/option') 72 | for option in options: 73 | attr_name = option.attrib['selected'] if 'selected' in option.attrib else '' 74 | option_value = option.attrib['value'] if 'value' in option.attrib else '' 75 | if attr_name != '': 76 | print(f'人物代表:{option_value}') 77 | 78 | lis = root.xpath('//ul[@class="items"]/li/a') 79 | for li in lis: 80 | attr_name = li.attrib['class'] if 'class' in li.attrib else '' 81 | li_value = li.text 82 | if 'active' in attr_name: 83 | print(f'代表人物出处:{li_value}') 84 | 85 | 86 | if __name__ == '__main__': 87 | html = getHTML(base_url, './data/n06/n06.html') 88 | parseHTML(html) 89 | -------------------------------------------------------------------------------- /n07.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | 3 | import requests 4 | from lxml import etree 5 | 6 | base_url = 'https://spiderbuf.cn/web-scraping-practice/random-css-classname' 7 | 8 | my_headers = { 9 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.164 Safari/537.36'} 10 | 11 | # 取页数 12 | html_bytes = requests.get(base_url, headers=my_headers).content 13 | html = html_bytes.decode() 14 | root = etree.HTML(html) 15 | with open('./data/n07/n07.html', 'w', encoding='utf-8') as f: 16 | f.write(html) 17 | # print(html) 18 | divs = root.xpath('/html/body/main/div[2]/div') 19 | with open('./data/n07/n07.txt','w',encoding='utf-8') as f: 20 | for div in divs: 21 | print(div.text) 22 | if div.text: 23 | f.write(f'{div.text}\n') -------------------------------------------------------------------------------- /python_begin.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | 3 | import requests 4 | 5 | print('中文') 6 | 7 | a = 2 8 | b 
= '张三' 9 | c = 'ddddddd' 10 | print(a,b,c) 11 | 12 | d = 2 13 | print(a - d) 14 | 15 | if a == 1: 16 | print('等于1') 17 | elif a == 2: 18 | print('等于2') 19 | else: 20 | print('不等于') 21 | 22 | # for i in range(0, 10): 23 | # print(i) 24 | 25 | while a < 10: 26 | a += 1 27 | print(a) 28 | 29 | print("中文") 30 | 31 | lst = ['张三', '李四', '王五'] 32 | 33 | dict = {'张三':'a2', '李四':'b3'} 34 | 35 | print(dict['张三']) 36 | 37 | for item in dict.keys(): 38 | print(dict[item]) 39 | 40 | # f = open('abc.txt', 'w', encoding='utf-8') 41 | # f.write('这是写入文件的内容') 42 | # f.close() 43 | f = open('abc.txt', 'r', encoding='utf-8') 44 | s = f.read() 45 | f.close() 46 | print(s) 47 | 48 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | apturl==0.5.2 2 | attrs==23.1.0 3 | blinker==1.4 4 | Brlapi==0.7.0 5 | certifi==2019.11.28 6 | chardet==3.0.4 7 | Click==7.0 8 | colorama==0.4.3 9 | command-not-found==0.3 10 | cryptography==2.8 11 | cupshelpers==1.0 12 | dbus-python==1.2.16 13 | defer==1.0.6 14 | distro==1.4.0 15 | distro-info===0.23ubuntu1 16 | entrypoints==0.3 17 | exceptiongroup==1.1.3 18 | h11==0.14.0 19 | httplib2==0.14.0 20 | idna==2.8 21 | keyring==18.0.1 22 | language-selector==0.1 23 | launchpadlib==1.10.13 24 | lazr.restfulclient==0.14.2 25 | lazr.uri==1.0.3 26 | louis==3.12.0 27 | lxml==4.6.3 28 | macaroonbakery==1.3.1 29 | netifaces==0.10.4 30 | oauthlib==3.1.0 31 | olefile==0.46 32 | outcome==1.3.0.post0 33 | pexpect==4.6.0 34 | Pillow==7.0.0 35 | protobuf==3.6.1 36 | pycairo==1.16.2 37 | pycups==1.9.73 38 | PyGObject==3.36.0 39 | PyJWT==1.7.1 40 | pymacaroons==0.13.0 41 | PyNaCl==1.3.0 42 | pyRFC3339==1.1 43 | PySocks==1.7.1 44 | python-apt==2.0.1+ubuntu0.20.4.1 45 | python-dateutil==2.7.3 46 | python-debian==0.1.36+ubuntu1.1 47 | pytz==2019.3 48 | pyxdg==0.26 49 | PyYAML==5.3.1 50 | reportlab==3.5.34 51 | requests==2.22.0 52 | requests-unixsocket==0.2.0 53 | SecretStorage==2.3.1 54 | selenium==4.15.2 55 | simplejson==3.16.0 56 | six==1.14.0 57 | sniffio==1.3.0 58 | sortedcontainers==2.4.0 59 | systemd-python==234 60 | trio==0.23.1 61 | trio-websocket==0.11.1 62 | ubuntu-advantage-tools==8001 63 | ubuntu-drivers-common==0.0.0 64 | ufw==0.36 65 | unattended-upgrades==0.1 66 | urllib3==1.25.8 67 | wadllib==1.3.3 68 | wsproto==1.2.0 69 | xkit==0.0.0 70 | -------------------------------------------------------------------------------- /s01.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | 3 | import requests 4 | from lxml import etree 5 | 6 | url = 'https://spiderbuf.cn/web-scraping-practice/requests-lxml-for-scraping-beginner' 7 | 8 | html = requests.get(url).text 9 | 10 | f = open('01.html', 'w', encoding='utf-8') 11 | f.write(html) 12 | f.close() 13 | 14 | root = etree.HTML(html) 15 | trs = root.xpath('//tr') 16 | 17 | f = open('data01.txt', 'w', encoding='utf-8') 18 | for tr in trs: 19 | tds = tr.xpath('./td') 20 | s = '' 21 | for td in tds: 22 | # print(td.text) 23 | s = s + str(td.text) + '|' 24 | print(s) 25 | if s != '': 26 | f.write(s + '\n') 27 | 28 | f.close() 29 | 30 | # print(html) -------------------------------------------------------------------------------- /s02.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | 3 | import requests 4 | from lxml import etree 5 | 6 | url = 'https://spiderbuf.cn/web-scraping-practice/scraper-http-header' 7 | 8 | 
8 | myheaders = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.164 Safari/537.36'}
9 | 
10 | html = requests.get(url, headers=myheaders).text
11 | print(html)
12 | 
13 | f = open('02.html', 'w', encoding='utf-8')
14 | f.write(html)
15 | f.close()
16 | 
17 | root = etree.HTML(html)
18 | trs = root.xpath('//tr')
19 | 
20 | f = open('data02.txt', 'w', encoding='utf-8')
21 | for tr in trs:
22 |     tds = tr.xpath('./td')
23 |     s = ''
24 |     for td in tds:
25 |         # print(td.text)
26 |         s = s + str(td.text) + '|'
27 |     print(s)
28 |     if s != '':
29 |         f.write(s + '\n')
30 | 
31 | f.close()
32 | 
33 | 
--------------------------------------------------------------------------------
/s03.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | 
3 | import requests
4 | from lxml import etree
5 | 
6 | url = 'https://spiderbuf.cn/web-scraping-practice/lxml-xpath-advanced'
7 | 
8 | myheaders = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.164 Safari/537.36'}
9 | 
10 | html = requests.get(url, headers=myheaders).text
11 | print(html)
12 | 
13 | f = open('03.html', 'w', encoding='utf-8')
14 | f.write(html)
15 | f.close()
16 | 
17 | root = etree.HTML(html)
18 | trs = root.xpath('//tr')
19 | 
20 | f = open('data03.txt', 'w', encoding='utf-8')
21 | for tr in trs:
22 |     tds = tr.xpath('./td')
23 |     s = ''
24 |     for td in tds:
25 |         s = s + str(td.xpath('string(.)')) + '|'  # string(.) joins all text inside the cell, child elements included
26 |         # s = s + str(td.text) + '|'
27 |     print(s)
28 |     if s != '':
29 |         f.write(s + '\n')
30 | 
31 | f.close()
32 | 
33 | 
--------------------------------------------------------------------------------
/s04.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | 
3 | import requests
4 | from lxml import etree
5 | import re
6 | 
7 | base_url = 'https://spiderbuf.cn/web-scraping-practice/web-pagination-scraper?pageno=%d'
8 | 
9 | myheaders = {
10 |     'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.164 Safari/537.36'}
11 | 
12 | # get the total number of pages from the pagination bar
13 | html = requests.get(base_url % 1, headers=myheaders).text
14 | root = etree.HTML(html)
15 | 
16 | lis = root.xpath('//ul[@class="pagination"]/li')
17 | page_text = lis[0].xpath('string(.)')
18 | ls = re.findall('[0-9]+', page_text)  # [0-9]+ also handles page counts with more than one digit
19 | 
20 | max_no = int(ls[0])
21 | # exit()
22 | 
23 | for i in range(1, max_no + 1):
24 |     print(i)
25 |     url = base_url % i
26 |     print(url)
27 |     html = requests.get(url, headers=myheaders).text
28 |     print(html)
29 | 
30 |     f = open('04_%d.html' % i, 'w', encoding='utf-8')
31 |     f.write(html)
32 |     f.close()
33 | 
34 |     root = etree.HTML(html)
35 |     trs = root.xpath('//tr')
36 | 
37 |     f = open('data04_%d.txt' % i, 'w', encoding='utf-8')
38 |     for tr in trs:
39 |         tds = tr.xpath('./td')
40 |         s = ''
41 |         for td in tds:
42 |             s = s + str(td.xpath('string(.)')) + '|'
43 |             # s = s + str(td.text) + '|'
44 |         print(s)
45 |         if s != '':
46 |             f.write(s + '\n')
47 | 
48 |     f.close()
49 | 
--------------------------------------------------------------------------------
/s05.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | 
3 | import requests
4 | from lxml import etree
5 | 
6 | url = 'https://spiderbuf.cn/web-scraping-practice/scraping-images-from-web'
7 | 
8 | myheaders = {
9 |     'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.164 Safari/537.36'}
10 | 
11 | 
12 | html = requests.get(url, headers=myheaders).text
13 | print(html)
14 | 
15 | f = open('05.html', 'w', encoding='utf-8')
16 | f.write(html)
17 | f.close()
18 | 
19 | root = etree.HTML(html)
20 | imgs = root.xpath('//img/@src')  # src attribute of every <img> on the page
21 | print(imgs)
22 | for item in imgs:
23 |     img_data = requests.get('https://spiderbuf.cn' + item, headers=myheaders).content  # image paths are relative to the site root
24 |     img = open(str(item).replace('/',''), 'wb')
25 |     img.write(img_data)
26 |     img.close()
27 | #
28 | # f = open('data05.txt', 'w', encoding='utf-8')
29 | # for tr in trs:
30 | #     tds = tr.xpath('./td')
31 | #     s = ''
32 | #     for td in tds:
33 | #         s = s + str(td.xpath('string(.)')) + '|'
34 | #         # s = s + str(td.text) + '|'
35 | #     print(s)
36 | #     if s != '':
37 | #         f.write(s + '\n')
38 | #
39 | # f.close()
40 | 
--------------------------------------------------------------------------------
/s06.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | 
3 | import requests
4 | from lxml import etree
5 | 
6 | url = 'https://spiderbuf.cn/web-scraping-practice/inner'
7 | 
8 | myheaders = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.164 Safari/537.36'}
9 | 
10 | html = requests.get(url, headers=myheaders).text
11 | print(html)
12 | 
13 | f = open('06.html', 'w', encoding='utf-8')
14 | f.write(html)
15 | f.close()
16 | 
17 | root = etree.HTML(html)
18 | trs = root.xpath('//tr')
19 | 
20 | f = open('data06.txt', 'w', encoding='utf-8')
21 | for tr in trs:
22 |     tds = tr.xpath('./td')
23 |     s = ''
24 |     for td in tds:
25 |         s = s + str(td.xpath('string(.)')) + '|'
26 |         # s = s + str(td.text) + '|'
27 |     print(s)
28 |     if s != '':
29 |         f.write(s + '\n')
30 | 
31 | f.close()
32 | 
33 | 
--------------------------------------------------------------------------------
/s07.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | 
3 | import requests
4 | import json
5 | 
6 | url = 'https://spiderbuf.cn/web-scraping-practice/iplist?order=asc'
7 | 
8 | myheaders = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.164 Safari/537.36'}
9 | 
10 | data_json = requests.get(url, headers=myheaders).text  # this endpoint returns JSON, not HTML
11 | print(data_json)
12 | 
13 | f = open('./data/7/07.html', 'w', encoding='utf-8')  # the ./data/7 directory must already exist
14 | f.write(data_json)
15 | f.close()
16 | 
17 | ls = json.loads(data_json)
18 | print(ls)
19 | 
20 | f = open('./data/7/data07.txt', 'w', encoding='utf-8')
21 | for item in ls:
22 |     # print(item)
23 |     s = '%s|%s|%s|%s|%s|%s|%s\n' % (item['ip'], item['mac'], item['manufacturer'], item['name'], item['ports'], item['status'], item['type'])
24 |     f.write(s)
25 | f.close()
--------------------------------------------------------------------------------
/s08.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | 
3 | import requests
4 | from lxml import etree
5 | 
6 | url = 'https://spiderbuf.cn/web-scraping-practice/scraper-via-http-post'
7 | 
8 | myheaders = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.164 Safari/537.36'}
9 | 
10 | payload = {'level':'8'}  # form data sent in the POST body
11 | html = requests.post(url, headers=myheaders, data=payload).text
12 | print(html)
13 | 
14 | f = open('./data/8/08.html', 'w', encoding='utf-8')
15 | f.write(html)
16 | f.close()
17 | 
18 | root = etree.HTML(html)
19 | trs = root.xpath('//tr')
20 | 
21 | f = open('./data/8/data08.txt', 'w', encoding='utf-8')
22 | for tr in trs:
23 |     tds = tr.xpath('./td')
24 |     s = ''
25 |     for td in tds:
26 |         # print(td.text)
27 |         s = s + str(td.text) + '|'
28 |     print(s)
29 |     if s != '':
30 |         f.write(s + '\n')
31 | 
32 | f.close()
33 | 
34 | 
--------------------------------------------------------------------------------
/s4-1.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | 
3 | import requests
4 | from lxml import etree
5 | import re
6 | 
7 | url = 'https://spiderbuf.cn/web-scraping-practice/web-pagination-scraper?pageno=2&pagesize=50'
8 | 
9 | myheaders = {
10 |     'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.164 Safari/537.36'}
11 | 
12 | html = requests.get(url, headers=myheaders).text
13 | print(html)
14 | 
15 | f = open('./data/4-1/04-1.html', 'w', encoding='utf-8')
16 | f.write(html)
17 | f.close()
18 | 
19 | root = etree.HTML(html)
20 | trs = root.xpath('//tr')
21 | 
22 | f = open('./data/4-1/data04-1.txt', 'w', encoding='utf-8')
23 | for tr in trs:
24 |     tds = tr.xpath('./td')
25 |     s = ''
26 |     for td in tds:
27 |         s = s + str(td.xpath('string(.)')) + '|'
28 |         # s = s + str(td.text) + '|'
29 |     print(s)
30 |     if s != '':
31 |         f.write(s + '\n')
32 | 
33 | f.close()
34 | 
--------------------------------------------------------------------------------
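
Note: s01.py–s08.py and s4-1.py repeat the same fetch-and-split-table loop almost verbatim. As a rough sketch (not a file in this repository; the helper name, output file names and the example call below are illustrative only), the shared logic could be collected into one function built on the same requests + lxml stack:

# coding=utf-8
# Hypothetical helper -- not part of this repo; it factors out the
# fetch-and-parse loop that the sNN.py scripts repeat.

import requests
from lxml import etree

UA = ('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
      '(KHTML, like Gecko) Chrome/91.0.4472.164 Safari/537.36')


def dump_table(url, html_path, data_path, headers=None):
    """Fetch url, save the raw HTML, and write '|'-joined table rows to data_path."""
    html = requests.get(url, headers=headers or {'User-Agent': UA}).text

    with open(html_path, 'w', encoding='utf-8') as f:
        f.write(html)

    root = etree.HTML(html)
    with open(data_path, 'w', encoding='utf-8') as f:
        for tr in root.xpath('//tr'):
            # string(.) returns the full text of each cell, child elements included
            s = '|'.join(str(td.xpath('string(.)')) for td in tr.xpath('./td'))
            if s:
                f.write(s + '\n')


if __name__ == '__main__':
    # roughly what s01.py does (output file names are illustrative)
    dump_table('https://spiderbuf.cn/web-scraping-practice/requests-lxml-for-scraping-beginner',
               '01.html', 'data01.txt')

With a helper like this, each practice script would differ only in its URL, headers, and output file names.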