├── LICENSE
├── README.md
├── blob
    ├── crawler.py
    ├── main.py
    └── templates
    │   └── index.html
├── c01.py
├── c02.py
├── c03.py
├── c04.py
├── e01.py
├── e02.py
├── e03.py
├── e04.py
├── h01.py
├── h02.py
├── h03.py
├── h04.py
├── h05.py
├── h06.py
├── n01.py
├── n02.py
├── n03.py
├── n04.py
├── n05.py
├── n06.py
├── n07.py
├── python_begin.py
├── requirements.txt
├── s01.py
├── s02.py
├── s03.py
├── s04.py
├── s05.py
├── s06.py
├── s07.py
├── s08.py
└── s4-1.py


/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2023 hhuayuan
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # Spiderbuf 爬虫练习网站
 2 | Spiderbuf 爬虫实战案例示例代码
 3 | 
 4 | ## 官方网站
 5 | https://spiderbuf.cn
 6 | 
 7 | [爬虫实战练习列表](https://spiderbuf.cn/web-scraping-practices)
 8 | 
 9 | 专注于 Python 爬虫练习的网站.
10 | 
11 | 提供丰富的爬虫教程、爬虫案例解析和爬虫练习题.
12 | 
13 | Python爬虫开发强化练习，在矛与盾的攻防中不断提高技术水平，通过大量的爬虫实战掌握常见的爬虫与反爬套路。
14 | 
15 | 引导式爬虫案例 + 免费爬虫视频教程，以闯关的形式挑战各个爬虫任务，培养爬虫开发的直觉及经验，验证自身爬虫开发与反爬虫实力的时候到了。
16 | 
17 | ## 代码运行环境
18 | Ubuntu 20.04.6 LTS
19 | macOS 15+
20 | 
21 | Python3.8+
22 | 
23 | ## 更新日志
24 | | 编号 | 名称 | 更新日期 |
25 | | ---- | ---- | ---- |
26 | | [C08](https://spiderbuf.cn/web-scraping-practice/scraper-practice-c08 "JS逆向爬虫实战练习（金融数据）") | JS逆向爬虫实战练习（金融数据） | 2025-05-31 |
27 | | [C07](https://spiderbuf.cn/web-scraping-practice/scraper-practice-c07 "JavaScript 逆向爬虫实战案例") | JavaScript 逆向爬虫实战案例 | 2025-05-14 |
28 | | [C06](https://spiderbuf.cn/web-scraping-practice/scraper-practice-c06 "JavaScript 逆向爬虫实战案例") | JavaScript 逆向爬虫实战案例 | 2025-04-15 |
29 | | [C05](https://spiderbuf.cn/web-scraping-practice/scraper-practice-c05 "爬虫实战练习") | 爬虫实战练习 | 2025-02-26 |
30 | | [C04](https://spiderbuf.cn/web-scraping-practice/scraper-practice-c04 "爬虫实战练习") | 爬虫实战练习 | 2025-02-11 |
31 | | [C03](https://spiderbuf.cn/web-scraping-practice/scraper-practice-c03 "爬虫实战练习") | 爬虫实战练习 | 2025-01-15 |
32 | | [C02](https://spiderbuf.cn/web-scraping-practice/scraper-practice-c02 "爬虫实战练习") | 爬虫实战练习 | 2024-12-16 |
33 | | [C01](https://spiderbuf.cn/web-scraping-practice/scraper-practice-c01 "爬虫实战练习") | 爬虫实战练习 | 2024-11-17 |
34 | | [N07](https://spiderbuf.cn/web-scraping-practice/random-css-classname "随机CSS样式类名，无Element ID") | 随机CSS样式类名，无Element ID | 2024-09-08 |
35 | | [E04](https://spiderbuf.cn/web-scraping-practice/block-ip-proxy "被屏蔽IP后使用代理服务器爬取页面") | 被屏蔽IP后使用代理服务器爬取页面 | 2024-07-23 |
36 | | [N06](https://spiderbuf.cn/web-scraping-practice/scraping-form-rpa "网页表单爬取（RPA初阶）") | 网页表单爬取（RPA初阶） | 2024-03-26 |
37 | | [N05](https://spiderbuf.cn/web-scraping-practice/css-sprites "CSS Sprites （雪碧图）反爬") | CSS Sprites （雪碧图）反爬 | 2024-02-18 |
38 | | [N04](https://spiderbuf.cn/web-scraping-practice/css-pseudo-elements "CSS伪元素反爬") | CSS伪元素反爬 | 2024-01-11 |
39 | | [H06](https://spiderbuf.cn/web-scraping-practice/selenium-fingerprint-anti-scraper "初识浏览器指纹：Selenium是如何被反爬的") | 初识浏览器指纹：Selenium是如何被反爬的 | 2023-12-22 |
40 | | [H05](https://spiderbuf.cn/web-scraping-practice/javascript-reverse-timestamp "js逆向破解时间戳反爬") | js逆向破解时间戳反爬 | 2023-11-26 |
41 | | [H04](https://spiderbuf.cn/web-scraping-practice/javascript-confuse-encrypt-reverse "js加密混淆及简单反调试") | js加密混淆及简单反调试 | 2023-11-11 |
42 | | [H03](https://spiderbuf.cn/web-scraping-practice/scraping-scroll-load "网页滚动加载的原理及爬取(JavaScript加密混淆逆向基础)") | 网页滚动加载的原理及爬取(JavaScript加密混淆逆向基础) | 2023-10-20 |
43 | | [H02](https://spiderbuf.cn/web-scraping-practice/scraping-douban-movies-xpath-advanced "高分电影列表复杂页面的解析（仿豆瓣电影）- xpath高级用法") | 高分电影列表复杂页面的解析（仿豆瓣电影）- xpath高级用法 | 2023-10-10 |
44 | | [N03](https://spiderbuf.cn/web-scraping-practice/scraper-bypass-request-limit "限制访问频率不低于1秒") | 限制访问频率不低于1秒 | 2023-07-02 |
45 | | [N02](https://spiderbuf.cn/web-scraping-practice/scraping-images-base64 "使用Base64编码的图片爬取与解码还原") | 使用Base64编码的图片爬取与解码还原 | 2023-06-30 |
46 | | [H01](https://spiderbuf.cn/web-scraping-practice/scraping-css-confuse-offset "CSS样式偏移混淆文本内容的解析与爬取") | CSS样式偏移混淆文本内容的解析与爬取 | 2023-06-25 |
47 | | [N01](https://spiderbuf.cn/web-scraping-practice/user-agent-referrer "User-Agent与Referer校验反爬") | User-Agent与Referer校验反爬 | 2022-11-05 |
48 | | [E03](https://spiderbuf.cn/web-scraping-practice/scraping-random-pagination "无序号翻页") | 无序号翻页 | 2022-11-01 |
49 | | [E02](https://spiderbuf.cn/web-scraping-practice/web-scraping-with-captcha "带验证码的登录爬取") | 带验证码的登录爬取 | 2022-09-17 |
50 | | [E01](https://spiderbuf.cn/web-scraping-practice/scraper-login-username-password "用户名密码登录爬取后台数据") | 用户名密码登录爬取后台数据 | 2022-08-21 |
51 | | [S08](https://spiderbuf.cn/web-scraping-practice/scraper-via-http-post "http post请求的数据爬取") | http post请求的数据爬取 | 2021-06-21 |
52 | | [S07](https://spiderbuf.cn/web-scraping-practice/scraping-ajax-api "ajax动态加载数据的爬取") | ajax动态加载数据的爬取 | 2021-06-21 |
53 | | [S06](https://spiderbuf.cn/web-scraping-practice/scraping-iframe "带iframe的页面源码分析及数据爬取") | 带iframe的页面源码分析及数据爬取 | 2021-06-21 |
54 | | [S05](https://spiderbuf.cn/web-scraping-practice/scraping-images-from-web "网页图片的爬取及本地保存") | 网页图片的爬取及本地保存 | 2021-06-21 |
55 | | [S04](https://spiderbuf.cn/web-scraping-practice/web-pagination-scraper "分页参数分析及翻页爬取") | 分页参数分析及翻页爬取 | 2021-06-21 |
56 | | [S03](https://spiderbuf.cn/web-scraping-practice/lxml-xpath-advanced "lxml库进阶语法及解析练习") | lxml库进阶语法及解析练习 | 2021-06-21 |
57 | | [S02](https://spiderbuf.cn/web-scraping-practice/scraper-http-header "http请求分析及头构造使用") | http请求分析及头构造使用 | 2021-06-21 |
58 | | [S01](https://spiderbuf.cn/web-scraping-practice/requests-lxml-for-scraping-beginner "requests库及lxml库入门") | requests库及lxml库入门 | 2021-06-21 |
59 | 
60 | # 课程
61 | [《深入了解Python爬虫攻防》](https://www.udemy.com/course/python-spiderbuf/?referralCode=77D640F3DB5A310151DB "深入了解Python爬虫攻防")
62 | 
63 | [《Axure RP 9 从入门到精通：打造高保真交互原型》](https://www.udemy.com/course/axure-rp-9/?referralCode=3374A9C2D8B735FC54A1 "Axure RP 9 从入门到精通：打造高保真交互原型")
64 | 


--------------------------------------------------------------------------------
/blob/crawler.py:
--------------------------------------------------------------------------------
 1 | import requests
 2 | 
 3 | def download_video(video_url, output_path):
 4 |     try:
 5 |         # 发送 GET 请求
 6 |         response = requests.get(video_url, stream=True)
 7 |         
 8 |         # 如果请求成功
 9 |         if response.status_code == 200:
10 |             # 打开文件并写入下载的数据
11 |             with open(output_path, 'wb') as f:
12 |                 for chunk in response.iter_content(chunk_size=1024):
13 |                     if chunk:  # 防止下载过程中产生空数据
14 |                         f.write(chunk)
15 |             print(f"Video downloaded successfully: {output_path}")
16 |         else:
17 |             print(f"Failed to retrieve video, HTTP status code: {response.status_code}")
18 |     
19 |     except requests.exceptions.RequestException as e:
20 |         print(f"An error occurred: {e}")
21 | 
22 | if __name__ == "__main__":
23 |     # 视频文件的 URL
24 |     video_url = 'http://localhost:5000/video'  # 替换成目标视频 URL
25 |     
26 |     # 本地保存的路径
27 |     output_path = 'downloaded_video.mp4'
28 |     
29 |     # 下载视频
30 |     download_video(video_url, output_path)
31 | 


--------------------------------------------------------------------------------
/blob/main.py:
--------------------------------------------------------------------------------
 1 | from flask import Flask, Response, send_file, render_template
 2 | import os
 3 | 
 4 | app = Flask(__name__)
 5 | 
 6 | @app.route('/')
 7 | def index():
 8 |     return render_template('index.html')
 9 | 
10 | @app.route('/video')
11 | def stream_video():
12 |     # 请自行找一个mp4视频文件，放在当前目录下（即与main.py同一个目录）
13 |     video_path = os.path.join(os.path.dirname(os.path.abspath(__file__)),'spiderbuf.mp4') # 这里替换为你的视频文件路径
14 |     print(video_path)
15 |     if not os.path.exists(video_path):
16 |         return "Video not found", 404
17 |     
18 |     # 打开视频文件，以二进制流的形式发送
19 |     def generate_video():
20 |         with open(video_path, 'rb') as f:
21 |             while chunk := f.read(1024 * 1024):  # 每次读取 1MB
22 |                 yield chunk
23 | 
24 |     return Response(generate_video(), content_type='video/mp4')
25 | 
26 | 
27 | if __name__ == '__main__':
28 |     app.run(debug=True)
29 | 


--------------------------------------------------------------------------------
/blob/templates/index.html:
--------------------------------------------------------------------------------
 1 | <!DOCTYPE html>
 2 | <html lang="en">
 3 | <head>
 4 |     <meta charset="UTF-8">
 5 |     <meta name="viewport" content="width=device-width, initial-scale=1.0">
 6 |     <title>视频流爬虫 - Spiderbuf</title>
 7 | </head>
 8 | <body>
 9 |     <div style="margin: auto;width: 800px;text-align: center;">
10 |         <h1>视频流爬虫 - Spiderbuf</h1>
11 |         <h2>src="blob:https://spiderbuf.cn....."</h2>
12 |         <video id="video-player" width="100%" controls>
13 |             <source id="video-source" type="video/mp4">
14 |             Your browser does not support the video tag.
15 |         </video>
16 |     </div>
17 | 
18 |     <script>
19 |         const videoElement = document.getElementById('video-player');
20 |         const videoSource = document.getElementById('video-source');
21 | 
22 |         // 使用 fetch 获取视频流
23 |         fetch('/video')
24 |             .then(response => {
25 |                 const videoBlob = response.blob();  // 获取 Blob 对象
26 |                 return videoBlob;
27 |             })
28 |             .then(blob => {
29 |                 // 创建一个 URL 对象，将 Blob 转换成 URL
30 |                 const videoUrl = URL.createObjectURL(blob);
31 |                 videoSource.src = videoUrl;  // 设置视频源
32 |                 videoElement.load();  // 刷新 video 元素
33 |                 videoElement.play();  // 播放视频
34 |             })
35 |             .catch(error => {
36 |                 console.error("Error loading video: ", error);
37 |             });
38 |     </script>
39 | </body>
40 | </html>
41 | 


--------------------------------------------------------------------------------
/c01.py:
--------------------------------------------------------------------------------
 1 | # coding=utf-8
 2 | 
 3 | import requests
 4 | from lxml import etree
 5 | import numpy as np
 6 | 
 7 | base_url = 'https://spiderbuf.cn/web-scraping-practice/scraper-practice-c01/mnist'
 8 | 
 9 | my_headers = {
10 |     'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.164 Safari/537.36',
11 |     'Referer': 'https://spiderbuf.cn/web-scraping-practice/c01',
12 |     'Cookie': '__cgf3t=G0gzgFKDRlLtmZH7NrzqOb1x4pek1xNQk12KKc4g21Y-1731624199;'}
13 | 
14 | 
15 | html_bytes = requests.get(base_url, headers=my_headers).content
16 | html = html_bytes.decode()
17 | root = etree.HTML(html)
18 | with open('./data/c01/c01.html', 'w', encoding='utf-8') as f:
19 |     f.write(html)
20 | # print(html)
21 | 
22 | trs = root.xpath('//tbody/tr')
23 | 
24 | 
25 | pix1_arry = []
26 | for tr in trs:
27 |     tds = tr.xpath('td')
28 |     # 把 pix1 列的值添加到数组
29 |     pix1_arry.append([int(tds[1].text) if len(tds) > 1 else 0])
30 | # 计算 pix1 列的平均值并四舍五入至两位小数
31 | print(round(np.mean(pix1_arry),2))


--------------------------------------------------------------------------------
/c02.py:
--------------------------------------------------------------------------------
 1 | # coding=utf-8
 2 | 
 3 | import requests
 4 | from lxml import etree
 5 | from selenium import webdriver
 6 | from selenium.webdriver import ChromeOptions, ActionChains
 7 | from selenium.webdriver.common.by import By
 8 | import time
 9 | import base64
10 | import json
11 | import numpy as np
12 | 
13 | 
14 | base_url = 'https://spiderbuf.cn/web-scraping-practice/scraper-practice-c02'
15 | 
16 | myheaders = {
17 |     'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.164 Safari/537.36'}
18 | 
19 | def getHTML(url,file_name=''):
20 |     client = webdriver.Chrome()
21 |     client.get(url)
22 |     time.sleep(10)
23 | 
24 |     # 事件参数对象
25 |     actionChains = ActionChains(client)
26 | 
27 |     # 捕捉滑块元素
28 |     slide_btn = client.find_element(By.ID, 'slider')
29 |     # 观察网站滑块移动的长度和位置
30 |     actionChains.click_and_hold(slide_btn)
31 |     actionChains.move_by_offset(220,0)
32 |     # 这里要注意：
33 |     # 以下三个是以上面的坐标(220,0)为起点来计算的
34 |     # 所以最终移动的距离是220加上以下的累计
35 |     actionChains.move_by_offset(11,0)
36 |     actionChains.move_by_offset(13,0)
37 |     actionChains.move_by_offset(10,0)
38 | 
39 |     actionChains.release()
40 |     actionChains.perform()
41 | 
42 |     html = client.page_source
43 |     print(html)
44 |     client.quit()
45 | 
46 |     if file_name != '':
47 |         with open(file_name, 'w', encoding='utf-8') as f:
48 |             f.write(html)
49 |     return html
50 | 
51 | 
52 | def parseHTML(html):
53 |     root = etree.HTML(html)
54 |     trs = root.xpath('//tr')
55 | 
56 |     prices = []
57 |     for tr in trs:
58 |         tds = tr.xpath('./td')
59 |         if len(tds) > 2:
60 |             prices.append(int(tds[2].text))
61 |     print(prices)
62 |     print(np.mean(prices))
63 | 
64 | 
65 | if __name__ == '__main__':
66 |     # example: 1
67 |     html = getHTML(base_url, './data/c02/c02.html')
68 |     parseHTML(html)
69 | 
70 |     # example: 2
71 |     # html = requests.get(base_url, headers=myheaders).text
72 |     # a = html.index('encryptedData = "') + 17
73 |     # html = html[a:]
74 |     # b = html.index('";')
75 |     # html = html[:b]
76 |     # print(html)
77 |     # dic = eval(base64.b64decode(html.encode('utf-8')))
78 |     # objs = dic['flights']
79 |     # prices = []
80 |     # for obj in objs:
81 |     #     print(obj)
82 |     #     prices.append(obj['price'])
83 | 
84 |     # print(prices)
85 |     # print(np.mean(prices))
86 | 
87 | 


--------------------------------------------------------------------------------
/c03.py:
--------------------------------------------------------------------------------
 1 | # coding=utf-8
 2 | 
 3 | import requests
 4 | from lxml import etree
 5 | from selenium import webdriver
 6 | from selenium.webdriver.common.by import By
 7 | import time
 8 | import json
 9 | import hashlib
10 | import random
11 | import numpy as np
12 | 
13 | 
14 | base_url = 'https://spiderbuf.cn/web-scraping-practice/scraper-practice-c03'
15 | 
16 | myheaders = {
17 |     'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.164 Safari/537.36'}
18 | 
19 | def getHTML(url,file_name=''):
20 |     sepal_width_arr = []
21 |     client = webdriver.Chrome()
22 |     client.get(url)
23 |     time.sleep(5)
24 |     html = client.page_source
25 |     # print(html)
26 |     parseHTML(html,sepal_width_arr)
27 |     if file_name != '':
28 |         with open(file_name + '_1.html', 'w', encoding='utf-8') as f:
29 |             f.write(html)
30 |     for i in range(1,5):
31 |         client.find_elements(By.XPATH, '//ul/li/a')[i].click()
32 |         time.sleep(5)
33 |         html = client.page_source
34 |         # print(html)
35 |         parseHTML(html,sepal_width_arr)
36 |         if file_name != '':
37 |             with open(file_name + f'_{i+1}.html', 'w', encoding='utf-8') as f:
38 |                 f.write(html)
39 | 
40 |     client.quit()
41 |     print(sepal_width_arr)
42 |     print(np.sum(sepal_width_arr))
43 |     return html
44 | 
45 | 
46 | def parseHTML(html,sepal_width_arr):
47 |     root = etree.HTML(html)
48 |     trs = root.xpath('//tr')
49 |     for tr in trs:
50 |         tds = tr.xpath('./td')
51 |         if len(tds) > 2:
52 |             sepal_width_arr.append(float(tds[2].text))
53 |     
54 |     
55 | 
56 | 
57 | if __name__ == '__main__':
58 |     # example: 1
59 |     # html = getHTML(base_url, './data/c03/c03')
60 | 
61 |     # example: 2
62 |     sepal_width_arr = []
63 |     for i in range(1, 6):
64 |         random_value = random.randint(2000, 10000)
65 |         timestamp = int(time.time())
66 |         xorResult = i ^ timestamp
67 |         md5_hash = hashlib.md5()
68 |         md5_hash.update(f'{xorResult}{timestamp}'.encode('utf-8'))
69 |         hash = md5_hash.hexdigest()
70 |         payload = {
71 |             'random': random_value,
72 |             'timestamp': timestamp,
73 |             'hash': hash,
74 |             'xorResult': xorResult
75 |         }
76 |         # print(payload)
77 |         json_response = requests.post(base_url, headers=myheaders,json=payload).text
78 | 
79 |         print(json_response)
80 |         json_data = json.loads(json_response)
81 |         for item in json_data:
82 |             # print(item)
83 |             sepal_width_arr.append(item['sepal_width'])
84 | 
85 |     print(sepal_width_arr)
86 |     print(np.sum(sepal_width_arr))
87 | 
88 | 


--------------------------------------------------------------------------------
/c04.py:
--------------------------------------------------------------------------------
 1 | # coding=utf-8
 2 | # @Author: spiderbuf
 3 | from lxml import etree
 4 | from selenium import webdriver
 5 | from selenium.webdriver import ChromeOptions, ActionChains
 6 | from selenium.webdriver.common.by import By
 7 | import time
 8 | import random
 9 | import numpy as np
10 | import re
11 | 
12 | 
13 | base_url = 'https://spiderbuf.cn/web-scraping-practice/scraper-practice-c04'
14 | 
15 | myheaders = {
16 |     'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.164 Safari/537.36'}
17 | 
18 | 
19 | if __name__ == '__main__':
20 |     options = webdriver.ChromeOptions()
21 |     options.add_argument('disable-infobars')
22 |     options.set_capability('goog:loggingPrefs', {'browser': 'ALL'}) 
23 | 
24 |     options.add_argument('--disable-blink-features=AutomationControlled')  # 改变navigator.webdriver 属性值
25 | 
26 |     client = webdriver.Chrome(options=options)
27 |     print('Getting page...')
28 |     client.get(base_url)
29 |     time.sleep(3)
30 |     
31 |     # 模拟用户在页面上滑动光标
32 |     actionChains = ActionChains(client)
33 |     actionChains.move_by_offset(430,330)
34 |     for i in range(20):
35 |         step = random.randint(1, 10)
36 |         actionChains.move_by_offset(step,step).perform()
37 | 
38 |     checkbox = client.find_element(By.ID, 'captcha')
39 |     checkbox.click()
40 |     print('Checkbox clicked...')
41 |     time.sleep(3)
42 |     html = client.page_source
43 |     # print(html)
44 |     client.quit()
45 | 
46 |     with open('./data/c04/c04.html', 'w', encoding='utf-8') as f:
47 |         f.write(html)
48 | 
49 |     root = etree.HTML(html)
50 |     items = root.xpath('//div[@class="stats"]')
51 |     results = []
52 |     for item in items:
53 |         spans = item.xpath('.//span')
54 |         s = ''.join(spans[3].xpath('string(.)'))
55 |         results.append(int(re.findall('\d+',spans[0].text)[0]) + int(''.join(re.findall('\d+',s))))
56 | 
57 |     print(np.average(results))


--------------------------------------------------------------------------------
/e01.py:
--------------------------------------------------------------------------------
 1 | # coding=utf-8
 2 | 
 3 | import requests
 4 | from lxml import etree
 5 | 
 6 | url = 'https://spiderbuf.cn/web-scraping-practice/scraper-login-username-password/login'
 7 | 
 8 | myheaders = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.164 Safari/537.36'}
 9 | 
10 | payload = {'username':'admin','password':'123456'}
11 | 
12 | html = requests.post(url, headers=myheaders, data=payload).text
13 | print(html)
14 | 
15 | f = open('./data/e01/e01.html', 'w', encoding='utf-8')
16 | f.write(html)
17 | f.close()
18 | 
19 | root = etree.HTML(html)
20 | trs = root.xpath('//tr')
21 | 
22 | f = open('./data/e01/data_e01.txt', 'w', encoding='utf-8')
23 | for tr in trs:
24 |     tds = tr.xpath('./td')
25 |     s = ''
26 |     for td in tds:
27 |         # print(td.text)
28 |         s = s + str(td.text) + '|'
29 |     print(s)
30 |     if s != '':
31 |         f.write(s + '\n')
32 | 
33 | f.close()
34 | 
35 | 


--------------------------------------------------------------------------------
/e02.py:
--------------------------------------------------------------------------------
 1 | # coding=utf-8
 2 | 
 3 | import requests
 4 | from lxml import etree
 5 | 
 6 | url = 'https://spiderbuf.cn/web-scraping-practice/web-scraping-with-captcha/list'
 7 | 
 8 | # 注意：要把Cookie改成自己的 
 9 | myheaders = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.164 Safari/537.36',
10 |              'Cookie':'admin=a66abb5684c45962d887564f08346e8d'}
11 | 
12 | payload = {'username':'admin','password':'123456'}
13 | 
14 | html = requests.get(url, headers=myheaders, data=payload).text
15 | print(html)
16 | # exit();
17 | f = open('./data/e02/e02.html', 'w', encoding='utf-8')
18 | f.write(html)
19 | f.close()
20 | 
21 | root = etree.HTML(html)
22 | trs = root.xpath('//tr')
23 | 
24 | f = open('./data/e02/data_e02.txt', 'w', encoding='utf-8')
25 | for tr in trs:
26 |     tds = tr.xpath('./td')
27 |     s = ''
28 |     for td in tds:
29 |         # print(td.text)
30 |         s = s + str(td.text) + '|'
31 |     print(s)
32 |     if s != '':
33 |         f.write(s + '\n')
34 | 
35 | f.close()
36 | 
37 | 


--------------------------------------------------------------------------------
/e03.py:
--------------------------------------------------------------------------------
 1 | # coding=utf-8
 2 | 
 3 | import requests
 4 | from lxml import etree
 5 | import re
 6 | 
 7 | base_url = 'https://spiderbuf.cn/web-scraping-practice/scraping-random-pagination'
 8 | # https://spiderbuf.cn/e03/5f685274073b
 9 | 
10 | myheaders = {
11 |     'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.164 Safari/537.36'}
12 | 
13 | # 取页数
14 | html = requests.get(base_url, headers=myheaders).text
15 | root = etree.HTML(html)
16 | print(html)
17 | 
18 | lis = root.xpath('//ul[@class="pagination"]/li/a/@href')
19 | print(lis)
20 | 
21 | i = 1
22 | for item in lis:
23 |     print(item)
24 |     s = item.replace('/web-scraping-practice/scraping-random-pagination','')
25 |     print(base_url + s)
26 |     url = base_url + s
27 |     # print(url)
28 |     html = requests.get(url, headers=myheaders).text
29 |     # print(html)
30 |     #
31 |     f = open('./data/e03/e03_%d.html' % i, 'w', encoding='utf-8')
32 |     f.write(html)
33 |     f.close()
34 |     #
35 |     root = etree.HTML(html)
36 |     trs = root.xpath('//tr')
37 | 
38 |     f = open('./data/e03/e03_%d.txt' % i, 'w', encoding='utf-8')
39 |     for tr in trs:
40 |         tds = tr.xpath('./td')
41 |         s = ''
42 |         for td in tds:
43 |             s = s + str(td.xpath('string(.)')) + '|'
44 |             # s = s + str(td.text) + '|'
45 |         print(s)
46 |         if s != '':
47 |             f.write(s + '\n')
48 | 
49 |     f.close()
50 |     i += 1


--------------------------------------------------------------------------------
/e04.py:
--------------------------------------------------------------------------------
 1 | # coding=utf-8
 2 | 
 3 | import requests
 4 | from lxml import etree
 5 | import re
 6 | 
 7 | base_url = 'https://spiderbuf.cn/web-scraping-practice/block-ip-proxy'
 8 | 
 9 | myheaders = {
10 |     'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.164 Safari/537.36'}
11 | 
12 | proxies = {'http':'47.122.65.254:8080'}
13 | # 取页数
14 | html = requests.get(base_url, headers=myheaders,proxies=proxies).text
15 | root = etree.HTML(html)
16 | # print(html)
17 | lis = root.xpath('//ul[@class="pagination"]/li/a')
18 | pages = []
19 | for item in lis:
20 |     print(item.attrib['href'])
21 |     if item.attrib['class'] != 'item trap':
22 |         pages.append(item.attrib['href'])
23 | print(pages)
24 | i = 1
25 | for item in pages:
26 |     print(item)
27 |     s = item.replace('/web-scraping-practice/block-ip-proxy','')
28 |     print(base_url + s)
29 |     url = base_url + s
30 |     # print(url)
31 |     html = requests.get(url, headers=myheaders).text
32 |     # print(html)
33 |     #
34 |     f = open('./data/e04/e04_%d.html' % i, 'w', encoding='utf-8')
35 |     f.write(html)
36 |     f.close()
37 |     #
38 |     root = etree.HTML(html)
39 |     trs = root.xpath('//tr')
40 | 
41 |     f = open('./data/e04/e04_%d.txt' % i, 'w', encoding='utf-8')
42 |     for tr in trs:
43 |         tds = tr.xpath('./td')
44 |         s = ''
45 |         for td in tds:
46 |             s = s + str(td.xpath('string(.)')) + '|'
47 |             # s = s + str(td.text) + '|'
48 |         print(s)
49 |         if s != '':
50 |             f.write(s + '\n')
51 | 
52 |     f.close()
53 |     i += 1
54 | 


--------------------------------------------------------------------------------
/h01.py:
--------------------------------------------------------------------------------
 1 | # coding=utf-8
 2 | 
 3 | import requests
 4 | from lxml import etree
 5 | 
 6 | url = 'https://spiderbuf.cn/web-scraping-practice/scraping-css-confuse-offset'
 7 | 
 8 | myheaders = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.164 Safari/537.36',
 9 |              'Referer':'https://spiderbuf.cn/list'}
10 | 
11 | html = requests.get(url, headers=myheaders).text
12 | print(html)
13 | 
14 | f = open('./data/h01/h01.html', 'w', encoding='utf-8')
15 | f.write(html)
16 | f.close()
17 | 
18 | root = etree.HTML(html)
19 | ls = root.xpath('//div[@class ="container"]/div/div')
20 | # page_text = ls[0].xpath('string(.)')
21 | # print(page_text)
22 | 
23 | f = open('./data/h01/h01.txt', 'w', encoding='utf-8')
24 | for item in ls:
25 |     hnodes = item.xpath('./h2')
26 |     temp = hnodes[0].xpath('string(.)')
27 |     s0 = temp[1:2] + temp[0:1] + temp[2:]
28 |     print(s0)
29 | 
30 |     pnodes = item.xpath('./p')
31 |     s1 = pnodes[0].text
32 |     print(s1)
33 |     temp = pnodes[1].xpath('string(.)').replace('企业估值(亿元)：','')
34 |     s2 = temp[1:2] + temp[0:1] + temp[2:]
35 |     print(s2)
36 |     s3 = pnodes[2].text
37 |     print(s3)
38 |     s4 = pnodes[3].text
39 |     print(s4)
40 |     # 富邦金融控股排名：50企业估值(亿元)：2135CEO：蔡明兴行业：金融服务
41 |     s = s0 + '|' + s1.replace('排名：','') + '|' + s2.replace('企业估值(亿元)：','') + '|' \
42 |         + s3.replace('CEO：','') + '|' + s4.replace('行业：','') + '\n'
43 |     print(s)
44 |     f.write(s)
45 | 
46 | f.close()


--------------------------------------------------------------------------------
/h02.py:
--------------------------------------------------------------------------------
  1 | # coding=utf-8
  2 | import os.path
  3 | 
  4 | import requests
  5 | from lxml import etree
  6 | import time
  7 | 
  8 | base_url = 'https://spiderbuf.cn/web-scraping-practice/scraping-douban-movies-xpath-advanced'
  9 | 
 10 | myheaders = {
 11 |     'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.164 Safari/537.36'}
 12 | 
 13 | def getHTML(url,file_name=''):
 14 |     html = requests.get(url, headers=myheaders).text
 15 |     if file_name != '':
 16 |         with open(file_name, 'w', encoding='utf-8') as f:
 17 |             f.write(html)
 18 |     return html
 19 | 
 20 | 
 21 | def downloadImage(url, path=''):
 22 |     img_data = requests.get(url, headers=myheaders).content
 23 |     # get image file name
 24 |     file_name = url.split('/').pop()
 25 | 
 26 |     with open(os.path.join(path, file_name), 'wb') as img:
 27 |         img.write(img_data)
 28 | 
 29 | 
 30 | def parseHTML(html):
 31 |     # parse html source code here
 32 |     root = etree.HTML(html)
 33 |     divs = root.xpath('/html/body/div/div[@style="margin-top: 10px;"]')
 34 |     i = 1
 35 |     for div in divs:
 36 |     # <div class ="row" style="margin-top: 10px;">
 37 |     #     <div class ="col-xs-12 col-lg-12">
 38 |     #         <h2 > 肖申克的救赎 The Shawshank Redemption</h2 ><br>
 39 |     #         <div class ="col-xs-3 col-lg-3">
 40 |     #             <img src="/static/images/douban_movie/posters/tt0111161.jpg" alt = "肖申克的救赎 The Shawshank Redemption" class="img-responsive img-thumbnail">
 41 |     #         </div>
 42 |     #         <div class ="col-xs-9 col-lg-9">
 43 |     #             <span> 豆瓣电影评分: </span> 9.7<br>
 44 |     #             <span> <span> 导演 </span>: <span> 弗兰克·德拉邦特 </span> </span><br>
 45 |     #             <span> <span> 编剧 </span>: <span> 弗兰克·德拉邦特 </span> </span>/ 弗兰克·德拉邦特/ 斯蒂芬·金<br>
 46 |     #             <span> <span> 主演 </span>: <span> 蒂姆·罗宾斯 </span> </span>/ 蒂姆·罗宾斯/ 摩根·弗里曼/ 鲍勃·冈顿/ 威廉姆·赛德勒/ 克兰西·布朗/ 吉尔·贝罗斯/ 马克·罗斯顿/ 詹姆斯·惠特摩/ 杰弗里·德曼/ 拉里·布兰登伯格/ 尼尔·吉恩托利/ 布赖恩·利比/ 大卫·普罗瓦尔/ 约瑟夫·劳格诺/ 祖德·塞克利拉/ 保罗·麦克兰尼/ 芮妮·布莱恩/ 阿方索·弗里曼/ V·J·福斯特/ 弗兰克·梅德拉诺/ 马克·迈尔斯/ 尼尔·萨默斯/ 耐德·巴拉米/ 布赖恩·戴拉特/ 唐·麦克马纳斯<br>
 47 |     #             <span> <span> 类型: </span> <span> 剧情 </span> </span>/ 剧情/ 犯罪<br>
 48 |     #             <span> <span> 制片国家/ 地区: </span> <span> 美国 </span> </span><br>
 49 |     #             <span> <span> 语言: </span> <span> 英语 </span> </span><br>
 50 |     #             <span> <span> 上映日期: </span> <span> 1994 - 09 - 10(多伦多电影节) </span> </span>/ 1994 - 09 - 10(多伦多电影节)/ 1994 - 10 - 14(美国)<br>
 51 |     #             <span> <span> 片长: </span> <span> 142分钟 </span> </span><br>
 52 |     #             <span> <span> 又名: </span> <span> 月黑高飞(港) </span> </span>/ 月黑高飞(港)/ 刺激1995(台)/ 地狱诺言/ 铁窗岁月/ 消香克的救赎<br>
 53 |     #             <span> IMDb: </span> tt0111161<br>
 54 |     #         </div>
 55 |     #     </div>
 56 |     # </div>
 57 |         if i % 2 == 0:
 58 |             # 简介 /html/body/div[2] /div[3]/div
 59 |             summarys = div.xpath('./div/text()')
 60 |             summary = ''
 61 |             if len(summarys) > 0:
 62 |                 summary = summarys[0].strip()
 63 |             print(summary)
 64 |         else:
 65 |             titles = div.xpath('./div/h2')
 66 |             title = ''
 67 |             if len(titles) > 0:
 68 |                 title = titles[0].text
 69 |                 print(title)
 70 |             #haibao
 71 |             img_urls = div.xpath('./div/div/img/@src')
 72 |             img_url = ''
 73 |             if len(img_urls) > 0:
 74 |                 img_url = 'https://spiderbuf.cn/' + img_urls[0]
 75 |             print(img_url)
 76 |             downloadImage(img_url, './data/h02')
 77 |             # 评分 /html/body/div[2]/div[2]  /div/div[2]/span[1]
 78 |             ratings = div.xpath('./div/div/span[contains(text(),"豆瓣电影评分:")]/following::text()[1]')
 79 |             rating = ''
 80 |             if len(ratings) > 0:
 81 |                 rating = ratings[0].strip()
 82 |             print(rating)
 83 |             # 导演 /html/body/div[2]/div[2] /div/div[2]/span[2]/span[2]
 84 |             directors = div.xpath('./div/div/span/span[contains(text(),"导演")]/following::text()')
 85 |             director = ''
 86 |             if len(directors) > 1:
 87 |                 director = directors[1].strip()
 88 |             if len(directors) > 3:
 89 |                 director += '/' + directors[2].strip()
 90 |             # for item in directors:
 91 |             #     if director != '':
 92 |             #         director += ' / '
 93 |             #     director += item.text
 94 |             print(director)
 95 |             # 编剧 /html/body/div[2]/div[2]  /div/div[2]/span[3]/span[2]
 96 |             scriptwriters = div.xpath('./div/div/span/span[contains(text(),"编剧")]/following::text()')
 97 |             scriptwriter = ''
 98 |             if len(scriptwriters) > 0:
 99 |                 scriptwriter = scriptwriters[1].strip()
100 | 
101 |             if len(scriptwriters) > 3:
102 |                 scriptwriter += scriptwriters[2].strip()
103 |             print(scriptwriter)
104 |             # 主演
105 |             performers = div.xpath('./div/div/span/span[contains(text(),"主演")]/following::text()')
106 |             performer = ''
107 |             if len(performers) > 0:
108 |                 performer = performers[1].strip()
109 | 
110 |             if len(performers) > 3:
111 |                 performer += performers[2].strip()
112 |             print(performer)
113 |             # 类型
114 |             genres = div.xpath('./div/div/span/span[contains(text(),"类型:")]/following::text()')
115 |             genre = ''
116 |             if len(genres) > 0:
117 |                 genre = genres[0].strip()
118 | 
119 |             if len(performers) > 1:
120 |                 genre += genres[1].strip()
121 |             print(genre)
122 |             # 制片国家/地区
123 |             areas = div.xpath('./div/div/span/span[contains(text(),"制片国家/地区:")]/following::text()')
124 |             area = ''
125 |             if len(areas) > 0:
126 |                 area = areas[0].strip()
127 |             print(area)
128 |             # 语言
129 |             langs = div.xpath('./div/div/span/span[contains(text(),"语言:")]/following::text()')
130 |             lang = ''
131 |             if len(langs) > 0:
132 |                 lang = langs[0].strip().replace('\n', '')
133 |             if len(langs) > 1:
134 |                 lang += langs[1].strip().replace('\n', '')
135 |             print(lang)
136 |             # 又名
137 |             aliases = div.xpath('./div/div/span/span[contains(text(),"又名:")]/following::text()')
138 |             alias = ''
139 |             if len(aliases) > 0:
140 |                 alias = aliases[0].strip().replace('\n', '').replace('|', '')
141 |             if len(aliases) > 1:
142 |                 alias += aliases[1].strip().replace('\n', '').replace('|', '')
143 |             print(alias)
144 |             # IMDb
145 |             imdbs = div.xpath('./div/div/span[contains(text(),"IMDb:")]/following::text()')
146 |             imdb = ''
147 |             if len(imdbs) > 0:
148 |                 imdb = imdbs[0].strip().replace('\n', '')
149 |             print(imdb)
150 |             # 上映日期
151 |             release_dates = div.xpath('./div/div/span/span[contains(text(),"上映日期:")]/following::text()')
152 |             release_date = ''
153 |             if len(release_dates) > 0:
154 |                 release_date = release_dates[0].strip().replace('\n', '')
155 |             if len(release_dates) > 1:
156 |                 release_date += release_dates[1].strip().replace('\n', '')
157 |             print(release_date)
158 |             # 片长
159 |             runtimes = div.xpath('./div/div/span/span[contains(text(),"片长:")]/following::text()')
160 |             runtime = ''
161 |             if len(runtimes) > 0:
162 |                 runtime = runtimes[0].strip().replace('\n', '')
163 |             if len(runtimes) > 1:
164 |                 runtime += runtimes[1].strip().replace('\n', '')
165 |             print(runtime)
166 |         i += 1
167 | 
168 | 
169 | if __name__ == '__main__':
170 |     html = getHTML(base_url, './data/h02/h02.html')
171 |     # with open('./data/h02/h02.html', 'r', encoding='utf-8') as f:
172 |     #     html = f.read()
173 |     parseHTML(html)
174 | 


--------------------------------------------------------------------------------
/h03.py:
--------------------------------------------------------------------------------
  1 | # coding=utf-8
  2 | import os.path
  3 | 
  4 | import requests
  5 | from lxml import etree
  6 | import time
  7 | 
  8 | base_url = 'https://spiderbuf.cn/web-scraping-practice/scraping-scroll-load'
  9 | 
 10 | myheaders = {
 11 |     'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.164 Safari/537.36'}
 12 | 
 13 | def getHTML(url,file_name=''):
 14 |     html = requests.get(url, headers=myheaders).text
 15 |     if file_name != '':
 16 |         with open(file_name, 'w', encoding='utf-8') as f:
 17 |             f.write(html)
 18 |     return html
 19 | 
 20 | 
 21 | def downloadImage(url, path=''):
 22 |     img_data = requests.get(url, headers=myheaders).content
 23 |     # get image file name
 24 |     file_name = url.split('/').pop()
 25 | 
 26 |     with open(os.path.join(path, file_name), 'wb') as img:
 27 |         img.write(img_data)
 28 | 
 29 | 
 30 | def parseHTML(html):
 31 |     # parse html source code here
 32 |     root = etree.HTML(html)
 33 |     divs = root.xpath('/html/body/div/div/div[@style="margin-top: 10px;"]')
 34 |     i = 1
 35 |     for div in divs:
 36 |         if i % 2 == 0:
 37 |             # 简介 /html/body/div[2] /div[3]/div
 38 |             summarys = div.xpath('./div/text()')
 39 |             summary = ''
 40 |             if len(summarys) > 0:
 41 |                 summary = summarys[0].strip()
 42 |             print(summary)
 43 |         else:
 44 |             titles = div.xpath('./div/h2')
 45 |             title = ''
 46 |             if len(titles) > 0:
 47 |                 title = titles[0].text
 48 |                 print(title)
 49 |             #haibao
 50 |             img_urls = div.xpath('./div/div/img/@src')
 51 |             img_url = ''
 52 |             if len(img_urls) > 0:
 53 |                 img_url = 'https://spiderbuf.cn/' + img_urls[0]
 54 |             print(img_url)
 55 |             downloadImage(img_url, './data/h02')
 56 |             # 评分 /html/body/div[2]/div[2]  /div/div[2]/span[1]
 57 |             ratings = div.xpath('./div/div/span[contains(text(),"豆瓣电影评分:")]/following::text()[1]')
 58 |             rating = ''
 59 |             if len(ratings) > 0:
 60 |                 rating = ratings[0].strip()
 61 |             print(rating)
 62 |             # 导演 /html/body/div[2]/div[2] /div/div[2]/span[2]/span[2]
 63 |             directors = div.xpath('./div/div/span/span[contains(text(),"导演")]/following::text()')
 64 |             director = ''
 65 |             if len(directors) > 1:
 66 |                 director = directors[1].strip()
 67 |             if len(directors) > 3:
 68 |                 director += '/' + directors[2].strip()
 69 |             # for item in directors:
 70 |             #     if director != '':
 71 |             #         director += ' / '
 72 |             #     director += item.text
 73 |             print(director)
 74 |             # 编剧 /html/body/div[2]/div[2]  /div/div[2]/span[3]/span[2]
 75 |             scriptwriters = div.xpath('./div/div/span/span[contains(text(),"编剧")]/following::text()')
 76 |             scriptwriter = ''
 77 |             if len(scriptwriters) > 0:
 78 |                 scriptwriter = scriptwriters[1].strip()
 79 | 
 80 |             if len(scriptwriters) > 3:
 81 |                 scriptwriter += scriptwriters[2].strip()
 82 |             print(scriptwriter)
 83 |             # 主演
 84 |             performers = div.xpath('./div/div/span/span[contains(text(),"主演")]/following::text()')
 85 |             performer = ''
 86 |             if len(performers) > 0:
 87 |                 performer = performers[1].strip()
 88 | 
 89 |             if len(performers) > 3:
 90 |                 performer += performers[2].strip()
 91 |             print(performer)
 92 |             # 类型
 93 |             genres = div.xpath('./div/div/span/span[contains(text(),"类型:")]/following::text()')
 94 |             genre = ''
 95 |             if len(genres) > 0:
 96 |                 genre = genres[0].strip()
 97 | 
 98 |             if len(performers) > 1:
 99 |                 genre += genres[1].strip()
100 |             print(genre)
101 |             # 制片国家/地区
102 |             areas = div.xpath('./div/div/span/span[contains(text(),"制片国家/地区:")]/following::text()')
103 |             area = ''
104 |             if len(areas) > 0:
105 |                 area = areas[0].strip()
106 |             print(area)
107 |             # 语言
108 |             langs = div.xpath('./div/div/span/span[contains(text(),"语言:")]/following::text()')
109 |             lang = ''
110 |             if len(langs) > 0:
111 |                 lang = langs[0].strip().replace('\n', '')
112 |             if len(langs) > 1:
113 |                 lang += langs[1].strip().replace('\n', '')
114 |             print(lang)
115 |             # 又名
116 |             aliases = div.xpath('./div/div/span/span[contains(text(),"又名:")]/following::text()')
117 |             alias = ''
118 |             if len(aliases) > 0:
119 |                 alias = aliases[0].strip().replace('\n', '').replace('|', '')
120 |             if len(aliases) > 1:
121 |                 alias += aliases[1].strip().replace('\n', '').replace('|', '')
122 |             print(alias)
123 |             # IMDb
124 |             imdbs = div.xpath('./div/div/span[contains(text(),"IMDb:")]/following::text()')
125 |             imdb = ''
126 |             if len(imdbs) > 0:
127 |                 imdb = imdbs[0].strip().replace('\n', '')
128 |             print(imdb)
129 |             # 上映日期
130 |             release_dates = div.xpath('./div/div/span/span[contains(text(),"上映日期:")]/following::text()')
131 |             release_date = ''
132 |             if len(release_dates) > 0:
133 |                 release_date = release_dates[0].strip().replace('\n', '')
134 |             if len(release_dates) > 1:
135 |                 release_date += release_dates[1].strip().replace('\n', '')
136 |             print(release_date)
137 |             # 片长
138 |             runtimes = div.xpath('./div/div/span/span[contains(text(),"片长:")]/following::text()')
139 |             runtime = ''
140 |             if len(runtimes) > 0:
141 |                 runtime = runtimes[0].strip().replace('\n', '')
142 |             if len(runtimes) > 1:
143 |                 runtime += runtimes[1].strip().replace('\n', '')
144 |             print(runtime)
145 |         i += 1
146 | 
147 | 
148 | if __name__ == '__main__':
149 | 
150 |     html = getHTML(base_url, './data/h03/h03.html')
151 |     # get next page uri
152 |     uri = ''
153 |     root = etree.HTML(html)
154 |     divs = root.xpath('//div[@id="sLaOuol2SM0iFj4d"]/text()')
155 |     if len(divs) > 0:
156 |         uri = divs[0]
157 | 
158 |     i = 1
159 |     while (uri != '') & (i < 10):
160 |         print(uri)
161 |         html = getHTML(base_url + '/' +  uri, f'./data/h03/h03_{uri}.html')
162 |         uri = '' # ***
163 |         root = etree.HTML(html)
164 |         divs = root.xpath('//div[@id="sLaOuol2SM0iFj4d"]/text()')
165 |         if len(divs) > 0:
166 |             uri = divs[0]
167 |         i += 1
168 | 
169 |     # parseHTML(html)
170 | 


--------------------------------------------------------------------------------
/h04.py:
--------------------------------------------------------------------------------
 1 | # coding=utf-8
 2 | 
 3 | import requests
 4 | from lxml import etree
 5 | from selenium import webdriver
 6 | import time
 7 | 
 8 | 
 9 | base_url = 'https://spiderbuf.cn/web-scraping-practice/javascript-confuse-encrypt-reverse'
10 | 
11 | myheaders = {
12 |     'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.164 Safari/537.36'}
13 | 
14 | def getHTML(url,file_name=''):
15 |     client = webdriver.Chrome()
16 |     client.get(url)
17 |     time.sleep(3)
18 |     html = client.page_source
19 |     print(html)
20 |     client.quit()
21 | 
22 |     if file_name != '':
23 |         with open(file_name, 'w', encoding='utf-8') as f:
24 |             f.write(html)
25 |     return html
26 | 
27 | 
28 | def parseHTML(html,file_name=''):
29 |     root = etree.HTML(html)
30 |     trs = root.xpath('//tr')
31 | 
32 |     if file_name != '':
33 |         f = open(file_name, 'w', encoding='utf-8')
34 | 
35 |     for tr in trs:
36 |         tds = tr.xpath('./td')
37 |         s = ''
38 |         for td in tds:
39 |             s = s + str(td.xpath('string(.)')) + '|'
40 |             # s = s + str(td.text) + '|'
41 |         print(s)
42 |         if (s != '') & (file_name != ''):
43 |             f.write(s + '\n')
44 |     f.close()
45 | 
46 | 
47 | if __name__ == '__main__':
48 |     # example: 1
49 |     html = getHTML(base_url, './data/h04/h04.html')
50 |     # parseHTML(html, './data/h04/h04.txt')
51 | 
52 |     # example: 2
53 |     # url = 'https://spiderbuf.cn/static/js/h04/udSL29.js'
54 |     # js_code = requests.get(url, headers=myheaders).text
55 |     # # js_code = js_code.encode('utf-8').decode('unicode-escape')
56 |     # a = js_code.index('=') + 1
57 |     # b = js_code.index(';')
58 |     # js_code = js_code[a:b]
59 | 
60 |     # # 将字符串转换为字典
61 |     # dict_data = eval(js_code)
62 |     # print(dict_data)
63 |     # for item in dict_data:
64 |     #     print(item)
65 | 


--------------------------------------------------------------------------------
/h05.py:
--------------------------------------------------------------------------------
 1 | # coding=utf-8
 2 | import base64
 3 | import hashlib
 4 | import time
 5 | 
 6 | import requests
 7 | from lxml import etree
 8 | from selenium import webdriver
 9 | 
10 | 
11 | base_url = 'https://spiderbuf.cn/web-scraping-practice/javascript-reverse-timestamp'
12 | 
13 | myheaders = {
14 |     'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.164 Safari/537.36'}
15 | 
16 | def getHTML(url,file_name=''):
17 |     client = webdriver.Chrome()
18 |     client.get(url)
19 |     time.sleep(5)
20 |     html = client.page_source
21 |     print(html)
22 |     client.quit()
23 | 
24 |     if file_name != '':
25 |         with open(file_name, 'w', encoding='utf-8') as f:
26 |             f.write(html)
27 |     return html
28 | 
29 | 
30 | def parseHTML(html,file_name=''):
31 |     root = etree.HTML(html)
32 |     trs = root.xpath('//tr')
33 | 
34 |     if file_name != '':
35 |         f = open(file_name, 'w', encoding='utf-8')
36 | 
37 |     for tr in trs:
38 |         tds = tr.xpath('./td')
39 |         s = ''
40 |         for td in tds:
41 |             s = s + str(td.xpath('string(.)')) + '|'
42 |             # s = s + str(td.text) + '|'
43 |         print(s)
44 |         if (s != '') & (file_name != ''):
45 |             f.write(s + '\n')
46 |     f.close()
47 | 
48 | 
49 | if __name__ == '__main__':
50 |     # example: 1
51 |     html = getHTML(base_url, './data/h05/h05.html')
52 |     # parseHTML(html, './data/h04/h04.txt')
53 | 
54 |     # example: 2
55 |     # url = 'https://spiderbuf.cn/web-scraping-practice/javascript-reverse-timestamp/api/'
56 |     # timestamp = str(int(time.time()))
57 |     # md5_hash = hashlib.md5()
58 |     # md5_hash.update(timestamp.encode('utf-8'))
59 |     # md5 = md5_hash.hexdigest()
60 |     # s = ('%s,%s' % (timestamp, md5))
61 |     # print(s)
62 |     # payload = str(base64.b64encode(s.encode('utf-8')), 'utf-8')
63 |     # print(payload)
64 |     # html = requests.get(url + payload, headers=myheaders).text
65 |     # print(html)
66 | 


--------------------------------------------------------------------------------
/h06.py:
--------------------------------------------------------------------------------
 1 | # coding=utf-8
 2 | import base64
 3 | import hashlib
 4 | import time
 5 | 
 6 | import requests
 7 | from lxml import etree
 8 | from selenium import webdriver
 9 | 
10 | 
11 | base_url = 'https://spiderbuf.cn/web-scraping-practice/selenium-fingerprint-anti-scraper'
12 | 
13 | myheaders = {
14 |     'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.164 Safari/537.36'}
15 | 
16 | def getHTML(url,file_name=''):
17 |     # client = webdriver.Chrome()
18 |     # client.get(url)
19 |     # html = client.page_source
20 |     # print(html)
21 |     # client.quit()
22 |     options = webdriver.ChromeOptions()
23 |     options.add_argument('disable-infobars')
24 |     # options.add_argument('headless')
25 |     options.set_capability('goog:loggingPrefs', {'browser': 'ALL'})  # 输出浏览器console 日志：console.log
26 | 
27 |     options.add_argument('--disable-blink-features=AutomationControlled')  # 改变navigator.webdriver 属性值
28 | 
29 |     client = webdriver.Chrome(options=options)
30 |     client.get(url)
31 |     time.sleep(5)
32 |     print(client.page_source)
33 |     html = client.page_source
34 |     
35 |     # client.quit()
36 | 
37 |     if file_name != '':
38 |         with open(file_name, 'w', encoding='utf-8') as f:
39 |             f.write(html)
40 |     return html
41 | 
42 | 
43 | def parseHTML(html,file_name=''):
44 |     root = etree.HTML(html)
45 |     trs = root.xpath('//tr')
46 | 
47 |     if file_name != '':
48 |         f = open(file_name, 'w', encoding='utf-8')
49 | 
50 |     for tr in trs:
51 |         tds = tr.xpath('./td')
52 |         s = ''
53 |         for td in tds:
54 |             s = s + str(td.xpath('string(.)')) + '|'
55 |             # s = s + str(td.text) + '|'
56 |         print(s)
57 |         if (s != '') & (file_name != ''):
58 |             f.write(s + '\n')
59 |     f.close()
60 | 
61 | 
62 | if __name__ == '__main__':
63 |     # example: 1
64 |     html = getHTML(base_url, './data/h06/h06.html')
65 |     # print(html)
66 |     # parseHTML(html, './data/h06/h06.txt')
67 | 
68 |     # example: 2
69 |     # url = 'https://spiderbuf.cn/web-scraping-practice/selenium-fingerprint-anti-scraper/api/'
70 |     # timestamp = str(int(time.time()))
71 |     # md5_hash = hashlib.md5()
72 |     # md5_hash.update(timestamp.encode('utf-8'))
73 |     # md5 = md5_hash.hexdigest()
74 |     # s = ('%s,%s' % (timestamp, md5))
75 |     # print(s)
76 |     # payload = str(base64.b64encode(s.encode('utf-8')), 'utf-8')
77 |     # print(payload)
78 |     # html = requests.get(url + payload, headers=myheaders).text
79 |     # print(html)
80 |     # # 将字符串转换为字典
81 |     # dict_data = eval(html)
82 |     # print(dict_data)
83 |     # for item in dict_data:
84 |     #     print(item)
85 | 


--------------------------------------------------------------------------------
/n01.py:
--------------------------------------------------------------------------------
 1 | # coding=utf-8
 2 | 
 3 | import requests
 4 | from lxml import etree
 5 | 
 6 | url = 'https://spiderbuf.cn/web-scraping-practice/user-agent-referrer'
 7 | 
 8 | myheaders = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.164 Safari/537.36',
 9 |              'Referer':'https://spiderbuf.cn/list'}
10 | 
11 | html = requests.get(url, headers=myheaders).text
12 | print(html)
13 | 
14 | f = open('./data/n01/n01.html', 'w', encoding='utf-8')
15 | f.write(html)
16 | f.close()
17 | 
18 | root = etree.HTML(html)
19 | ls = root.xpath('//div[@class ="container"]/div/div')
20 | # page_text = ls[0].xpath('string(.)')
21 | # print(page_text)
22 | 
23 | f = open('./data/n01/n01.txt', 'w', encoding='utf-8')
24 | for item in ls:
25 |     hnodes = item.xpath('./h2')
26 |     s0 = hnodes[0].text
27 | 
28 |     pnodes = item.xpath('./p')
29 |     s1 = pnodes[0].text
30 |     s2 = pnodes[1].text
31 |     s3 = pnodes[2].text
32 |     s4 = pnodes[3].text
33 |     # 富邦金融控股排名：50企业估值(亿元)：2135CEO：蔡明兴行业：金融服务
34 |     s = s0 + '|' + s1.replace('排名：','') + '|' + s2.replace('企业估值(亿元)：','') + '|' \
35 |         + s3.replace('CEO：','') + '|' + s4.replace('行业：','') + '\n'
36 |     print(s)
37 |     f.write(s)
38 |     # s = ''
39 |     # for td in tds:
40 |     #     s = s + str(td.xpath('string(.)')) + '|'
41 |     #     # s = s + str(td.text) + '|'
42 |     # print(s)
43 |     # if s != '':
44 |     #     f.write(s + '\n')
45 | 
46 | f.close()


--------------------------------------------------------------------------------
/n02.py:
--------------------------------------------------------------------------------
 1 | # coding=utf-8
 2 | 
 3 | import requests
 4 | from lxml import etree
 5 | 
 6 | import base64
 7 | 
 8 | url = 'https://spiderbuf.cn/web-scraping-practice/scraping-images-base64'
 9 | 
10 | myheaders = {
11 |     'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.164 Safari/537.36'}
12 | 
13 | 
14 | html = requests.get(url, headers=myheaders).text
15 | print(html)
16 | 
17 | f = open('./data/n02/n02.html', 'w', encoding='utf-8')
18 | f.write(html)
19 | f.close()
20 | 
21 | root = etree.HTML(html)
22 | imgs = root.xpath('//img/@src')
23 | print(imgs)
24 | for item in imgs:
25 |     print(item)
26 |     # item 是获取到的base64字符串
27 |     item = item.replace('data:image/png;base64,','')
28 |     str_bytes = item.encode('raw_unicode_escape')  # str 转 bytes
29 |     decoded = base64.b64decode(str_bytes)
30 | 
31 |     img = open('./data/n02/n02.png', 'wb')
32 |     img.write(decoded)
33 |     img.close()
34 | 
35 | 


--------------------------------------------------------------------------------
/n03.py:
--------------------------------------------------------------------------------
 1 | # coding=utf-8
 2 | 
 3 | import requests
 4 | from lxml import etree
 5 | import time
 6 | 
 7 | base_url = 'https://spiderbuf.cn/web-scraping-practice/scraper-bypass-request-limit/%d'
 8 | 
 9 | myheaders = {
10 |     'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.164 Safari/537.36'}
11 | 
12 | max_no = 20
13 | # exit()
14 | 
15 | for i in range(1, max_no + 1):
16 |     print(i)
17 |     url = base_url % i
18 |     print(url)
19 |     html = requests.get(url, headers=myheaders).text
20 |     print(html)
21 | 
22 |     f = open('./data/n03/n03_%d.html' % i, 'w', encoding='utf-8')
23 |     f.write(html)
24 |     f.close()
25 | 
26 |     root = etree.HTML(html)
27 |     trs = root.xpath('//tr')
28 | 
29 |     f = open('./data/n03/datan03_%d.txt' % i, 'w', encoding='utf-8')
30 |     for tr in trs:
31 |         tds = tr.xpath('./td')
32 |         s = ''
33 |         for td in tds:
34 |             s = s + str(td.xpath('string(.)')) + '|'
35 |             # s = s + str(td.text) + '|'
36 |         print(s)
37 |         if s != '':
38 |             f.write(s + '\n')
39 |     time.sleep(2)
40 |     f.close()
41 | 


--------------------------------------------------------------------------------
/n04.py:
--------------------------------------------------------------------------------
 1 | # coding=utf-8
 2 | import os.path
 3 | 
 4 | import requests
 5 | from lxml import etree
 6 | import time
 7 | 
 8 | base_url = 'https://spiderbuf.cn/web-scraping-practice/css-pseudo-elements'
 9 | 
10 | myheaders = {
11 |     'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.164 Safari/537.36'}
12 | 
13 | def getHTML(url,file_name=''):
14 |     html = requests.get(url, headers=myheaders).text
15 |     if file_name != '':
16 |         with open(file_name, 'w', encoding='utf-8') as f:
17 |             f.write(html)
18 |     return html
19 | 
20 | 
21 | 
22 | def parseHTML(html):
23 |     class_map = {'abcdef::before':'7',
24 |                 'abcdef::after':'5',
25 |                 'ghijkl::before':'8',
26 |                 'ghijkl::after':'9',
27 |                 'mnopqr::before':'9',
28 |                 'mnopqr::after':'1',
29 |                 'uvwxyz::before':'1',
30 |                 'uvwxyz::after':'4',
31 |                 'yzabcd::before':'2',
32 |                 'yzabcd::after':'6',
33 |                 'efghij::before':'3',
34 |                 'efghij::after':'2',
35 |                 'klmnop::before':'5',
36 |                 'klmnop::after':'7',
37 |                 'qrstuv::before':'4',
38 |                 'qrstuv::after':'3',
39 |                 'wxyzab::before':'6',
40 |                 'wxyzab::after':'0',
41 |                 'cdefgh::before':'0',
42 |                 'cdefgh::after':'8',
43 |                 'hijklm::after':'6',
44 |                 'opqrst::after':'0',
45 |                 'uvwxab::after':'3',
46 |                 'cdijkl::after':'8',
47 |                 'pqrmno::after':'1',
48 |                 'stuvwx::after':'4',
49 |                 'pkenmc::after':'7',
50 |                 'tcwdsk::after':'9',
51 |                 'mkrtyu::after':'5',
52 |                 'umdrtk::after':'2'}
53 |     # parse html source code here
54 |     root = etree.HTML(html)
55 |     divs = root.xpath('/html/body/div/div[@style="margin-top: 10px;"]')
56 | 
57 |     for div in divs:
58 |         titles = div.xpath('./div/h2')
59 |         title = ''
60 |         if len(titles) > 0:
61 |             title = titles[0].text
62 |             print(title)
63 |         # 评分
64 |         ranking_spans = div.xpath('./div/div[2]/span[@class]')
65 | 
66 |         if len(ranking_spans) > 0:
67 |             span = ranking_spans[0]
68 |             attr_class = span.attrib["class"] if "class" in span.attrib else ""
69 |             # print(f"{span} - {attr_class}")
70 |             # print(span.text)
71 | 
72 |             classes = attr_class.split(" ")
73 |             if len(classes) > 0:
74 |                 s1 = class_map[classes[0] + '::before']
75 |                 s2 = class_map[classes[1] + '::after']
76 |                 print(f'{s1}.{s2}')
77 | 
78 | 
79 | if __name__ == '__main__':
80 |     html = getHTML(base_url, './data/n04/n04.html')
81 |     parseHTML(html)
82 | 


--------------------------------------------------------------------------------
/n05.py:
--------------------------------------------------------------------------------
 1 | # coding=utf-8
 2 | import os.path
 3 | 
 4 | import requests
 5 | from lxml import etree
 6 | import time
 7 | 
 8 | base_url = 'https://spiderbuf.cn/web-scraping-practice/css-sprites'
 9 | 
10 | myheaders = {
11 |     'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.164 Safari/537.36'}
12 | 
13 | def getHTML(url,file_name=''):
14 |     html_bytes = requests.get(url, headers=myheaders).content
15 |     html = html_bytes.decode()
16 |     if file_name != '':
17 |         with open(file_name, 'w', encoding='utf-8') as f:
18 |             f.write(html)
19 |     return html
20 | 
21 | 
22 | 
23 | def parseHTML(html):
24 |     class_map = {'sprite abcdef':'0',
25 |                 'sprite ghijkl':'1',
26 |                 'sprite mnopqr':'2',
27 |                 'sprite uvwxyz':'3',
28 |                 'sprite yzabcd':'4',
29 |                 'sprite efghij':'5',
30 |                 'sprite klmnop':'6',
31 |                 'sprite qrstuv':'7',
32 |                 'sprite wxyzab':'8',
33 |                 'sprite cdefgh':'9'}
34 |     # parse html source code here
35 |     root = etree.HTML(html)
36 |     divs = root.xpath('//div[@style="margin-bottom: 30px;"]')
37 | 
38 |     for div in divs:
39 |         titles = div.xpath('./h2')
40 |         title = ''
41 |         if len(titles) > 0:
42 |             title = titles[0].text
43 |             print(title)
44 | 
45 |         amount_spans = div.xpath('./p/span[@class]')
46 |         amount_str = ''
47 |         for span in amount_spans:
48 |             attr_class = span.attrib["class"] if "class" in span.attrib else ""
49 |             # print(f"{span} - {attr_class}")
50 |             # print(span.text)
51 |             amount_str += class_map[attr_class]
52 |         print(amount_str)
53 | 
54 | 
55 | 
56 | if __name__ == '__main__':
57 |     html = getHTML(base_url, './data/n05/n05.html')
58 |     parseHTML(html)
59 | 


--------------------------------------------------------------------------------
/n06.py:
--------------------------------------------------------------------------------
 1 | # coding=utf-8
 2 | import os.path
 3 | 
 4 | import requests
 5 | from lxml import etree
 6 | import time
 7 | 
 8 | base_url = 'https://spiderbuf.cn/web-scraping-practice/scraping-form-rpa'
 9 | 
10 | myheaders = {
11 |     'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.164 Safari/537.36'}
12 | 
13 | def getHTML(url,file_name=''):
14 |     html_bytes = requests.get(url, headers=myheaders).content
15 |     html = html_bytes.decode()
16 |     if file_name != '':
17 |         with open(file_name, 'w', encoding='utf-8') as f:
18 |             f.write(html)
19 |     return html
20 | 
21 | 
22 | 
23 | def parseHTML(html):
24 |     # parse html source code here
25 |     root = etree.HTML(html)
26 |     inputs = root.xpath('//input')
27 | 
28 |     for input in inputs:
29 |         attr_name = input.attrib['name'] if 'name' in input.attrib else ''
30 |         input_value = input.attrib['value'] if 'value' in input.attrib else ''
31 |         if attr_name == 'username':
32 |             print(f'用户名:{input_value}')
33 | 
34 |         if attr_name == 'password':
35 |             print(f'密码:{input_value}')
36 | 
37 |         if attr_name == 'email':
38 |             print(f'邮箱:{input_value}')
39 | 
40 |         if attr_name == 'website':
41 |             print(f'网站:{input_value}')
42 | 
43 |         if attr_name == 'date':
44 |             print(f'生日:{input_value}')
45 | 
46 |         if attr_name == 'time':
47 |             print(f'时间:{input_value}')
48 | 
49 |         if attr_name == 'number':
50 |             print(f'数量:{input_value}')
51 | 
52 |         if attr_name == 'range':
53 |             print(f'滑块:{input_value}')
54 | 
55 |         if attr_name == 'color':
56 |             print(f'颜色:{input_value}')
57 | 
58 |         if attr_name == 'search':
59 |             print(f'搜索:{input_value}')
60 | 
61 |         if attr_name == 'gender':
62 |             temp = input.attrib['checked'] if 'checked' in input.attrib else ''
63 |             if temp != '':
64 |                 print(f'性别:{input_value}')
65 | 
66 |         if attr_name == 'interest':
67 |             temp = input.attrib['checked'] if 'checked' in input.attrib else ''
68 |             if temp != '':
69 |                 print(f'开发语言:{input_value}')
70 | 
71 |     options = root.xpath('//select[@name="country"]/option')
72 |     for option in options:
73 |         attr_name = option.attrib['selected'] if 'selected' in option.attrib else ''
74 |         option_value = option.attrib['value'] if 'value' in option.attrib else ''
75 |         if attr_name != '':
76 |             print(f'人物代表:{option_value}')
77 | 
78 |     lis = root.xpath('//ul[@class="items"]/li/a')
79 |     for li in lis:
80 |         attr_name = li.attrib['class'] if 'class' in li.attrib else ''
81 |         li_value = li.text
82 |         if 'active' in attr_name:
83 |             print(f'代表人物出处：{li_value}')
84 | 
85 | 
86 | if __name__ == '__main__':
87 |     html = getHTML(base_url, './data/n06/n06.html')
88 |     parseHTML(html)
89 | 


--------------------------------------------------------------------------------
/n07.py:
--------------------------------------------------------------------------------
 1 | # coding=utf-8
 2 | 
 3 | import requests
 4 | from lxml import etree
 5 | 
 6 | base_url = 'https://spiderbuf.cn/web-scraping-practice/random-css-classname'
 7 | 
 8 | my_headers = {
 9 |     'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.164 Safari/537.36'}
10 | 
11 | # 取页数
12 | html_bytes = requests.get(base_url, headers=my_headers).content
13 | html = html_bytes.decode()
14 | root = etree.HTML(html)
15 | with open('./data/n07/n07.html', 'w', encoding='utf-8') as f:
16 |     f.write(html)
17 | # print(html)
18 | divs = root.xpath('/html/body/main/div[2]/div')
19 | with open('./data/n07/n07.txt','w',encoding='utf-8') as f:
20 |     for div in divs:
21 |         print(div.text)
22 |         if div.text:
23 |             f.write(f'{div.text}\n')


--------------------------------------------------------------------------------
/python_begin.py:
--------------------------------------------------------------------------------
 1 | # coding=utf-8
 2 | 
 3 | import requests
 4 | 
 5 | print('中文')
 6 | 
 7 | a = 2
 8 | b = '张三'
 9 | c = 'ddddddd'
10 | print(a,b,c)
11 | 
12 | d = 2
13 | print(a - d)
14 | 
15 | if a == 1:
16 |     print('等于1')
17 | elif a == 2:
18 |     print('等于2')
19 | else:
20 |     print('不等于')
21 | 
22 | # for i in range(0, 10):
23 | #     print(i)
24 | 
25 | while  a < 10:
26 |     a += 1
27 |     print(a)
28 | 
29 | print("中文")
30 | 
31 | lst = ['张三', '李四', '王五']
32 | 
33 | dict = {'张三':'a2', '李四':'b3'}
34 | 
35 | print(dict['张三'])
36 | 
37 | for item in dict.keys():
38 |     print(dict[item])
39 | 
40 | # f = open('abc.txt', 'w', encoding='utf-8')
41 | # f.write('这是写入文件的内容')
42 | # f.close()
43 | f = open('abc.txt', 'r', encoding='utf-8')
44 | s = f.read()
45 | f.close()
46 | print(s)
47 | 
48 | 


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
 1 | apturl==0.5.2
 2 | attrs==23.1.0
 3 | blinker==1.4
 4 | Brlapi==0.7.0
 5 | certifi==2019.11.28
 6 | chardet==3.0.4
 7 | Click==7.0
 8 | colorama==0.4.3
 9 | command-not-found==0.3
10 | cryptography==2.8
11 | cupshelpers==1.0
12 | dbus-python==1.2.16
13 | defer==1.0.6
14 | distro==1.4.0
15 | distro-info===0.23ubuntu1
16 | entrypoints==0.3
17 | exceptiongroup==1.1.3
18 | h11==0.14.0
19 | httplib2==0.14.0
20 | idna==2.8
21 | keyring==18.0.1
22 | language-selector==0.1
23 | launchpadlib==1.10.13
24 | lazr.restfulclient==0.14.2
25 | lazr.uri==1.0.3
26 | louis==3.12.0
27 | lxml==4.6.3
28 | macaroonbakery==1.3.1
29 | netifaces==0.10.4
30 | oauthlib==3.1.0
31 | olefile==0.46
32 | outcome==1.3.0.post0
33 | pexpect==4.6.0
34 | Pillow==7.0.0
35 | protobuf==3.6.1
36 | pycairo==1.16.2
37 | pycups==1.9.73
38 | PyGObject==3.36.0
39 | PyJWT==1.7.1
40 | pymacaroons==0.13.0
41 | PyNaCl==1.3.0
42 | pyRFC3339==1.1
43 | PySocks==1.7.1
44 | python-apt==2.0.1+ubuntu0.20.4.1
45 | python-dateutil==2.7.3
46 | python-debian==0.1.36+ubuntu1.1
47 | pytz==2019.3
48 | pyxdg==0.26
49 | PyYAML==5.3.1
50 | reportlab==3.5.34
51 | requests==2.22.0
52 | requests-unixsocket==0.2.0
53 | SecretStorage==2.3.1
54 | selenium==4.15.2
55 | simplejson==3.16.0
56 | six==1.14.0
57 | sniffio==1.3.0
58 | sortedcontainers==2.4.0
59 | systemd-python==234
60 | trio==0.23.1
61 | trio-websocket==0.11.1
62 | ubuntu-advantage-tools==8001
63 | ubuntu-drivers-common==0.0.0
64 | ufw==0.36
65 | unattended-upgrades==0.1
66 | urllib3==1.25.8
67 | wadllib==1.3.3
68 | wsproto==1.2.0
69 | xkit==0.0.0
70 | 


--------------------------------------------------------------------------------
/s01.py:
--------------------------------------------------------------------------------
 1 | # coding=utf-8
 2 | 
 3 | import requests
 4 | from lxml import etree
 5 | 
 6 | url = 'https://spiderbuf.cn/web-scraping-practice/requests-lxml-for-scraping-beginner'
 7 | 
 8 | html = requests.get(url).text
 9 | 
10 | f = open('01.html', 'w', encoding='utf-8')
11 | f.write(html)
12 | f.close()
13 | 
14 | root = etree.HTML(html)
15 | trs = root.xpath('//tr')
16 | 
17 | f = open('data01.txt', 'w', encoding='utf-8')
18 | for tr in trs:
19 |     tds = tr.xpath('./td')
20 |     s = ''
21 |     for td in tds:
22 |         # print(td.text)
23 |         s = s + str(td.text) + '|'
24 |     print(s)
25 |     if s != '':
26 |         f.write(s + '\n')
27 | 
28 | f.close()
29 | 
30 | # print(html)


--------------------------------------------------------------------------------
/s02.py:
--------------------------------------------------------------------------------
 1 | # coding=utf-8
 2 | 
 3 | import requests
 4 | from lxml import etree
 5 | 
 6 | url = 'https://spiderbuf.cn/web-scraping-practice/scraper-http-header'
 7 | 
 8 | myheaders = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.164 Safari/537.36'}
 9 | 
10 | html = requests.get(url, headers=myheaders).text
11 | print(html)
12 | 
13 | f = open('02.html', 'w', encoding='utf-8')
14 | f.write(html)
15 | f.close()
16 | 
17 | root = etree.HTML(html)
18 | trs = root.xpath('//tr')
19 | 
20 | f = open('data02.txt', 'w', encoding='utf-8')
21 | for tr in trs:
22 |     tds = tr.xpath('./td')
23 |     s = ''
24 |     for td in tds:
25 |         # print(td.text)
26 |         s = s + str(td.text) + '|'
27 |     print(s)
28 |     if s != '':
29 |         f.write(s + '\n')
30 | 
31 | f.close()
32 | 
33 | 


--------------------------------------------------------------------------------
/s03.py:
--------------------------------------------------------------------------------
 1 | # coding=utf-8
 2 | 
 3 | import requests
 4 | from lxml import etree
 5 | 
 6 | url = 'https://spiderbuf.cn/web-scraping-practice/lxml-xpath-advanced'
 7 | 
 8 | myheaders = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.164 Safari/537.36'}
 9 | 
10 | html = requests.get(url, headers=myheaders).text
11 | print(html)
12 | 
13 | f = open('03.html', 'w', encoding='utf-8')
14 | f.write(html)
15 | f.close()
16 | 
17 | root = etree.HTML(html)
18 | trs = root.xpath('//tr')
19 | 
20 | f = open('data03.txt', 'w', encoding='utf-8')
21 | for tr in trs:
22 |     tds = tr.xpath('./td')
23 |     s = ''
24 |     for td in tds:
25 |         s = s + str(td.xpath('string(.)')) + '|'
26 |         # s = s + str(td.text) + '|'
27 |     print(s)
28 |     if s != '':
29 |         f.write(s + '\n')
30 | 
31 | f.close()
32 | 
33 | 


--------------------------------------------------------------------------------
/s04.py:
--------------------------------------------------------------------------------
 1 | # coding=utf-8
 2 | 
 3 | import requests
 4 | from lxml import etree
 5 | import re
 6 | 
 7 | base_url = 'https://spiderbuf.cn/web-scraping-practice/web-pagination-scraper?pageno=%d'
 8 | 
 9 | myheaders = {
10 |     'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.164 Safari/537.36'}
11 | 
12 | # 取页数
13 | html = requests.get(base_url % 1, headers=myheaders).text
14 | root = etree.HTML(html)
15 | 
16 | lis = root.xpath('//ul[@class="pagination"]/li')
17 | page_text = lis[0].xpath('string(.)')
18 | ls = re.findall('[0-9]', page_text)
19 | 
20 | max_no = int(ls[0])
21 | # exit()
22 | 
23 | for i in range(1, max_no + 1):
24 |     print(i)
25 |     url = base_url % i
26 |     print(url)
27 |     html = requests.get(url, headers=myheaders).text
28 |     print(html)
29 | 
30 |     f = open('04_%d.html' % i, 'w', encoding='utf-8')
31 |     f.write(html)
32 |     f.close()
33 | 
34 |     root = etree.HTML(html)
35 |     trs = root.xpath('//tr')
36 | 
37 |     f = open('data04_%d.txt' % i, 'w', encoding='utf-8')
38 |     for tr in trs:
39 |         tds = tr.xpath('./td')
40 |         s = ''
41 |         for td in tds:
42 |             s = s + str(td.xpath('string(.)')) + '|'
43 |             # s = s + str(td.text) + '|'
44 |         print(s)
45 |         if s != '':
46 |             f.write(s + '\n')
47 | 
48 |     f.close()
49 | 


--------------------------------------------------------------------------------
/s05.py:
--------------------------------------------------------------------------------
 1 | # coding=utf-8
 2 | 
 3 | import requests
 4 | from lxml import etree
 5 | 
 6 | url = 'https://spiderbuf.cn/web-scraping-practice/scraping-images-from-web'
 7 | 
 8 | myheaders = {
 9 |     'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.164 Safari/537.36'}
10 | 
11 | 
12 | html = requests.get(url, headers=myheaders).text
13 | print(html)
14 | 
15 | f = open('05.html', 'w', encoding='utf-8')
16 | f.write(html)
17 | f.close()
18 | 
19 | root = etree.HTML(html)
20 | imgs = root.xpath('//img/@src')
21 | print(imgs)
22 | for item in imgs:
23 |     img_data = requests.get('https://spiderbuf.cn' + item, headers=myheaders).content
24 |     img = open(str(item).replace('/',''), 'wb')
25 |     img.write(img_data)
26 |     img.close()
27 | #
28 | # f = open('data05.txt', 'w', encoding='utf-8')
29 | # for tr in trs:
30 | #     tds = tr.xpath('./td')
31 | #     s = ''
32 | #     for td in tds:
33 | #         s = s + str(td.xpath('string(.)')) + '|'
34 | #         # s = s + str(td.text) + '|'
35 | #     print(s)
36 | #     if s != '':
37 | #         f.write(s + '\n')
38 | #
39 | # f.close()
40 | 


--------------------------------------------------------------------------------
/s06.py:
--------------------------------------------------------------------------------
 1 | # coding=utf-8
 2 | 
 3 | import requests
 4 | from lxml import etree
 5 | 
 6 | url = 'https://spiderbuf.cn/web-scraping-practice/inner'
 7 | 
 8 | myheaders = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.164 Safari/537.36'}
 9 | 
10 | html = requests.get(url, headers=myheaders).text
11 | print(html)
12 | 
13 | f = open('06.html', 'w', encoding='utf-8')
14 | f.write(html)
15 | f.close()
16 | 
17 | root = etree.HTML(html)
18 | trs = root.xpath('//tr')
19 | 
20 | f = open('data06.txt', 'w', encoding='utf-8')
21 | for tr in trs:
22 |     tds = tr.xpath('./td')
23 |     s = ''
24 |     for td in tds:
25 |         s = s + str(td.xpath('string(.)')) + '|'
26 |         # s = s + str(td.text) + '|'
27 |     print(s)
28 |     if s != '':
29 |         f.write(s + '\n')
30 | 
31 | f.close()
32 | 
33 | 


--------------------------------------------------------------------------------
/s07.py:
--------------------------------------------------------------------------------
 1 | # coding=utf-8
 2 | 
 3 | import requests
 4 | import json
 5 | 
 6 | url = 'https://spiderbuf.cn/web-scraping-practice/iplist?order=asc'
 7 | 
 8 | myheaders = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.164 Safari/537.36'}
 9 | 
10 | data_json = requests.get(url, headers=myheaders).text
11 | print(data_json)
12 | 
13 | f = open('./data/7/07.html', 'w', encoding='utf-8')
14 | f.write(data_json)
15 | f.close()
16 | 
17 | ls = json.loads(data_json)
18 | print(ls)
19 | 
20 | f = open('./data/7/data07.txt', 'w', encoding='utf-8')
21 | for item in ls:
22 |     # print(item)
23 |     s = '%s|%s|%s|%s|%s|%s|%s\n' % (item['ip'], item['mac'],item['manufacturer'], item['name'],item['ports'], item['status'], item['type'])
24 |     f.write(s)
25 | f.close()


--------------------------------------------------------------------------------
/s08.py:
--------------------------------------------------------------------------------
 1 | # coding=utf-8
 2 | 
 3 | import requests
 4 | from lxml import etree
 5 | 
 6 | url = 'https://spiderbuf.cn/web-scraping-practice/scraper-via-http-post'
 7 | 
 8 | myheaders = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.164 Safari/537.36'}
 9 | 
10 | payload = {'level':'8'}
11 | html = requests.post(url, headers=myheaders, data=payload).text
12 | print(html)
13 | 
14 | f = open('./data/8/08.html', 'w', encoding='utf-8')
15 | f.write(html)
16 | f.close()
17 | 
18 | root = etree.HTML(html)
19 | trs = root.xpath('//tr')
20 | 
21 | f = open('./data/8/data08.txt', 'w', encoding='utf-8')
22 | for tr in trs:
23 |     tds = tr.xpath('./td')
24 |     s = ''
25 |     for td in tds:
26 |         # print(td.text)
27 |         s = s + str(td.text) + '|'
28 |     print(s)
29 |     if s != '':
30 |         f.write(s + '\n')
31 | 
32 | f.close()
33 | 
34 | 


--------------------------------------------------------------------------------
/s4-1.py:
--------------------------------------------------------------------------------
 1 | # coding=utf-8
 2 | 
 3 | import requests
 4 | from lxml import etree
 5 | import re
 6 | 
 7 | url = 'https://spiderbuf.cn/web-scraping-practice/web-pagination-scraper?pageno=2&pagesize=50'
 8 | 
 9 | myheaders = {
10 |     'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.164 Safari/537.36'}
11 | 
12 | html = requests.get(url, headers=myheaders).text
13 | print(html)
14 | 
15 | f = open('./data/4-1/04-1.html', 'w', encoding='utf-8')
16 | f.write(html)
17 | f.close()
18 | 
19 | root = etree.HTML(html)
20 | trs = root.xpath('//tr')
21 | 
22 | f = open('./data/4-1/data04-1.txt', 'w', encoding='utf-8')
23 | for tr in trs:
24 |     tds = tr.xpath('./td')
25 |     s = ''
26 |     for td in tds:
27 |         s = s + str(td.xpath('string(.)')) + '|'
28 |         # s = s + str(td.text) + '|'
29 |     print(s)
30 |     if s != '':
31 |         f.write(s + '\n')
32 | 
33 | f.close()
34 | 


--------------------------------------------------------------------------------