├── LICENSE
├── README.md
├── blob
│   ├── crawler.py
│   ├── main.py
│   └── templates
│       └── index.html
├── c01.py
├── c02.py
├── c03.py
├── c04.py
├── e01.py
├── e02.py
├── e03.py
├── e04.py
├── h01.py
├── h02.py
├── h03.py
├── h04.py
├── h05.py
├── h06.py
├── n01.py
├── n02.py
├── n03.py
├── n04.py
├── n05.py
├── n06.py
├── n07.py
├── python_begin.py
├── requirements.txt
├── s01.py
├── s02.py
├── s03.py
├── s04.py
├── s05.py
├── s06.py
├── s07.py
├── s08.py
└── s4-1.py
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2023 hhuayuan
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Spiderbuf Web Scraping Practice Site
2 | Sample code for the Spiderbuf hands-on web scraping exercises
3 |
4 | ## Official Website
5 | https://spiderbuf.cn
6 |
7 | [Web scraping practice list](https://spiderbuf.cn/web-scraping-practices)
8 |
9 | A website dedicated to Python web scraping practice.
10 |
11 | It offers a rich set of scraping tutorials, worked case studies, and practice exercises.
12 |
13 | Intensive Python scraper development practice: sharpen your skills in the back-and-forth between scraping and anti-scraping, and master the common scraping and anti-scraping patterns through extensive hands-on work.
14 |
15 | Guided scraping cases plus free video tutorials: take on each scraping task as a level-style challenge, build intuition and experience in scraper development, and put your scraping and anti-scraping skills to the test.
16 |
17 | ## Runtime Environment
18 | Ubuntu 20.04.6 LTS
19 | macOS 15+
20 |
21 | Python 3.8+
22 |
23 | ## Changelog
24 | | No. | Name | Updated |
25 | | ---- | ---- | ---- |
26 | | [C08](https://spiderbuf.cn/web-scraping-practice/scraper-practice-c08 "JS reverse-engineering scraping practice (financial data)") | JS reverse-engineering scraping practice (financial data) | 2025-05-31 |
27 | | [C07](https://spiderbuf.cn/web-scraping-practice/scraper-practice-c07 "JavaScript reverse-engineering scraping case study") | JavaScript reverse-engineering scraping case study | 2025-05-14 |
28 | | [C06](https://spiderbuf.cn/web-scraping-practice/scraper-practice-c06 "JavaScript reverse-engineering scraping case study") | JavaScript reverse-engineering scraping case study | 2025-04-15 |
29 | | [C05](https://spiderbuf.cn/web-scraping-practice/scraper-practice-c05 "Hands-on scraping practice") | Hands-on scraping practice | 2025-02-26 |
30 | | [C04](https://spiderbuf.cn/web-scraping-practice/scraper-practice-c04 "Hands-on scraping practice") | Hands-on scraping practice | 2025-02-11 |
31 | | [C03](https://spiderbuf.cn/web-scraping-practice/scraper-practice-c03 "Hands-on scraping practice") | Hands-on scraping practice | 2025-01-15 |
32 | | [C02](https://spiderbuf.cn/web-scraping-practice/scraper-practice-c02 "Hands-on scraping practice") | Hands-on scraping practice | 2024-12-16 |
33 | | [C01](https://spiderbuf.cn/web-scraping-practice/scraper-practice-c01 "Hands-on scraping practice") | Hands-on scraping practice | 2024-11-17 |
34 | | [N07](https://spiderbuf.cn/web-scraping-practice/random-css-classname "Random CSS class names, no element IDs") | Random CSS class names, no element IDs | 2024-09-08 |
35 | | [E04](https://spiderbuf.cn/web-scraping-practice/block-ip-proxy "Scraping pages through a proxy server after an IP ban") | Scraping pages through a proxy server after an IP ban | 2024-07-23 |
36 | | [N06](https://spiderbuf.cn/web-scraping-practice/scraping-form-rpa "Web form scraping (intro to RPA)") | Web form scraping (intro to RPA) | 2024-03-26 |
37 | | [N05](https://spiderbuf.cn/web-scraping-practice/css-sprites "CSS sprites anti-scraping") | CSS sprites anti-scraping | 2024-02-18 |
38 | | [N04](https://spiderbuf.cn/web-scraping-practice/css-pseudo-elements "CSS pseudo-element anti-scraping") | CSS pseudo-element anti-scraping | 2024-01-11 |
39 | | [H06](https://spiderbuf.cn/web-scraping-practice/selenium-fingerprint-anti-scraper "A first look at browser fingerprinting: how Selenium gets detected") | A first look at browser fingerprinting: how Selenium gets detected | 2023-12-22 |
40 | | [H05](https://spiderbuf.cn/web-scraping-practice/javascript-reverse-timestamp "JS reverse engineering: breaking timestamp-based anti-scraping") | JS reverse engineering: breaking timestamp-based anti-scraping | 2023-11-26 |
41 | | [H04](https://spiderbuf.cn/web-scraping-practice/javascript-confuse-encrypt-reverse "JS encryption/obfuscation and simple anti-debugging") | JS encryption/obfuscation and simple anti-debugging | 2023-11-11 |
42 | | [H03](https://spiderbuf.cn/web-scraping-practice/scraping-scroll-load "How scroll-based loading works and how to scrape it (JavaScript obfuscation reversing basics)") | How scroll-based loading works and how to scrape it (JavaScript obfuscation reversing basics) | 2023-10-20 |
43 | | [H02](https://spiderbuf.cn/web-scraping-practice/scraping-douban-movies-xpath-advanced "Parsing a complex top-rated movie list page (Douban-style): advanced XPath") | Parsing a complex top-rated movie list page (Douban-style): advanced XPath | 2023-10-10 |
44 | | [N03](https://spiderbuf.cn/web-scraping-practice/scraper-bypass-request-limit "Rate limiting: at least one second between requests") | Rate limiting: at least one second between requests | 2023-07-02 |
45 | | [N02](https://spiderbuf.cn/web-scraping-practice/scraping-images-base64 "Scraping Base64-encoded images and decoding them") | Scraping Base64-encoded images and decoding them | 2023-06-30 |
46 | | [H01](https://spiderbuf.cn/web-scraping-practice/scraping-css-confuse-offset "Parsing and scraping text obfuscated with CSS offsets") | Parsing and scraping text obfuscated with CSS offsets | 2023-06-25 |
47 | | [N01](https://spiderbuf.cn/web-scraping-practice/user-agent-referrer "User-Agent and Referer validation anti-scraping") | User-Agent and Referer validation anti-scraping | 2022-11-05 |
48 | | [E03](https://spiderbuf.cn/web-scraping-practice/scraping-random-pagination "Pagination without sequential page numbers") | Pagination without sequential page numbers | 2022-11-01 |
49 | | [E02](https://spiderbuf.cn/web-scraping-practice/web-scraping-with-captcha "Scraping behind a captcha-protected login") | Scraping behind a captcha-protected login | 2022-09-17 |
50 | | [E01](https://spiderbuf.cn/web-scraping-practice/scraper-login-username-password "Logging in with a username and password to scrape back-end data") | Logging in with a username and password to scrape back-end data | 2022-08-21 |
51 | | [S08](https://spiderbuf.cn/web-scraping-practice/scraper-via-http-post "Scraping data via HTTP POST requests") | Scraping data via HTTP POST requests | 2021-06-21 |
52 | | [S07](https://spiderbuf.cn/web-scraping-practice/scraping-ajax-api "Scraping data loaded dynamically via AJAX") | Scraping data loaded dynamically via AJAX | 2021-06-21 |
53 | | [S06](https://spiderbuf.cn/web-scraping-practice/scraping-iframe "Analyzing and scraping pages that contain iframes") | Analyzing and scraping pages that contain iframes | 2021-06-21 |
54 | | [S05](https://spiderbuf.cn/web-scraping-practice/scraping-images-from-web "Scraping web images and saving them locally") | Scraping web images and saving them locally | 2021-06-21 |
55 | | [S04](https://spiderbuf.cn/web-scraping-practice/web-pagination-scraper "Analyzing pagination parameters and scraping across pages") | Analyzing pagination parameters and scraping across pages | 2021-06-21 |
56 | | [S03](https://spiderbuf.cn/web-scraping-practice/lxml-xpath-advanced "Advanced lxml syntax and parsing practice") | Advanced lxml syntax and parsing practice | 2021-06-21 |
57 | | [S02](https://spiderbuf.cn/web-scraping-practice/scraper-http-header "HTTP request analysis and header construction") | HTTP request analysis and header construction | 2021-06-21 |
58 | | [S01](https://spiderbuf.cn/web-scraping-practice/requests-lxml-for-scraping-beginner "Getting started with the requests and lxml libraries") | Getting started with the requests and lxml libraries | 2021-06-21 |
59 |
60 | # Courses
61 | [Deep Dive into Python Scraping, Offense and Defense (深入了解Python爬虫攻防)](https://www.udemy.com/course/python-spiderbuf/?referralCode=77D640F3DB5A310151DB "深入了解Python爬虫攻防")
62 |
63 | [Axure RP 9 from Beginner to Expert: Building High-Fidelity Interactive Prototypes (Axure RP 9 从入门到精通)](https://www.udemy.com/course/axure-rp-9/?referralCode=3374A9C2D8B735FC54A1 "Axure RP 9 从入门到精通:打造高保真交互原型")
64 |
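65 | # Quick Start (example)
66 | A minimal sketch of the fetch-and-parse pattern used throughout these exercises; it mirrors s01.py, and the URL and XPath below are taken from that script:
67 |
68 | ```python
69 | import requests
70 | from lxml import etree
71 |
72 | url = 'https://spiderbuf.cn/web-scraping-practice/requests-lxml-for-scraping-beginner'
73 | html = requests.get(url).text              # fetch the page HTML
74 | root = etree.HTML(html)                    # build an lxml element tree
75 | for tr in root.xpath('//tr'):              # iterate over the table rows
76 |     print('|'.join(td.text or '' for td in tr.xpath('./td')))
77 | ```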
--------------------------------------------------------------------------------
/blob/crawler.py:
--------------------------------------------------------------------------------
1 | import requests
2 |
3 | def download_video(video_url, output_path):
4 | try:
5 |         # Send a GET request
6 | response = requests.get(video_url, stream=True)
7 |
8 |         # If the request succeeded
9 | if response.status_code == 200:
10 |             # Open the output file and write the downloaded data
11 | with open(output_path, 'wb') as f:
12 | for chunk in response.iter_content(chunk_size=1024):
13 |                     if chunk:  # skip keep-alive chunks that contain no data
14 | f.write(chunk)
15 | print(f"Video downloaded successfully: {output_path}")
16 | else:
17 | print(f"Failed to retrieve video, HTTP status code: {response.status_code}")
18 |
19 | except requests.exceptions.RequestException as e:
20 | print(f"An error occurred: {e}")
21 |
22 | if __name__ == "__main__":
23 |     # URL of the video file
24 |     video_url = 'http://localhost:5000/video'  # replace with the target video URL
25 |
26 |     # Local path to save the download to
27 | output_path = 'downloaded_video.mp4'
28 |
29 |     # Download the video
30 | download_video(video_url, output_path)
31 |
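32 | # --- Illustrative sketch (assumption: the server honours HTTP Range requests; ---
33 | # --- this helper is not part of the exercise above). ---------------------------
34 | # Resume a partial download by requesting only the missing byte range.
35 | def resume_download(video_url, output_path):
36 |     import os
37 |     start = os.path.getsize(output_path) if os.path.exists(output_path) else 0
38 |     with requests.get(video_url, headers={'Range': 'bytes=%d-' % start}, stream=True) as response:
39 |         if response.status_code == 206:    # 206 Partial Content: append the missing bytes
40 |             mode = 'ab'
41 |         elif response.status_code == 200:  # Range ignored by the server: rewrite the whole file
42 |             mode = 'wb'
43 |         else:
44 |             print('Unexpected HTTP status: %d' % response.status_code)
45 |             return
46 |         with open(output_path, mode) as f:
47 |             for chunk in response.iter_content(chunk_size=1024):
48 |                 if chunk:
49 |                     f.write(chunk)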
--------------------------------------------------------------------------------
/blob/main.py:
--------------------------------------------------------------------------------
1 | from flask import Flask, Response, send_file, render_template
2 | import os
3 |
4 | app = Flask(__name__)
5 |
6 | @app.route('/')
7 | def index():
8 | return render_template('index.html')
9 |
10 | @app.route('/video')
11 | def stream_video():
12 |     # Provide your own .mp4 file and place it in the current directory (the same directory as main.py)
13 |     video_path = os.path.join(os.path.dirname(os.path.abspath(__file__)),'spiderbuf.mp4')  # replace with the path to your video file
14 | print(video_path)
15 | if not os.path.exists(video_path):
16 | return "Video not found", 404
17 |
18 |     # Open the video file and stream it back as binary chunks
19 | def generate_video():
20 | with open(video_path, 'rb') as f:
21 |             while chunk := f.read(1024 * 1024):  # read 1 MB at a time
22 | yield chunk
23 |
24 | return Response(generate_video(), content_type='video/mp4')
25 |
26 |
27 | if __name__ == '__main__':
28 | app.run(debug=True)
29 |
--------------------------------------------------------------------------------
/blob/templates/index.html:
--------------------------------------------------------------------------------
1 | <!-- Template markup not preserved in this dump. Recoverable details: the page is -->
2 | <!-- titled "视频流爬虫 - Spiderbuf" (Video Stream Scraper - Spiderbuf) and embeds a -->
3 | <!-- <video> element whose source is a Blob URL, e.g. src="blob:https://spiderbuf.cn....." -->
--------------------------------------------------------------------------------
/c01.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 |
3 | import requests
4 | from lxml import etree
5 | import numpy as np
6 |
7 | base_url = 'https://spiderbuf.cn/web-scraping-practice/scraper-practice-c01/mnist'
8 |
9 | my_headers = {
10 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.164 Safari/537.36',
11 | 'Referer': 'https://spiderbuf.cn/web-scraping-practice/c01',
12 | 'Cookie': '__cgf3t=G0gzgFKDRlLtmZH7NrzqOb1x4pek1xNQk12KKc4g21Y-1731624199;'}
13 |
14 |
15 | html_bytes = requests.get(base_url, headers=my_headers).content
16 | html = html_bytes.decode()
17 | root = etree.HTML(html)
18 | with open('./data/c01/c01.html', 'w', encoding='utf-8') as f:
19 | f.write(html)
20 | # print(html)
21 |
22 | trs = root.xpath('//tbody/tr')
23 |
24 |
25 | pix1_arry = []
26 | for tr in trs:
27 | tds = tr.xpath('td')
28 |     # Append the pix1 column value (second cell) to the array
29 | pix1_arry.append([int(tds[1].text) if len(tds) > 1 else 0])
30 | # Compute the mean of the pix1 column, rounded to two decimal places
31 | print(round(np.mean(pix1_arry),2))
--------------------------------------------------------------------------------
/c02.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 |
3 | import requests
4 | from lxml import etree
5 | from selenium import webdriver
6 | from selenium.webdriver import ChromeOptions, ActionChains
7 | from selenium.webdriver.common.by import By
8 | import time
9 | import base64
10 | import json
11 | import numpy as np
12 |
13 |
14 | base_url = 'https://spiderbuf.cn/web-scraping-practice/scraper-practice-c02'
15 |
16 | myheaders = {
17 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.164 Safari/537.36'}
18 |
19 | def getHTML(url,file_name=''):
20 | client = webdriver.Chrome()
21 | client.get(url)
22 | time.sleep(10)
23 |
24 |     # ActionChains object for composing the mouse actions
25 | actionChains = ActionChains(client)
26 |
27 |     # Locate the slider element
28 | slide_btn = client.find_element(By.ID, 'slider')
29 |     # The drag distance and position come from inspecting the site's slider
30 | actionChains.click_and_hold(slide_btn)
31 | actionChains.move_by_offset(220,0)
32 |     # Note:
33 |     # the next three offsets are measured from the position reached above (220, 0),
34 |     # so the total distance moved is 220 plus the offsets below
35 | actionChains.move_by_offset(11,0)
36 | actionChains.move_by_offset(13,0)
37 | actionChains.move_by_offset(10,0)
38 |
39 | actionChains.release()
40 | actionChains.perform()
41 |
42 | html = client.page_source
43 | print(html)
44 | client.quit()
45 |
46 | if file_name != '':
47 | with open(file_name, 'w', encoding='utf-8') as f:
48 | f.write(html)
49 | return html
50 |
51 |
52 | def parseHTML(html):
53 | root = etree.HTML(html)
54 | trs = root.xpath('//tr')
55 |
56 | prices = []
57 | for tr in trs:
58 | tds = tr.xpath('./td')
59 | if len(tds) > 2:
60 | prices.append(int(tds[2].text))
61 | print(prices)
62 | print(np.mean(prices))
63 |
64 |
65 | if __name__ == '__main__':
66 | # example: 1
67 | html = getHTML(base_url, './data/c02/c02.html')
68 | parseHTML(html)
69 |
70 | # example: 2
71 | # html = requests.get(base_url, headers=myheaders).text
72 | # a = html.index('encryptedData = "') + 17
73 | # html = html[a:]
74 | # b = html.index('";')
75 | # html = html[:b]
76 | # print(html)
77 | # dic = eval(base64.b64decode(html.encode('utf-8')))
78 | # objs = dic['flights']
79 | # prices = []
80 | # for obj in objs:
81 | # print(obj)
82 | # prices.append(obj['price'])
83 |
84 | # print(prices)
85 | # print(np.mean(prices))
86 |
87 |
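88 | # --- Illustrative sketch (not the exercise solution above): a more human-looking drag ---
89 | # Splits the distance into small randomized steps with slight vertical jitter.
90 | # The default total of 254 px is an assumption matching 220 + 11 + 13 + 10 above.
91 | def humanized_drag(client, slider, total=254):
92 |     import random
93 |     chain = ActionChains(client)
94 |     chain.click_and_hold(slider)
95 |     moved = 0
96 |     while moved < total:
97 |         step = min(random.randint(5, 20), total - moved)    # random horizontal step
98 |         chain.move_by_offset(step, random.randint(-2, 2))   # small vertical jitter
99 |         moved += step
100 |     chain.release()
101 |     chain.perform()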
--------------------------------------------------------------------------------
/c03.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 |
3 | import requests
4 | from lxml import etree
5 | from selenium import webdriver
6 | from selenium.webdriver.common.by import By
7 | import time
8 | import json
9 | import hashlib
10 | import random
11 | import numpy as np
12 |
13 |
14 | base_url = 'https://spiderbuf.cn/web-scraping-practice/scraper-practice-c03'
15 |
16 | myheaders = {
17 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.164 Safari/537.36'}
18 |
19 | def getHTML(url,file_name=''):
20 | sepal_width_arr = []
21 | client = webdriver.Chrome()
22 | client.get(url)
23 | time.sleep(5)
24 | html = client.page_source
25 | # print(html)
26 | parseHTML(html,sepal_width_arr)
27 | if file_name != '':
28 | with open(file_name + '_1.html', 'w', encoding='utf-8') as f:
29 | f.write(html)
30 | for i in range(1,5):
31 | client.find_elements(By.XPATH, '//ul/li/a')[i].click()
32 | time.sleep(5)
33 | html = client.page_source
34 | # print(html)
35 | parseHTML(html,sepal_width_arr)
36 | if file_name != '':
37 | with open(file_name + f'_{i+1}.html', 'w', encoding='utf-8') as f:
38 | f.write(html)
39 |
40 | client.quit()
41 | print(sepal_width_arr)
42 | print(np.sum(sepal_width_arr))
43 | return html
44 |
45 |
46 | def parseHTML(html,sepal_width_arr):
47 | root = etree.HTML(html)
48 | trs = root.xpath('//tr')
49 | for tr in trs:
50 | tds = tr.xpath('./td')
51 | if len(tds) > 2:
52 | sepal_width_arr.append(float(tds[2].text))
53 |
54 |
55 |
56 |
57 | if __name__ == '__main__':
58 | # example: 1
59 | # html = getHTML(base_url, './data/c03/c03')
60 |
61 | # example: 2
62 | sepal_width_arr = []
63 | for i in range(1, 6):
64 | random_value = random.randint(2000, 10000)
65 | timestamp = int(time.time())
66 | xorResult = i ^ timestamp
67 | md5_hash = hashlib.md5()
68 | md5_hash.update(f'{xorResult}{timestamp}'.encode('utf-8'))
69 | hash = md5_hash.hexdigest()
70 | payload = {
71 | 'random': random_value,
72 | 'timestamp': timestamp,
73 | 'hash': hash,
74 | 'xorResult': xorResult
75 | }
76 | # print(payload)
77 | json_response = requests.post(base_url, headers=myheaders,json=payload).text
78 |
79 | print(json_response)
80 | json_data = json.loads(json_response)
81 | for item in json_data:
82 | # print(item)
83 | sepal_width_arr.append(item['sepal_width'])
84 |
85 | print(sepal_width_arr)
86 | print(np.sum(sepal_width_arr))
87 |
88 |
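89 | # --- Illustrative helper: the request-signing logic from example 2, factored out ---
90 | # (Same fields and hashing as the loop above; nothing is added to the scheme.)
91 | def build_payload(page_index):
92 |     timestamp = int(time.time())
93 |     xor_result = page_index ^ timestamp                   # page index XOR current timestamp
94 |     md5_hash = hashlib.md5()
95 |     md5_hash.update(f'{xor_result}{timestamp}'.encode('utf-8'))
96 |     return {
97 |         'random': random.randint(2000, 10000),
98 |         'timestamp': timestamp,
99 |         'hash': md5_hash.hexdigest(),                     # md5 of xorResult + timestamp, as above
100 |         'xorResult': xor_result,
101 |     }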
--------------------------------------------------------------------------------
/c04.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # @Author: spiderbuf
3 | from lxml import etree
4 | from selenium import webdriver
5 | from selenium.webdriver import ChromeOptions, ActionChains
6 | from selenium.webdriver.common.by import By
7 | import time
8 | import random
9 | import numpy as np
10 | import re
11 |
12 |
13 | base_url = 'https://spiderbuf.cn/web-scraping-practice/scraper-practice-c04'
14 |
15 | myheaders = {
16 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.164 Safari/537.36'}
17 |
18 |
19 | if __name__ == '__main__':
20 | options = webdriver.ChromeOptions()
21 | options.add_argument('disable-infobars')
22 | options.set_capability('goog:loggingPrefs', {'browser': 'ALL'})
23 |
24 |     options.add_argument('--disable-blink-features=AutomationControlled')  # change the navigator.webdriver property value
25 |
26 | client = webdriver.Chrome(options=options)
27 | print('Getting page...')
28 | client.get(base_url)
29 | time.sleep(3)
30 |
31 |     # Simulate the user moving the cursor around the page
32 | actionChains = ActionChains(client)
33 | actionChains.move_by_offset(430,330)
34 | for i in range(20):
35 | step = random.randint(1, 10)
36 | actionChains.move_by_offset(step,step).perform()
37 |
38 | checkbox = client.find_element(By.ID, 'captcha')
39 | checkbox.click()
40 | print('Checkbox clicked...')
41 | time.sleep(3)
42 | html = client.page_source
43 | # print(html)
44 | client.quit()
45 |
46 | with open('./data/c04/c04.html', 'w', encoding='utf-8') as f:
47 | f.write(html)
48 |
49 | root = etree.HTML(html)
50 | items = root.xpath('//div[@class="stats"]')
51 | results = []
52 | for item in items:
53 | spans = item.xpath('.//span')
54 | s = ''.join(spans[3].xpath('string(.)'))
55 |         results.append(int(re.findall(r'\d+', spans[0].text)[0]) + int(''.join(re.findall(r'\d+', s))))
56 |
57 | print(np.average(results))
--------------------------------------------------------------------------------
/e01.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 |
3 | import requests
4 | from lxml import etree
5 |
6 | url = 'https://spiderbuf.cn/web-scraping-practice/scraper-login-username-password/login'
7 |
8 | myheaders = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.164 Safari/537.36'}
9 |
10 | payload = {'username':'admin','password':'123456'}
11 |
12 | html = requests.post(url, headers=myheaders, data=payload).text
13 | print(html)
14 |
15 | f = open('./data/e01/e01.html', 'w', encoding='utf-8')
16 | f.write(html)
17 | f.close()
18 |
19 | root = etree.HTML(html)
20 | trs = root.xpath('//tr')
21 |
22 | f = open('./data/e01/data_e01.txt', 'w', encoding='utf-8')
23 | for tr in trs:
24 | tds = tr.xpath('./td')
25 | s = ''
26 | for td in tds:
27 | # print(td.text)
28 | s = s + str(td.text) + '|'
29 | print(s)
30 | if s != '':
31 | f.write(s + '\n')
32 |
33 | f.close()
34 |
35 |
--------------------------------------------------------------------------------
/e02.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 |
3 | import requests
4 | from lxml import etree
5 |
6 | url = 'https://spiderbuf.cn/web-scraping-practice/web-scraping-with-captcha/list'
7 |
8 | # Note: replace the Cookie value with your own
9 | myheaders = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.164 Safari/537.36',
10 | 'Cookie':'admin=a66abb5684c45962d887564f08346e8d'}
11 |
12 | payload = {'username':'admin','password':'123456'}
13 |
14 | html = requests.get(url, headers=myheaders, data=payload).text
15 | print(html)
16 | # exit();
17 | f = open('./data/e02/e02.html', 'w', encoding='utf-8')
18 | f.write(html)
19 | f.close()
20 |
21 | root = etree.HTML(html)
22 | trs = root.xpath('//tr')
23 |
24 | f = open('./data/e02/data_e02.txt', 'w', encoding='utf-8')
25 | for tr in trs:
26 | tds = tr.xpath('./td')
27 | s = ''
28 | for td in tds:
29 | # print(td.text)
30 | s = s + str(td.text) + '|'
31 | print(s)
32 | if s != '':
33 | f.write(s + '\n')
34 |
35 | f.close()
36 |
37 |
--------------------------------------------------------------------------------
/e03.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 |
3 | import requests
4 | from lxml import etree
5 | import re
6 |
7 | base_url = 'https://spiderbuf.cn/web-scraping-practice/scraping-random-pagination'
8 | # https://spiderbuf.cn/e03/5f685274073b
9 |
10 | myheaders = {
11 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.164 Safari/537.36'}
12 |
13 | # Fetch the listing page and collect the pagination links
14 | html = requests.get(base_url, headers=myheaders).text
15 | root = etree.HTML(html)
16 | print(html)
17 |
18 | lis = root.xpath('//ul[@class="pagination"]/li/a/@href')
19 | print(lis)
20 |
21 | i = 1
22 | for item in lis:
23 | print(item)
24 | s = item.replace('/web-scraping-practice/scraping-random-pagination','')
25 | print(base_url + s)
26 | url = base_url + s
27 | # print(url)
28 | html = requests.get(url, headers=myheaders).text
29 | # print(html)
30 | #
31 | f = open('./data/e03/e03_%d.html' % i, 'w', encoding='utf-8')
32 | f.write(html)
33 | f.close()
34 | #
35 | root = etree.HTML(html)
36 | trs = root.xpath('//tr')
37 |
38 | f = open('./data/e03/e03_%d.txt' % i, 'w', encoding='utf-8')
39 | for tr in trs:
40 | tds = tr.xpath('./td')
41 | s = ''
42 | for td in tds:
43 | s = s + str(td.xpath('string(.)')) + '|'
44 | # s = s + str(td.text) + '|'
45 | print(s)
46 | if s != '':
47 | f.write(s + '\n')
48 |
49 | f.close()
50 | i += 1
--------------------------------------------------------------------------------
/e04.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 |
3 | import requests
4 | from lxml import etree
5 | import re
6 |
7 | base_url = 'https://spiderbuf.cn/web-scraping-practice/block-ip-proxy'
8 |
9 | myheaders = {
10 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.164 Safari/537.36'}
11 |
12 | proxies = {'http':'47.122.65.254:8080'}  # note: this mapping only proxies http:// URLs; add an 'https' key to also proxy https:// requests
13 | # Fetch the listing page and collect the pagination links
14 | html = requests.get(base_url, headers=myheaders,proxies=proxies).text
15 | root = etree.HTML(html)
16 | # print(html)
17 | lis = root.xpath('//ul[@class="pagination"]/li/a')
18 | pages = []
19 | for item in lis:
20 | print(item.attrib['href'])
21 | if item.attrib['class'] != 'item trap':
22 | pages.append(item.attrib['href'])
23 | print(pages)
24 | i = 1
25 | for item in pages:
26 | print(item)
27 | s = item.replace('/web-scraping-practice/block-ip-proxy','')
28 | print(base_url + s)
29 | url = base_url + s
30 | # print(url)
31 | html = requests.get(url, headers=myheaders).text
32 | # print(html)
33 | #
34 | f = open('./data/e04/e04_%d.html' % i, 'w', encoding='utf-8')
35 | f.write(html)
36 | f.close()
37 | #
38 | root = etree.HTML(html)
39 | trs = root.xpath('//tr')
40 |
41 | f = open('./data/e04/e04_%d.txt' % i, 'w', encoding='utf-8')
42 | for tr in trs:
43 | tds = tr.xpath('./td')
44 | s = ''
45 | for td in tds:
46 | s = s + str(td.xpath('string(.)')) + '|'
47 | # s = s + str(td.text) + '|'
48 | print(s)
49 | if s != '':
50 | f.write(s + '\n')
51 |
52 | f.close()
53 | i += 1
54 |
--------------------------------------------------------------------------------
/h01.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 |
3 | import requests
4 | from lxml import etree
5 |
6 | url = 'https://spiderbuf.cn/web-scraping-practice/scraping-css-confuse-offset'
7 |
8 | myheaders = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.164 Safari/537.36',
9 | 'Referer':'https://spiderbuf.cn/list'}
10 |
11 | html = requests.get(url, headers=myheaders).text
12 | print(html)
13 |
14 | f = open('./data/h01/h01.html', 'w', encoding='utf-8')
15 | f.write(html)
16 | f.close()
17 |
18 | root = etree.HTML(html)
19 | ls = root.xpath('//div[@class ="container"]/div/div')
20 | # page_text = ls[0].xpath('string(.)')
21 | # print(page_text)
22 |
23 | f = open('./data/h01/h01.txt', 'w', encoding='utf-8')
24 | for item in ls:
25 | hnodes = item.xpath('./h2')
26 | temp = hnodes[0].xpath('string(.)')
27 |     s0 = temp[1:2] + temp[0:1] + temp[2:]  # swap the first two characters back (the page renders them in CSS-offset order)
28 | print(s0)
29 |
30 | pnodes = item.xpath('./p')
31 | s1 = pnodes[0].text
32 | print(s1)
33 | temp = pnodes[1].xpath('string(.)').replace('企业估值(亿元):','')
34 | s2 = temp[1:2] + temp[0:1] + temp[2:]
35 | print(s2)
36 | s3 = pnodes[2].text
37 | print(s3)
38 | s4 = pnodes[3].text
39 | print(s4)
40 | # 富邦金融控股排名:50企业估值(亿元):2135CEO:蔡明兴行业:金融服务
41 | s = s0 + '|' + s1.replace('排名:','') + '|' + s2.replace('企业估值(亿元):','') + '|' \
42 | + s3.replace('CEO:','') + '|' + s4.replace('行业:','') + '\n'
43 | print(s)
44 | f.write(s)
45 |
46 | f.close()
--------------------------------------------------------------------------------
/h02.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | import os.path
3 |
4 | import requests
5 | from lxml import etree
6 | import time
7 |
8 | base_url = 'https://spiderbuf.cn/web-scraping-practice/scraping-douban-movies-xpath-advanced'
9 |
10 | myheaders = {
11 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.164 Safari/537.36'}
12 |
13 | def getHTML(url,file_name=''):
14 | html = requests.get(url, headers=myheaders).text
15 | if file_name != '':
16 | with open(file_name, 'w', encoding='utf-8') as f:
17 | f.write(html)
18 | return html
19 |
20 |
21 | def downloadImage(url, path=''):
22 | img_data = requests.get(url, headers=myheaders).content
23 | # get image file name
24 | file_name = url.split('/').pop()
25 |
26 | with open(os.path.join(path, file_name), 'wb') as img:
27 | img.write(img_data)
28 |
29 |
30 | def parseHTML(html):
31 | # parse html source code here
32 | root = etree.HTML(html)
33 | divs = root.xpath('/html/body/div/div[@style="margin-top: 10px;"]')
34 | i = 1
35 | for div in divs:
36 |         # Sample of one movie entry as rendered on the page (markup omitted):
37 |         #
38 |         #   肖申克的救赎 The Shawshank Redemption      <- title (in an <h2>)
39 |         #   (poster <img>)
40 |         #
41 |         #   豆瓣电影评分: 9.7
42 |         #   导演: 弗兰克·德拉邦特
43 |         #   编剧: 弗兰克·德拉邦特 / 斯蒂芬·金
44 |         #   主演: 蒂姆·罗宾斯 / 摩根·弗里曼 / 鲍勃·冈顿 / 威廉姆·赛德勒 / 克兰西·布朗 / ...
45 |         #   类型: 剧情 / 犯罪
46 |         #   制片国家/地区: 美国
47 |         #   语言: 英语
48 |         #   上映日期: 1994-09-10(多伦多电影节) / 1994-10-14(美国)
49 |         #   片长: 142分钟
50 |         #   又名: 月黑高飞(港) / 刺激1995(台) / 地狱诺言 / 铁窗岁月 / 消香克的救赎
51 |         #   IMDb: tt0111161
52 |         #
53 |         # The entries alternate: one div holds the details above and the next holds
54 |         # the plot summary, which is why the loop below branches on i % 2.
55 |         #
56 |         #
57 | if i % 2 == 0:
58 |             # Summary  /html/body/div[2] /div[3]/div
59 | summarys = div.xpath('./div/text()')
60 | summary = ''
61 | if len(summarys) > 0:
62 | summary = summarys[0].strip()
63 | print(summary)
64 | else:
65 | titles = div.xpath('./div/h2')
66 | title = ''
67 | if len(titles) > 0:
68 | title = titles[0].text
69 | print(title)
70 |             # poster image
71 | img_urls = div.xpath('./div/div/img/@src')
72 | img_url = ''
73 | if len(img_urls) > 0:
74 | img_url = 'https://spiderbuf.cn/' + img_urls[0]
75 | print(img_url)
76 | downloadImage(img_url, './data/h02')
77 |             # Rating  /html/body/div[2]/div[2] /div/div[2]/span[1]
78 | ratings = div.xpath('./div/div/span[contains(text(),"豆瓣电影评分:")]/following::text()[1]')
79 | rating = ''
80 | if len(ratings) > 0:
81 | rating = ratings[0].strip()
82 | print(rating)
83 |             # Director  /html/body/div[2]/div[2] /div/div[2]/span[2]/span[2]
84 | directors = div.xpath('./div/div/span/span[contains(text(),"导演")]/following::text()')
85 | director = ''
86 | if len(directors) > 1:
87 | director = directors[1].strip()
88 | if len(directors) > 3:
89 | director += '/' + directors[2].strip()
90 | # for item in directors:
91 | # if director != '':
92 | # director += ' / '
93 | # director += item.text
94 | print(director)
95 |             # Screenwriter  /html/body/div[2]/div[2] /div/div[2]/span[3]/span[2]
96 | scriptwriters = div.xpath('./div/div/span/span[contains(text(),"编剧")]/following::text()')
97 | scriptwriter = ''
98 | if len(scriptwriters) > 0:
99 | scriptwriter = scriptwriters[1].strip()
100 |
101 | if len(scriptwriters) > 3:
102 | scriptwriter += scriptwriters[2].strip()
103 | print(scriptwriter)
104 |             # Cast
105 | performers = div.xpath('./div/div/span/span[contains(text(),"主演")]/following::text()')
106 | performer = ''
107 | if len(performers) > 0:
108 | performer = performers[1].strip()
109 |
110 | if len(performers) > 3:
111 | performer += performers[2].strip()
112 | print(performer)
113 |             # Genre
114 | genres = div.xpath('./div/div/span/span[contains(text(),"类型:")]/following::text()')
115 | genre = ''
116 | if len(genres) > 0:
117 | genre = genres[0].strip()
118 |
119 |             if len(genres) > 1:
120 | genre += genres[1].strip()
121 | print(genre)
122 |             # Country/region of production
123 | areas = div.xpath('./div/div/span/span[contains(text(),"制片国家/地区:")]/following::text()')
124 | area = ''
125 | if len(areas) > 0:
126 | area = areas[0].strip()
127 | print(area)
128 |             # Language
129 | langs = div.xpath('./div/div/span/span[contains(text(),"语言:")]/following::text()')
130 | lang = ''
131 | if len(langs) > 0:
132 | lang = langs[0].strip().replace('\n', '')
133 | if len(langs) > 1:
134 | lang += langs[1].strip().replace('\n', '')
135 | print(lang)
136 |             # Also known as
137 | aliases = div.xpath('./div/div/span/span[contains(text(),"又名:")]/following::text()')
138 | alias = ''
139 | if len(aliases) > 0:
140 | alias = aliases[0].strip().replace('\n', '').replace('|', '')
141 | if len(aliases) > 1:
142 | alias += aliases[1].strip().replace('\n', '').replace('|', '')
143 | print(alias)
144 | # IMDb
145 | imdbs = div.xpath('./div/div/span[contains(text(),"IMDb:")]/following::text()')
146 | imdb = ''
147 | if len(imdbs) > 0:
148 | imdb = imdbs[0].strip().replace('\n', '')
149 | print(imdb)
150 |             # Release date
151 | release_dates = div.xpath('./div/div/span/span[contains(text(),"上映日期:")]/following::text()')
152 | release_date = ''
153 | if len(release_dates) > 0:
154 | release_date = release_dates[0].strip().replace('\n', '')
155 | if len(release_dates) > 1:
156 | release_date += release_dates[1].strip().replace('\n', '')
157 | print(release_date)
158 |             # Runtime
159 | runtimes = div.xpath('./div/div/span/span[contains(text(),"片长:")]/following::text()')
160 | runtime = ''
161 | if len(runtimes) > 0:
162 | runtime = runtimes[0].strip().replace('\n', '')
163 | if len(runtimes) > 1:
164 | runtime += runtimes[1].strip().replace('\n', '')
165 | print(runtime)
166 | i += 1
167 |
168 |
169 | if __name__ == '__main__':
170 | html = getHTML(base_url, './data/h02/h02.html')
171 | # with open('./data/h02/h02.html', 'r', encoding='utf-8') as f:
172 | # html = f.read()
173 | parseHTML(html)
174 |
--------------------------------------------------------------------------------
/h03.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | import os.path
3 |
4 | import requests
5 | from lxml import etree
6 | import time
7 |
8 | base_url = 'https://spiderbuf.cn/web-scraping-practice/scraping-scroll-load'
9 |
10 | myheaders = {
11 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.164 Safari/537.36'}
12 |
13 | def getHTML(url,file_name=''):
14 | html = requests.get(url, headers=myheaders).text
15 | if file_name != '':
16 | with open(file_name, 'w', encoding='utf-8') as f:
17 | f.write(html)
18 | return html
19 |
20 |
21 | def downloadImage(url, path=''):
22 | img_data = requests.get(url, headers=myheaders).content
23 | # get image file name
24 | file_name = url.split('/').pop()
25 |
26 | with open(os.path.join(path, file_name), 'wb') as img:
27 | img.write(img_data)
28 |
29 |
30 | def parseHTML(html):
31 | # parse html source code here
32 | root = etree.HTML(html)
33 | divs = root.xpath('/html/body/div/div/div[@style="margin-top: 10px;"]')
34 | i = 1
35 | for div in divs:
36 | if i % 2 == 0:
37 |             # Summary  /html/body/div[2] /div[3]/div
38 | summarys = div.xpath('./div/text()')
39 | summary = ''
40 | if len(summarys) > 0:
41 | summary = summarys[0].strip()
42 | print(summary)
43 | else:
44 | titles = div.xpath('./div/h2')
45 | title = ''
46 | if len(titles) > 0:
47 | title = titles[0].text
48 | print(title)
49 |             # poster image
50 | img_urls = div.xpath('./div/div/img/@src')
51 | img_url = ''
52 | if len(img_urls) > 0:
53 | img_url = 'https://spiderbuf.cn/' + img_urls[0]
54 | print(img_url)
55 |                 downloadImage(img_url, './data/h03')
56 |             # Rating  /html/body/div[2]/div[2] /div/div[2]/span[1]
57 | ratings = div.xpath('./div/div/span[contains(text(),"豆瓣电影评分:")]/following::text()[1]')
58 | rating = ''
59 | if len(ratings) > 0:
60 | rating = ratings[0].strip()
61 | print(rating)
62 |             # Director  /html/body/div[2]/div[2] /div/div[2]/span[2]/span[2]
63 | directors = div.xpath('./div/div/span/span[contains(text(),"导演")]/following::text()')
64 | director = ''
65 | if len(directors) > 1:
66 | director = directors[1].strip()
67 | if len(directors) > 3:
68 | director += '/' + directors[2].strip()
69 | # for item in directors:
70 | # if director != '':
71 | # director += ' / '
72 | # director += item.text
73 | print(director)
74 |             # Screenwriter  /html/body/div[2]/div[2] /div/div[2]/span[3]/span[2]
75 | scriptwriters = div.xpath('./div/div/span/span[contains(text(),"编剧")]/following::text()')
76 | scriptwriter = ''
77 | if len(scriptwriters) > 0:
78 | scriptwriter = scriptwriters[1].strip()
79 |
80 | if len(scriptwriters) > 3:
81 | scriptwriter += scriptwriters[2].strip()
82 | print(scriptwriter)
83 |             # Cast
84 | performers = div.xpath('./div/div/span/span[contains(text(),"主演")]/following::text()')
85 | performer = ''
86 | if len(performers) > 0:
87 | performer = performers[1].strip()
88 |
89 | if len(performers) > 3:
90 | performer += performers[2].strip()
91 | print(performer)
92 |             # Genre
93 | genres = div.xpath('./div/div/span/span[contains(text(),"类型:")]/following::text()')
94 | genre = ''
95 | if len(genres) > 0:
96 | genre = genres[0].strip()
97 |
98 |             if len(genres) > 1:
99 | genre += genres[1].strip()
100 | print(genre)
101 |             # Country/region of production
102 | areas = div.xpath('./div/div/span/span[contains(text(),"制片国家/地区:")]/following::text()')
103 | area = ''
104 | if len(areas) > 0:
105 | area = areas[0].strip()
106 | print(area)
107 |             # Language
108 | langs = div.xpath('./div/div/span/span[contains(text(),"语言:")]/following::text()')
109 | lang = ''
110 | if len(langs) > 0:
111 | lang = langs[0].strip().replace('\n', '')
112 | if len(langs) > 1:
113 | lang += langs[1].strip().replace('\n', '')
114 | print(lang)
115 |             # Also known as
116 | aliases = div.xpath('./div/div/span/span[contains(text(),"又名:")]/following::text()')
117 | alias = ''
118 | if len(aliases) > 0:
119 | alias = aliases[0].strip().replace('\n', '').replace('|', '')
120 | if len(aliases) > 1:
121 | alias += aliases[1].strip().replace('\n', '').replace('|', '')
122 | print(alias)
123 | # IMDb
124 | imdbs = div.xpath('./div/div/span[contains(text(),"IMDb:")]/following::text()')
125 | imdb = ''
126 | if len(imdbs) > 0:
127 | imdb = imdbs[0].strip().replace('\n', '')
128 | print(imdb)
129 |             # Release date
130 | release_dates = div.xpath('./div/div/span/span[contains(text(),"上映日期:")]/following::text()')
131 | release_date = ''
132 | if len(release_dates) > 0:
133 | release_date = release_dates[0].strip().replace('\n', '')
134 | if len(release_dates) > 1:
135 | release_date += release_dates[1].strip().replace('\n', '')
136 | print(release_date)
137 |             # Runtime
138 | runtimes = div.xpath('./div/div/span/span[contains(text(),"片长:")]/following::text()')
139 | runtime = ''
140 | if len(runtimes) > 0:
141 | runtime = runtimes[0].strip().replace('\n', '')
142 | if len(runtimes) > 1:
143 | runtime += runtimes[1].strip().replace('\n', '')
144 | print(runtime)
145 | i += 1
146 |
147 |
148 | if __name__ == '__main__':
149 |
150 | html = getHTML(base_url, './data/h03/h03.html')
151 | # get next page uri
152 | uri = ''
153 | root = etree.HTML(html)
154 | divs = root.xpath('//div[@id="sLaOuol2SM0iFj4d"]/text()')
155 | if len(divs) > 0:
156 | uri = divs[0]
157 |
158 | i = 1
159 | while (uri != '') & (i < 10):
160 | print(uri)
161 | html = getHTML(base_url + '/' + uri, f'./data/h03/h03_{uri}.html')
162 |         uri = ''  # reset; the loop continues only if this page exposes another next-page uri
163 | root = etree.HTML(html)
164 | divs = root.xpath('//div[@id="sLaOuol2SM0iFj4d"]/text()')
165 | if len(divs) > 0:
166 | uri = divs[0]
167 | i += 1
168 |
169 | # parseHTML(html)
170 |
--------------------------------------------------------------------------------
/h04.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 |
3 | import requests
4 | from lxml import etree
5 | from selenium import webdriver
6 | import time
7 |
8 |
9 | base_url = 'https://spiderbuf.cn/web-scraping-practice/javascript-confuse-encrypt-reverse'
10 |
11 | myheaders = {
12 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.164 Safari/537.36'}
13 |
14 | def getHTML(url,file_name=''):
15 | client = webdriver.Chrome()
16 | client.get(url)
17 | time.sleep(3)
18 | html = client.page_source
19 | print(html)
20 | client.quit()
21 |
22 | if file_name != '':
23 | with open(file_name, 'w', encoding='utf-8') as f:
24 | f.write(html)
25 | return html
26 |
27 |
28 | def parseHTML(html,file_name=''):
29 | root = etree.HTML(html)
30 | trs = root.xpath('//tr')
31 |
32 | if file_name != '':
33 | f = open(file_name, 'w', encoding='utf-8')
34 |
35 | for tr in trs:
36 | tds = tr.xpath('./td')
37 | s = ''
38 | for td in tds:
39 | s = s + str(td.xpath('string(.)')) + '|'
40 | # s = s + str(td.text) + '|'
41 | print(s)
42 | if (s != '') & (file_name != ''):
43 | f.write(s + '\n')
44 |     if file_name != '': f.close()  # close only if a file was opened above
45 |
46 |
47 | if __name__ == '__main__':
48 | # example: 1
49 | html = getHTML(base_url, './data/h04/h04.html')
50 | # parseHTML(html, './data/h04/h04.txt')
51 |
52 | # example: 2
53 | # url = 'https://spiderbuf.cn/static/js/h04/udSL29.js'
54 | # js_code = requests.get(url, headers=myheaders).text
55 | # # js_code = js_code.encode('utf-8').decode('unicode-escape')
56 | # a = js_code.index('=') + 1
57 | # b = js_code.index(';')
58 | # js_code = js_code[a:b]
59 |
60 |     # # Convert the string to a dict
61 | # dict_data = eval(js_code)
62 | # print(dict_data)
63 | # for item in dict_data:
64 | # print(item)
65 |
--------------------------------------------------------------------------------
/h05.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | import base64
3 | import hashlib
4 | import time
5 |
6 | import requests
7 | from lxml import etree
8 | from selenium import webdriver
9 |
10 |
11 | base_url = 'https://spiderbuf.cn/web-scraping-practice/javascript-reverse-timestamp'
12 |
13 | myheaders = {
14 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.164 Safari/537.36'}
15 |
16 | def getHTML(url,file_name=''):
17 | client = webdriver.Chrome()
18 | client.get(url)
19 | time.sleep(5)
20 | html = client.page_source
21 | print(html)
22 | client.quit()
23 |
24 | if file_name != '':
25 | with open(file_name, 'w', encoding='utf-8') as f:
26 | f.write(html)
27 | return html
28 |
29 |
30 | def parseHTML(html,file_name=''):
31 | root = etree.HTML(html)
32 | trs = root.xpath('//tr')
33 |
34 | if file_name != '':
35 | f = open(file_name, 'w', encoding='utf-8')
36 |
37 | for tr in trs:
38 | tds = tr.xpath('./td')
39 | s = ''
40 | for td in tds:
41 | s = s + str(td.xpath('string(.)')) + '|'
42 | # s = s + str(td.text) + '|'
43 | print(s)
44 | if (s != '') & (file_name != ''):
45 | f.write(s + '\n')
46 |     if file_name != '': f.close()  # close only if a file was opened above
47 |
48 |
49 | if __name__ == '__main__':
50 | # example: 1
51 | html = getHTML(base_url, './data/h05/h05.html')
52 | # parseHTML(html, './data/h04/h04.txt')
53 |
54 | # example: 2
55 | # url = 'https://spiderbuf.cn/web-scraping-practice/javascript-reverse-timestamp/api/'
56 | # timestamp = str(int(time.time()))
57 | # md5_hash = hashlib.md5()
58 | # md5_hash.update(timestamp.encode('utf-8'))
59 | # md5 = md5_hash.hexdigest()
60 | # s = ('%s,%s' % (timestamp, md5))
61 | # print(s)
62 | # payload = str(base64.b64encode(s.encode('utf-8')), 'utf-8')
63 | # print(payload)
64 | # html = requests.get(url + payload, headers=myheaders).text
65 | # print(html)
66 |
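67 | # --- Illustrative helper: the token construction from example 2, factored out ---
68 | # (Base64 of "timestamp,md5(timestamp)", exactly as in the commented block above.)
69 | def build_token():
70 |     timestamp = str(int(time.time()))
71 |     md5_hash = hashlib.md5()
72 |     md5_hash.update(timestamp.encode('utf-8'))
73 |     raw = '%s,%s' % (timestamp, md5_hash.hexdigest())     # "timestamp,md5"
74 |     return str(base64.b64encode(raw.encode('utf-8')), 'utf-8')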
--------------------------------------------------------------------------------
/h06.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | import base64
3 | import hashlib
4 | import time
5 |
6 | import requests
7 | from lxml import etree
8 | from selenium import webdriver
9 |
10 |
11 | base_url = 'https://spiderbuf.cn/web-scraping-practice/selenium-fingerprint-anti-scraper'
12 |
13 | myheaders = {
14 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.164 Safari/537.36'}
15 |
16 | def getHTML(url,file_name=''):
17 | # client = webdriver.Chrome()
18 | # client.get(url)
19 | # html = client.page_source
20 | # print(html)
21 | # client.quit()
22 | options = webdriver.ChromeOptions()
23 | options.add_argument('disable-infobars')
24 | # options.add_argument('headless')
25 |     options.set_capability('goog:loggingPrefs', {'browser': 'ALL'})  # capture the browser console log (console.log)
26 |
27 |     options.add_argument('--disable-blink-features=AutomationControlled')  # change the navigator.webdriver property value
28 |
29 | client = webdriver.Chrome(options=options)
30 | client.get(url)
31 | time.sleep(5)
32 | print(client.page_source)
33 | html = client.page_source
34 |
35 | # client.quit()
36 |
37 | if file_name != '':
38 | with open(file_name, 'w', encoding='utf-8') as f:
39 | f.write(html)
40 | return html
41 |
42 |
43 | def parseHTML(html,file_name=''):
44 | root = etree.HTML(html)
45 | trs = root.xpath('//tr')
46 |
47 | if file_name != '':
48 | f = open(file_name, 'w', encoding='utf-8')
49 |
50 | for tr in trs:
51 | tds = tr.xpath('./td')
52 | s = ''
53 | for td in tds:
54 | s = s + str(td.xpath('string(.)')) + '|'
55 | # s = s + str(td.text) + '|'
56 | print(s)
57 | if (s != '') & (file_name != ''):
58 | f.write(s + '\n')
59 |     if file_name != '': f.close()  # close only if a file was opened above
60 |
61 |
62 | if __name__ == '__main__':
63 | # example: 1
64 | html = getHTML(base_url, './data/h06/h06.html')
65 | # print(html)
66 | # parseHTML(html, './data/h06/h06.txt')
67 |
68 | # example: 2
69 | # url = 'https://spiderbuf.cn/web-scraping-practice/selenium-fingerprint-anti-scraper/api/'
70 | # timestamp = str(int(time.time()))
71 | # md5_hash = hashlib.md5()
72 | # md5_hash.update(timestamp.encode('utf-8'))
73 | # md5 = md5_hash.hexdigest()
74 | # s = ('%s,%s' % (timestamp, md5))
75 | # print(s)
76 | # payload = str(base64.b64encode(s.encode('utf-8')), 'utf-8')
77 | # print(payload)
78 | # html = requests.get(url + payload, headers=myheaders).text
79 | # print(html)
80 |     # # Convert the string to a dict
81 | # dict_data = eval(html)
82 | # print(dict_data)
83 | # for item in dict_data:
84 | # print(item)
85 |
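86 | # --- Optional illustration: inspect what the browser reports after the options above ---
87 | # (Assumes the `client` created inside getHTML, right after client.get(url); kept commented out.)
88 | # print(client.execute_script('return navigator.webdriver'))  # typically False/None once AutomationControlled is disabled
89 | # print(client.execute_script('return navigator.userAgent'))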
--------------------------------------------------------------------------------
/n01.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 |
3 | import requests
4 | from lxml import etree
5 |
6 | url = 'https://spiderbuf.cn/web-scraping-practice/user-agent-referrer'
7 |
8 | myheaders = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.164 Safari/537.36',
9 | 'Referer':'https://spiderbuf.cn/list'}
10 |
11 | html = requests.get(url, headers=myheaders).text
12 | print(html)
13 |
14 | f = open('./data/n01/n01.html', 'w', encoding='utf-8')
15 | f.write(html)
16 | f.close()
17 |
18 | root = etree.HTML(html)
19 | ls = root.xpath('//div[@class ="container"]/div/div')
20 | # page_text = ls[0].xpath('string(.)')
21 | # print(page_text)
22 |
23 | f = open('./data/n01/n01.txt', 'w', encoding='utf-8')
24 | for item in ls:
25 | hnodes = item.xpath('./h2')
26 | s0 = hnodes[0].text
27 |
28 | pnodes = item.xpath('./p')
29 | s1 = pnodes[0].text
30 | s2 = pnodes[1].text
31 | s3 = pnodes[2].text
32 | s4 = pnodes[3].text
33 | # 富邦金融控股排名:50企业估值(亿元):2135CEO:蔡明兴行业:金融服务
34 | s = s0 + '|' + s1.replace('排名:','') + '|' + s2.replace('企业估值(亿元):','') + '|' \
35 | + s3.replace('CEO:','') + '|' + s4.replace('行业:','') + '\n'
36 | print(s)
37 | f.write(s)
38 | # s = ''
39 | # for td in tds:
40 | # s = s + str(td.xpath('string(.)')) + '|'
41 | # # s = s + str(td.text) + '|'
42 | # print(s)
43 | # if s != '':
44 | # f.write(s + '\n')
45 |
46 | f.close()
--------------------------------------------------------------------------------
/n02.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 |
3 | import requests
4 | from lxml import etree
5 |
6 | import base64
7 |
8 | url = 'https://spiderbuf.cn/web-scraping-practice/scraping-images-base64'
9 |
10 | myheaders = {
11 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.164 Safari/537.36'}
12 |
13 |
14 | html = requests.get(url, headers=myheaders).text
15 | print(html)
16 |
17 | f = open('./data/n02/n02.html', 'w', encoding='utf-8')
18 | f.write(html)
19 | f.close()
20 |
21 | root = etree.HTML(html)
22 | imgs = root.xpath('//img/@src')
23 | print(imgs)
24 | for idx, item in enumerate(imgs):
25 |     print(item)
26 |     # item is the Base64-encoded string taken from the img src attribute
27 |     item = item.replace('data:image/png;base64,','')
28 |     str_bytes = item.encode('raw_unicode_escape') # str -> bytes
29 | decoded = base64.b64decode(str_bytes)
30 |
31 |     img = open('./data/n02/n02_%d.png' % idx, 'wb')  # one file per image so multiple images do not overwrite each other
32 | img.write(decoded)
33 | img.close()
34 |
35 |
--------------------------------------------------------------------------------
/n03.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 |
3 | import requests
4 | from lxml import etree
5 | import time
6 |
7 | base_url = 'https://spiderbuf.cn/web-scraping-practice/scraper-bypass-request-limit/%d'
8 |
9 | myheaders = {
10 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.164 Safari/537.36'}
11 |
12 | max_no = 20
13 | # exit()
14 |
15 | for i in range(1, max_no + 1):
16 | print(i)
17 | url = base_url % i
18 | print(url)
19 | html = requests.get(url, headers=myheaders).text
20 | print(html)
21 |
22 | f = open('./data/n03/n03_%d.html' % i, 'w', encoding='utf-8')
23 | f.write(html)
24 | f.close()
25 |
26 | root = etree.HTML(html)
27 | trs = root.xpath('//tr')
28 |
29 | f = open('./data/n03/datan03_%d.txt' % i, 'w', encoding='utf-8')
30 | for tr in trs:
31 | tds = tr.xpath('./td')
32 | s = ''
33 | for td in tds:
34 | s = s + str(td.xpath('string(.)')) + '|'
35 | # s = s + str(td.text) + '|'
36 | print(s)
37 | if s != '':
38 | f.write(s + '\n')
39 | time.sleep(2)
40 | f.close()
41 |
--------------------------------------------------------------------------------
/n04.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | import os.path
3 |
4 | import requests
5 | from lxml import etree
6 | import time
7 |
8 | base_url = 'https://spiderbuf.cn/web-scraping-practice/css-pseudo-elements'
9 |
10 | myheaders = {
11 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.164 Safari/537.36'}
12 |
13 | def getHTML(url,file_name=''):
14 | html = requests.get(url, headers=myheaders).text
15 | if file_name != '':
16 | with open(file_name, 'w', encoding='utf-8') as f:
17 | f.write(html)
18 | return html
19 |
20 |
21 |
22 | def parseHTML(html):
23 | class_map = {'abcdef::before':'7',
24 | 'abcdef::after':'5',
25 | 'ghijkl::before':'8',
26 | 'ghijkl::after':'9',
27 | 'mnopqr::before':'9',
28 | 'mnopqr::after':'1',
29 | 'uvwxyz::before':'1',
30 | 'uvwxyz::after':'4',
31 | 'yzabcd::before':'2',
32 | 'yzabcd::after':'6',
33 | 'efghij::before':'3',
34 | 'efghij::after':'2',
35 | 'klmnop::before':'5',
36 | 'klmnop::after':'7',
37 | 'qrstuv::before':'4',
38 | 'qrstuv::after':'3',
39 | 'wxyzab::before':'6',
40 | 'wxyzab::after':'0',
41 | 'cdefgh::before':'0',
42 | 'cdefgh::after':'8',
43 | 'hijklm::after':'6',
44 | 'opqrst::after':'0',
45 | 'uvwxab::after':'3',
46 | 'cdijkl::after':'8',
47 | 'pqrmno::after':'1',
48 | 'stuvwx::after':'4',
49 | 'pkenmc::after':'7',
50 | 'tcwdsk::after':'9',
51 | 'mkrtyu::after':'5',
52 | 'umdrtk::after':'2'}
53 | # parse html source code here
54 | root = etree.HTML(html)
55 | divs = root.xpath('/html/body/div/div[@style="margin-top: 10px;"]')
56 |
57 | for div in divs:
58 | titles = div.xpath('./div/h2')
59 | title = ''
60 | if len(titles) > 0:
61 | title = titles[0].text
62 | print(title)
63 |         # Rating
64 | ranking_spans = div.xpath('./div/div[2]/span[@class]')
65 |
66 | if len(ranking_spans) > 0:
67 | span = ranking_spans[0]
68 | attr_class = span.attrib["class"] if "class" in span.attrib else ""
69 | # print(f"{span} - {attr_class}")
70 | # print(span.text)
71 |
72 | classes = attr_class.split(" ")
73 | if len(classes) > 0:
74 | s1 = class_map[classes[0] + '::before']
75 | s2 = class_map[classes[1] + '::after']
76 | print(f'{s1}.{s2}')
77 |
78 |
79 | if __name__ == '__main__':
80 | html = getHTML(base_url, './data/n04/n04.html')
81 | parseHTML(html)
82 |
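83 | # --- Illustrative sketch (assumption: the digits are defined in the page's stylesheet ---
84 | # --- as rules like  .abcdef::before { content: "7"; } ) ---------------------------------
85 | # One way to build class_map automatically instead of hard-coding it:
86 | # import re
87 | # def build_class_map(css_text):
88 | #     pattern = r'\.([a-z]+)::(before|after)\s*\{\s*content\s*:\s*"(\d)"'
89 | #     return {f'{name}::{pos}': digit for name, pos, digit in re.findall(pattern, css_text)}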
--------------------------------------------------------------------------------
/n05.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | import os.path
3 |
4 | import requests
5 | from lxml import etree
6 | import time
7 |
8 | base_url = 'https://spiderbuf.cn/web-scraping-practice/css-sprites'
9 |
10 | myheaders = {
11 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.164 Safari/537.36'}
12 |
13 | def getHTML(url,file_name=''):
14 | html_bytes = requests.get(url, headers=myheaders).content
15 | html = html_bytes.decode()
16 | if file_name != '':
17 | with open(file_name, 'w', encoding='utf-8') as f:
18 | f.write(html)
19 | return html
20 |
21 |
22 |
23 | def parseHTML(html):
24 | class_map = {'sprite abcdef':'0',
25 | 'sprite ghijkl':'1',
26 | 'sprite mnopqr':'2',
27 | 'sprite uvwxyz':'3',
28 | 'sprite yzabcd':'4',
29 | 'sprite efghij':'5',
30 | 'sprite klmnop':'6',
31 | 'sprite qrstuv':'7',
32 | 'sprite wxyzab':'8',
33 | 'sprite cdefgh':'9'}
34 | # parse html source code here
35 | root = etree.HTML(html)
36 | divs = root.xpath('//div[@style="margin-bottom: 30px;"]')
37 |
38 | for div in divs:
39 | titles = div.xpath('./h2')
40 | title = ''
41 | if len(titles) > 0:
42 | title = titles[0].text
43 | print(title)
44 |
45 | amount_spans = div.xpath('./p/span[@class]')
46 | amount_str = ''
47 | for span in amount_spans:
48 | attr_class = span.attrib["class"] if "class" in span.attrib else ""
49 | # print(f"{span} - {attr_class}")
50 | # print(span.text)
51 | amount_str += class_map[attr_class]
52 | print(amount_str)
53 |
54 |
55 |
56 | if __name__ == '__main__':
57 | html = getHTML(base_url, './data/n05/n05.html')
58 | parseHTML(html)
59 |
--------------------------------------------------------------------------------
/n06.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | import os.path
3 |
4 | import requests
5 | from lxml import etree
6 | import time
7 |
8 | base_url = 'https://spiderbuf.cn/web-scraping-practice/scraping-form-rpa'
9 |
10 | myheaders = {
11 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.164 Safari/537.36'}
12 |
13 | def getHTML(url,file_name=''):
14 | html_bytes = requests.get(url, headers=myheaders).content
15 | html = html_bytes.decode()
16 | if file_name != '':
17 | with open(file_name, 'w', encoding='utf-8') as f:
18 | f.write(html)
19 | return html
20 |
21 |
22 |
23 | def parseHTML(html):
24 | # parse html source code here
25 | root = etree.HTML(html)
26 | inputs = root.xpath('//input')
27 |
28 | for input in inputs:
29 | attr_name = input.attrib['name'] if 'name' in input.attrib else ''
30 | input_value = input.attrib['value'] if 'value' in input.attrib else ''
31 | if attr_name == 'username':
32 | print(f'用户名:{input_value}')
33 |
34 | if attr_name == 'password':
35 | print(f'密码:{input_value}')
36 |
37 | if attr_name == 'email':
38 | print(f'邮箱:{input_value}')
39 |
40 | if attr_name == 'website':
41 | print(f'网站:{input_value}')
42 |
43 | if attr_name == 'date':
44 | print(f'生日:{input_value}')
45 |
46 | if attr_name == 'time':
47 | print(f'时间:{input_value}')
48 |
49 | if attr_name == 'number':
50 | print(f'数量:{input_value}')
51 |
52 | if attr_name == 'range':
53 | print(f'滑块:{input_value}')
54 |
55 | if attr_name == 'color':
56 | print(f'颜色:{input_value}')
57 |
58 | if attr_name == 'search':
59 | print(f'搜索:{input_value}')
60 |
61 | if attr_name == 'gender':
62 | temp = input.attrib['checked'] if 'checked' in input.attrib else ''
63 | if temp != '':
64 | print(f'性别:{input_value}')
65 |
66 | if attr_name == 'interest':
67 | temp = input.attrib['checked'] if 'checked' in input.attrib else ''
68 | if temp != '':
69 | print(f'开发语言:{input_value}')
70 |
71 | options = root.xpath('//select[@name="country"]/option')
72 | for option in options:
73 | attr_name = option.attrib['selected'] if 'selected' in option.attrib else ''
74 | option_value = option.attrib['value'] if 'value' in option.attrib else ''
75 | if attr_name != '':
76 | print(f'人物代表:{option_value}')
77 |
78 | lis = root.xpath('//ul[@class="items"]/li/a')
79 | for li in lis:
80 | attr_name = li.attrib['class'] if 'class' in li.attrib else ''
81 | li_value = li.text
82 | if 'active' in attr_name:
83 | print(f'代表人物出处:{li_value}')
84 |
85 |
86 | if __name__ == '__main__':
87 | html = getHTML(base_url, './data/n06/n06.html')
88 | parseHTML(html)
89 |
--------------------------------------------------------------------------------
/n07.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 |
3 | import requests
4 | from lxml import etree
5 |
6 | base_url = 'https://spiderbuf.cn/web-scraping-practice/random-css-classname'
7 |
8 | my_headers = {
9 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.164 Safari/537.36'}
10 |
11 | # Fetch the page
12 | html_bytes = requests.get(base_url, headers=my_headers).content
13 | html = html_bytes.decode()
14 | root = etree.HTML(html)
15 | with open('./data/n07/n07.html', 'w', encoding='utf-8') as f:
16 | f.write(html)
17 | # print(html)
18 | divs = root.xpath('/html/body/main/div[2]/div')  # class names are randomized, so navigate by position instead of class
19 | with open('./data/n07/n07.txt','w',encoding='utf-8') as f:
20 | for div in divs:
21 | print(div.text)
22 | if div.text:
23 | f.write(f'{div.text}\n')
--------------------------------------------------------------------------------
/python_begin.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 |
3 | import requests
4 |
5 | print('中文')
6 |
7 | a = 2
8 | b = '张三'
9 | c = 'ddddddd'
10 | print(a,b,c)
11 |
12 | d = 2
13 | print(a - d)
14 |
15 | if a == 1:
16 | print('等于1')
17 | elif a == 2:
18 | print('等于2')
19 | else:
20 | print('不等于')
21 |
22 | # for i in range(0, 10):
23 | # print(i)
24 |
25 | while a < 10:
26 | a += 1
27 | print(a)
28 |
29 | print("中文")
30 |
31 | lst = ['张三', '李四', '王五']
32 |
33 | dict = {'张三':'a2', '李四':'b3'}
34 |
35 | print(dict['张三'])
36 |
37 | for item in dict.keys():
38 | print(dict[item])
39 |
40 | # f = open('abc.txt', 'w', encoding='utf-8')
41 | # f.write('这是写入文件的内容')
42 | # f.close()
43 | f = open('abc.txt', 'r', encoding='utf-8')
44 | s = f.read()
45 | f.close()
46 | print(s)
47 |
48 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | apturl==0.5.2
2 | attrs==23.1.0
3 | blinker==1.4
4 | Brlapi==0.7.0
5 | certifi==2019.11.28
6 | chardet==3.0.4
7 | Click==7.0
8 | colorama==0.4.3
9 | command-not-found==0.3
10 | cryptography==2.8
11 | cupshelpers==1.0
12 | dbus-python==1.2.16
13 | defer==1.0.6
14 | distro==1.4.0
15 | distro-info===0.23ubuntu1
16 | entrypoints==0.3
17 | exceptiongroup==1.1.3
18 | h11==0.14.0
19 | httplib2==0.14.0
20 | idna==2.8
21 | keyring==18.0.1
22 | language-selector==0.1
23 | launchpadlib==1.10.13
24 | lazr.restfulclient==0.14.2
25 | lazr.uri==1.0.3
26 | louis==3.12.0
27 | lxml==4.6.3
28 | macaroonbakery==1.3.1
29 | netifaces==0.10.4
30 | oauthlib==3.1.0
31 | olefile==0.46
32 | outcome==1.3.0.post0
33 | pexpect==4.6.0
34 | Pillow==7.0.0
35 | protobuf==3.6.1
36 | pycairo==1.16.2
37 | pycups==1.9.73
38 | PyGObject==3.36.0
39 | PyJWT==1.7.1
40 | pymacaroons==0.13.0
41 | PyNaCl==1.3.0
42 | pyRFC3339==1.1
43 | PySocks==1.7.1
44 | python-apt==2.0.1+ubuntu0.20.4.1
45 | python-dateutil==2.7.3
46 | python-debian==0.1.36+ubuntu1.1
47 | pytz==2019.3
48 | pyxdg==0.26
49 | PyYAML==5.3.1
50 | reportlab==3.5.34
51 | requests==2.22.0
52 | requests-unixsocket==0.2.0
53 | SecretStorage==2.3.1
54 | selenium==4.15.2
55 | simplejson==3.16.0
56 | six==1.14.0
57 | sniffio==1.3.0
58 | sortedcontainers==2.4.0
59 | systemd-python==234
60 | trio==0.23.1
61 | trio-websocket==0.11.1
62 | ubuntu-advantage-tools==8001
63 | ubuntu-drivers-common==0.0.0
64 | ufw==0.36
65 | unattended-upgrades==0.1
66 | urllib3==1.25.8
67 | wadllib==1.3.3
68 | wsproto==1.2.0
69 | xkit==0.0.0
70 |
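Note: the list above reads like a full `pip freeze` of the Ubuntu machine, including many system packages (apturl, cupshelpers, ufw, …) that the scrapers never import. For running the scripts in this repository, a much shorter file is probably enough; the subset below is inferred from the imports and the pinned scraping libraries above, not a tested minimum:

```
requests
lxml
selenium
```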
--------------------------------------------------------------------------------
/s01.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 |
3 | import requests
4 | from lxml import etree
5 |
6 | url = 'https://spiderbuf.cn/web-scraping-practice/requests-lxml-for-scraping-beginner'
7 |
8 | html = requests.get(url).text
9 |
10 | f = open('01.html', 'w', encoding='utf-8')
11 | f.write(html)
12 | f.close()
13 |
14 | root = etree.HTML(html)
15 | trs = root.xpath('//tr')
16 |
17 | f = open('data01.txt', 'w', encoding='utf-8')
18 | for tr in trs:
19 | tds = tr.xpath('./td')
20 | s = ''
21 | for td in tds:
22 | # print(td.text)
23 | s = s + str(td.text) + '|'
24 | print(s)
25 | if s != '':
26 | f.write(s + '\n')
27 |
28 | f.close()
29 |
30 | # print(html)
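Note: the same `//tr` → `./td` walk can be written with context managers and the csv module, so the files are always closed and the delimiter handling is explicit; a sketch (the data01.csv filename is illustrative):

```python
# Sketch: the same //tr -> ./td extraction, written with csv and context managers.
import csv
import requests
from lxml import etree

url = 'https://spiderbuf.cn/web-scraping-practice/requests-lxml-for-scraping-beginner'
html = requests.get(url, timeout=10).text
root = etree.HTML(html)

with open('data01.csv', 'w', encoding='utf-8', newline='') as f:
    writer = csv.writer(f)
    for tr in root.xpath('//tr'):
        row = [td.text or '' for td in tr.xpath('./td')]
        if row:  # header rows use <th>, so they produce no <td> cells and are skipped
            writer.writerow(row)
```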
--------------------------------------------------------------------------------
/s02.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 |
3 | import requests
4 | from lxml import etree
5 |
6 | url = 'https://spiderbuf.cn/web-scraping-practice/scraper-http-header'
7 |
8 | myheaders = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.164 Safari/537.36'}
9 |
10 | html = requests.get(url, headers=myheaders).text
11 | print(html)
12 |
13 | f = open('02.html', 'w', encoding='utf-8')
14 | f.write(html)
15 | f.close()
16 |
17 | root = etree.HTML(html)
18 | trs = root.xpath('//tr')
19 |
20 | f = open('data02.txt', 'w', encoding='utf-8')
21 | for tr in trs:
22 | tds = tr.xpath('./td')
23 | s = ''
24 | for td in tds:
25 | # print(td.text)
26 | s = s + str(td.text) + '|'
27 | print(s)
28 | if s != '':
29 | f.write(s + '\n')
30 |
31 | f.close()
32 |
33 |
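Note: when several requests share the same User-Agent, a `requests.Session` lets you set the header once and reuse the underlying connection; a brief sketch:

```python
# Sketch: set the User-Agent once on a Session and reuse it for every request.
import requests

session = requests.Session()
session.headers.update({
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                  '(KHTML, like Gecko) Chrome/91.0.4472.164 Safari/537.36'
})
html = session.get('https://spiderbuf.cn/web-scraping-practice/scraper-http-header', timeout=10).text
```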
--------------------------------------------------------------------------------
/s03.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 |
3 | import requests
4 | from lxml import etree
5 |
6 | url = 'https://spiderbuf.cn/web-scraping-practice/lxml-xpath-advanced'
7 |
8 | myheaders = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.164 Safari/537.36'}
9 |
10 | html = requests.get(url, headers=myheaders).text
11 | print(html)
12 |
13 | f = open('03.html', 'w', encoding='utf-8')
14 | f.write(html)
15 | f.close()
16 |
17 | root = etree.HTML(html)
18 | trs = root.xpath('//tr')
19 |
20 | f = open('data03.txt', 'w', encoding='utf-8')
21 | for tr in trs:
22 | tds = tr.xpath('./td')
23 | s = ''
24 | for td in tds:
25 | s = s + str(td.xpath('string(.)')) + '|'
26 | # s = s + str(td.text) + '|'
27 | print(s)
28 | if s != '':
29 | f.write(s + '\n')
30 |
31 | f.close()
32 |
33 |
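Note: the switch from `td.text` to `td.xpath('string(.)')` is the point of this exercise: `.text` returns only the text before the first child element, while `string(.)` concatenates all descendant text. A tiny self-contained illustration:

```python
# Sketch: why string(.) is used instead of .text for cells that contain nested tags.
from lxml import etree

td = etree.HTML('<table><tr><td>iPhone <b>15</b> Pro</td></tr></table>').xpath('//td')[0]
print(td.text)                # 'iPhone '      - stops at the first child element
print(td.xpath('string(.)'))  # 'iPhone 15 Pro' - all descendant text concatenated
```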
--------------------------------------------------------------------------------
/s04.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 |
3 | import requests
4 | from lxml import etree
5 | import re
6 |
7 | base_url = 'https://spiderbuf.cn/web-scraping-practice/web-pagination-scraper?pageno=%d'
8 |
9 | myheaders = {
10 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.164 Safari/537.36'}
11 |
12 | # 取页数
13 | html = requests.get(base_url % 1, headers=myheaders).text
14 | root = etree.HTML(html)
15 |
16 | lis = root.xpath('//ul[@class="pagination"]/li')
17 | page_text = lis[0].xpath('string(.)')
18 | ls = re.findall('[0-9]+', page_text)  # match whole numbers so a page count of 10+ is not split into digits
19 |
20 | max_no = int(ls[0])
21 | # exit()
22 |
23 | for i in range(1, max_no + 1):
24 | print(i)
25 | url = base_url % i
26 | print(url)
27 | html = requests.get(url, headers=myheaders).text
28 | print(html)
29 |
30 | f = open('04_%d.html' % i, 'w', encoding='utf-8')
31 | f.write(html)
32 | f.close()
33 |
34 | root = etree.HTML(html)
35 | trs = root.xpath('//tr')
36 |
37 | f = open('data04_%d.txt' % i, 'w', encoding='utf-8')
38 | for tr in trs:
39 | tds = tr.xpath('./td')
40 | s = ''
41 | for td in tds:
42 | s = s + str(td.xpath('string(.)')) + '|'
43 | # s = s + str(td.text) + '|'
44 | print(s)
45 | if s != '':
46 | f.write(s + '\n')
47 |
48 | f.close()
49 |
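Note: the pagination loop can be broken into a small per-page helper, with the page count read as a whole number; a sketch, assuming (as the original does) that the first pagination `<li>` carries the total page count:

```python
# Sketch: pagination via a per-page helper; page count parsed as a whole number.
import re
import requests
from lxml import etree

BASE = 'https://spiderbuf.cn/web-scraping-practice/web-pagination-scraper'
HEADERS = {'User-Agent': 'Mozilla/5.0'}  # shortened UA string, for illustration only

def scrape_page(pageno):
    html = requests.get(BASE, params={'pageno': pageno}, headers=HEADERS, timeout=10).text
    root = etree.HTML(html)
    return [[td.xpath('string(.)') for td in tr.xpath('./td')] for tr in root.xpath('//tr')]

first = requests.get(BASE, params={'pageno': 1}, headers=HEADERS, timeout=10).text
page_text = etree.HTML(first).xpath('//ul[@class="pagination"]/li')[0].xpath('string(.)')
max_no = int(re.findall(r'\d+', page_text)[0])  # '12' stays 12 instead of being split into 1 and 2

rows = []
for i in range(1, max_no + 1):
    rows.extend(r for r in scrape_page(i) if r)
```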
--------------------------------------------------------------------------------
/s05.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 |
3 | import requests
4 | from lxml import etree
5 |
6 | url = 'https://spiderbuf.cn/web-scraping-practice/scraping-images-from-web'
7 |
8 | myheaders = {
9 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.164 Safari/537.36'}
10 |
11 |
12 | html = requests.get(url, headers=myheaders).text
13 | print(html)
14 |
15 | f = open('05.html', 'w', encoding='utf-8')
16 | f.write(html)
17 | f.close()
18 |
19 | root = etree.HTML(html)
20 | imgs = root.xpath('//img/@src')
21 | print(imgs)
22 | for item in imgs:
23 | img_data = requests.get('https://spiderbuf.cn' + item, headers=myheaders).content
24 | img = open(str(item).replace('/',''), 'wb')
25 | img.write(img_data)
26 | img.close()
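Note: joining the host by hand and stripping slashes from the path works for this page; a slightly more general sketch resolves each src with `urljoin` and names the local file after its basename (the shortened headers and fallback filename are illustrative):

```python
# Sketch: resolve image URLs with urljoin and name the local files by their basename.
import os
import requests
from lxml import etree
from urllib.parse import urljoin

page_url = 'https://spiderbuf.cn/web-scraping-practice/scraping-images-from-web'
headers = {'User-Agent': 'Mozilla/5.0'}

html = requests.get(page_url, headers=headers, timeout=10).text
for src in etree.HTML(html).xpath('//img/@src'):
    img_url = urljoin(page_url, src)  # handles relative and absolute src values alike
    data = requests.get(img_url, headers=headers, timeout=10).content
    with open(os.path.basename(src) or 'image.bin', 'wb') as out:
        out.write(data)
```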
--------------------------------------------------------------------------------
/s06.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 |
3 | import requests
4 | from lxml import etree
5 |
6 | url = 'https://spiderbuf.cn/web-scraping-practice/inner'
7 |
8 | myheaders = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.164 Safari/537.36'}
9 |
10 | html = requests.get(url, headers=myheaders).text
11 | print(html)
12 |
13 | f = open('06.html', 'w', encoding='utf-8')
14 | f.write(html)
15 | f.close()
16 |
17 | root = etree.HTML(html)
18 | trs = root.xpath('//tr')
19 |
20 | f = open('data06.txt', 'w', encoding='utf-8')
21 | for tr in trs:
22 | tds = tr.xpath('./td')
23 | s = ''
24 | for td in tds:
25 | s = s + str(td.xpath('string(.)')) + '|'
26 | # s = s + str(td.text) + '|'
27 | print(s)
28 | if s != '':
29 | f.write(s + '\n')
30 |
31 | f.close()
32 |
33 |
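Note: s01, s02, s03 and this script all repeat the same `//tr` → `./td` loop; a sketch of a helper they could share (name and separator argument are illustrative):

```python
# Sketch: the table-extraction loop shared by s01/s02/s03/s06, as a reusable helper.
from lxml import etree

def table_rows(html, sep='|'):
    """Return one sep-joined string per non-empty table row."""
    root = etree.HTML(html)
    rows = []
    for tr in root.xpath('//tr'):
        cells = [td.xpath('string(.)') for td in tr.xpath('./td')]
        if cells:
            rows.append(sep.join(cells))
    return rows
```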
--------------------------------------------------------------------------------
/s07.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 |
3 | import requests
4 | import json
5 |
6 | url = 'https://spiderbuf.cn/web-scraping-practice/iplist?order=asc'
7 |
8 | myheaders = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.164 Safari/537.36'}
9 |
10 | data_json = requests.get(url, headers=myheaders).text
11 | print(data_json)
12 |
13 | f = open('./data/7/07.html', 'w', encoding='utf-8')
14 | f.write(data_json)
15 | f.close()
16 |
17 | ls = json.loads(data_json)
18 | print(ls)
19 |
20 | f = open('./data/7/data07.txt', 'w', encoding='utf-8')
21 | for item in ls:
22 | # print(item)
23 | s = '%s|%s|%s|%s|%s|%s|%s\n' % (item['ip'], item['mac'],item['manufacturer'], item['name'],item['ports'], item['status'], item['type'])
24 | f.write(s)
25 | f.close()
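Note: requests can decode the body directly via `.json()`, and `csv.DictWriter` keeps the column order explicit; a sketch of the same export (the .csv filename is illustrative; the field names come from the loop above):

```python
# Sketch: the same JSON export using response.json() and csv.DictWriter.
import csv
import requests

url = 'https://spiderbuf.cn/web-scraping-practice/iplist?order=asc'
headers = {'User-Agent': 'Mozilla/5.0'}
records = requests.get(url, headers=headers, timeout=10).json()

fields = ['ip', 'mac', 'manufacturer', 'name', 'ports', 'status', 'type']
with open('data07.csv', 'w', encoding='utf-8', newline='') as f:
    writer = csv.DictWriter(f, fieldnames=fields, extrasaction='ignore')
    writer.writeheader()
    writer.writerows(records)
```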
--------------------------------------------------------------------------------
/s08.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 |
3 | import requests
4 | from lxml import etree
5 |
6 | url = 'https://spiderbuf.cn/web-scraping-practice/scraper-via-http-post'
7 |
8 | myheaders = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.164 Safari/537.36'}
9 |
10 | payload = {'level':'8'}
11 | html = requests.post(url, headers=myheaders, data=payload).text
12 | print(html)
13 |
14 | f = open('./data/8/08.html', 'w', encoding='utf-8')
15 | f.write(html)
16 | f.close()
17 |
18 | root = etree.HTML(html)
19 | trs = root.xpath('//tr')
20 |
21 | f = open('./data/8/data08.txt', 'w', encoding='utf-8')
22 | for tr in trs:
23 | tds = tr.xpath('./td')
24 | s = ''
25 | for td in tds:
26 | # print(td.text)
27 | s = s + str(td.text) + '|'
28 | print(s)
29 | if s != '':
30 | f.write(s + '\n')
31 |
32 | f.close()
33 |
34 |
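Note: `data=payload` sends the payload as a URL-encoded form body, which is what this script relies on; `json=payload` would send a JSON body instead, and whether the server here accepts that is not verified. A short sketch of the difference:

```python
# Sketch: form-encoded POST (what this exercise uses) vs JSON POST.
import requests

url = 'https://spiderbuf.cn/web-scraping-practice/scraper-via-http-post'
headers = {'User-Agent': 'Mozilla/5.0'}

form_resp = requests.post(url, headers=headers, data={'level': '8'}, timeout=10)
# data=...  -> Content-Type: application/x-www-form-urlencoded, body "level=8"
json_resp = requests.post(url, headers=headers, json={'level': '8'}, timeout=10)
# json=...  -> Content-Type: application/json, body '{"level": "8"}'
```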
--------------------------------------------------------------------------------
/s4-1.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 |
3 | import requests
4 | from lxml import etree
5 | import re
6 |
7 | url = 'https://spiderbuf.cn/web-scraping-practice/web-pagination-scraper?pageno=2&pagesize=50'
8 |
9 | myheaders = {
10 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.164 Safari/537.36'}
11 |
12 | html = requests.get(url, headers=myheaders).text
13 | print(html)
14 |
15 | f = open('./data/4-1/04-1.html', 'w', encoding='utf-8')
16 | f.write(html)
17 | f.close()
18 |
19 | root = etree.HTML(html)
20 | trs = root.xpath('//tr')
21 |
22 | f = open('./data/4-1/data04-1.txt', 'w', encoding='utf-8')
23 | for tr in trs:
24 | tds = tr.xpath('./td')
25 | s = ''
26 | for td in tds:
27 | s = s + str(td.xpath('string(.)')) + '|'
28 | # s = s + str(td.text) + '|'
29 | print(s)
30 | if s != '':
31 | f.write(s + '\n')
32 |
33 | f.close()
34 |
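Note: hard-coding `?pageno=2&pagesize=50` into the URL works; passing the values through `params` keeps them readable and lets requests handle the encoding. A sketch:

```python
# Sketch: build the pageno/pagesize query string via params instead of string concatenation.
import requests

url = 'https://spiderbuf.cn/web-scraping-practice/web-pagination-scraper'
headers = {'User-Agent': 'Mozilla/5.0'}
html = requests.get(url, params={'pageno': 2, 'pagesize': 50}, headers=headers, timeout=10).text
```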
--------------------------------------------------------------------------------