├── README.md
├── redBook.py
├── redBookPatchAll.py
├── srtToTxt.py
├── text
│   └── content.srt
└── 问题锦囊.md
/README.md:
--------------------------------------------------------------------------------
# python-demo
Crawler demos: download watermark-free images from Xiaohongshu, convert video-subtitle .srt files to .txt, and more.

## I. Crawling Xiaohongshu images and videos
### 1. To download the watermark-free images of individual notes, run the corresponding Python file:
```
python .\redBook.py
```
### Tips:
1. Replace the cookie in the file with your own.
2. Collect the Xiaohongshu note links into the `links` array (see the example below).
3. By default, downloaded images are saved to the `image` folder under the current directory; the path can be customized.
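For reference, the `links` array sits at the bottom of redBook.py; the two entries below are the ones shipped with the demo, so swap in the notes you want:
```
# redBook.py — the note links to crawl
links = [
    'https://www.xiaohongshu.com/explore/63f07247000000001300d67b',
    'https://www.xiaohongshu.com/explore/60a5f16f0000000021034cb4',
]
roopLink(links)
```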

### 2. To download the watermark-free images and videos from a user's profile page, run:
```
python .\redBookPatchAll.py
```
### Tips:
1. Replace the cookie in the file with your own.
2. Get the link to the target user's profile page and substitute it into the `browser.get(...)` call near the bottom of the file (see the example below).
3. By default, downloaded assets are saved to the `image` folder under the current directory; the path can be customized.
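For reference, the profile URL lives in the `browser.get(...)` call; the URL below is the placeholder shipped in the file, so replace it with your target profile:
```
# redBookPatchAll.py — the profile page to crawl
browser.get('https://www.xiaohongshu.com/user/profile/5c014959f7e8b935bc3cec68?appuid=5a2025504eacab20fa287e82&apptime=1679472390')
```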

## II. Converting subtitle .srt files to .txt
Run the corresponding Python file:
```
python .\srtToTxt.py
```
### Tips:
1. Customize the `path` and `targetPath` variables as needed.
2. `path` is the folder holding the .srt files; `targetPath` is the folder the generated .txt files are written into (see the example below).
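The converter assumes each subtitle block in the .srt is exactly 4 lines (index, timestamps, text, blank line) and keeps only the text line, joining blocks with commas. For example, an input block like:
```
3
00:00:05,800 --> 00:00:06,766
His name
```
ends up as `His name,` in the generated .txt.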
--------------------------------------------------------------------------------
/redBook.py:
--------------------------------------------------------------------------------
import requests
from bs4 import BeautifulSoup
import os
import re
import json

def mkdir(path):
    '''
    Create a folder if it does not already exist
    '''
    folder = os.path.exists(path)
    if not folder:  # only create the folder when it is missing
        print("--- Creating a new folder 😀 ---")
        os.makedirs(path)  # makedirs also creates any missing parent directories
        print("--- OK 🚩 ---")
    else:
        print("--- ⚠️ The folder already exists! ---")

def fetchUrl(url):
    '''
    Send the HTTP request and return the page source
    '''
    headers = {
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'cookie': '',  # replace with your own cookie~
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 UBrowser/6.2.4098.3 Safari/537.36',
    }

    r = requests.get(url, headers=headers)
    return r.text

def parsing_link(html):
    '''
    Parse the HTML and yield the watermark-free image URLs
    '''
    soup = BeautifulSoup(html, 'html.parser')
    script = soup.find('script', string=re.compile(r'window\.__INITIAL_STATE__'))

    # Keep everything after the first '=' (a bare split on every '='
    # would truncate the JSON if it contains one itself)
    string = script.string.split('=', 1)[1]
    # Patch the spots where the embedded state is not valid JSON
    string = string.replace('undefined', 'null')
    # Parse into a dict
    result = json.loads(string, strict=False)
    # Pull out the fields we need
    imageList = result['note']['note']['imageList']
    title = result['note']['note']['title']
    print('Title:', title)
    print('Starting download! 🚀')

    # Create a folder named after the note title; the base path can be customized
    file = os.path.dirname(__file__) + '/image/' + title
    mkdir(file)

    # Yield one download job per image
    for i in imageList:
        picUrl = f"https://sns-img-qc.xhscdn.com/{i['traceId']}"
        yield picUrl, i['traceId'], title

def download(url, filename, folder):
    '''
    Download one image
    '''
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 UBrowser/6.2.4098.3 Safari/537.36',
    }

    try:
        # Request first, then open the file, so a failed request
        # does not leave an empty .jpg behind
        r = requests.get(url, headers=headers)
        with open(f'image/{folder}/{filename}.jpg', 'wb') as v:
            v.write(r.content)
    except Exception as e:
        print('Image download failed!', e)

def roopLink(urls):
    '''
    Walk the urls and batch-download the watermark-free images
    '''
    for item in urls:
        html = fetchUrl(item)
        for url, traceId, title in parsing_link(html):
            print(f"download image {url}")
            download(url, traceId, title)

if __name__ == '__main__':
    # The Xiaohongshu note links to crawl
    links = ['https://www.xiaohongshu.com/explore/63f07247000000001300d67b','https://www.xiaohongshu.com/explore/60a5f16f0000000021034cb4']
    roopLink(links)
    print("Finished!🎉")
--------------------------------------------------------------------------------
/redBookPatchAll.py:
--------------------------------------------------------------------------------
import requests
from bs4 import BeautifulSoup
import os
import re
import json
import time
from selenium import webdriver
from selenium.webdriver import ChromeOptions

def mkdir(path):
    '''
    Create a folder if it does not already exist
    '''
    folder = os.path.exists(path)
    if not folder:  # only create the folder when it is missing
        print("--- Creating a new folder 😀 ---")
        os.makedirs(path)  # makedirs also creates any missing parent directories
        print("--- OK 🚩 ---")
    else:
        print("--- ⚠️ The folder already exists! ---")

def fetchUrl(url):
    '''
    Send the HTTP request and return the page source
    '''
    headers = {
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'cookie': '',  # replace with your own cookie~
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36',
    }

    r = requests.get(url, headers=headers)
    return r.text

def parsing_link(html):
    '''
    Parse the HTML, download the note's video if it has one,
    and yield the watermark-free image URLs
    '''
    soup = BeautifulSoup(html, 'html.parser')
    script = soup.find('script', string=re.compile(r'window\.__INITIAL_STATE__'))

    # Keep everything after the first '='
    string = script.string.split('=', 1)[1]
    # Patch the spots where the embedded state is not valid JSON
    string = string.replace('undefined', 'null')
    result = json.loads(string, strict=False)

    # A note carries either a video or an image list
    video = ''
    videoId = ''
    imageList = []
    if 'video' in result['note']['note']:
        video = result['note']['note']['video']['media']['stream']['h264'][0]['masterUrl']
        videoId = result['note']['note']['video']['media']['videoId']
    else:
        imageList = result['note']['note']['imageList']

    title = result['note']['note']['title']
    print('Title:', title)
    print('Starting download! 🚀')

    # Create a folder named after the note title; the base path can be customized
    file = os.path.dirname(__file__) + '/image/' + title
    mkdir(file)

    if video:
        downloadVideo(video, videoId, title)
    # Yield one download job per image
    for i in imageList:
        picUrl = f"https://sns-img-qc.xhscdn.com/{i['traceId']}"
        yield picUrl, i['traceId'], title

def download(url, filename, folder):
    '''
    Download one image
    '''
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 UBrowser/6.2.4098.3 Safari/537.36',
    }

    try:
        # Request first, then open the file, so a failed request
        # does not leave an empty .jpg behind
        r = requests.get(url, headers=headers)
        with open(f'image/{folder}/{filename}.jpg', 'wb') as v:
            v.write(r.content)
    except Exception as e:
        print('Image download failed!', e)

def downloadVideo(url, filename, folder):
    '''
    Download one video
    '''
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 UBrowser/6.2.4098.3 Safari/537.36',
    }

    try:
        r = requests.get(url, headers=headers)
        with open(f'image/{folder}/{filename}.mp4', 'wb') as v:
            v.write(r.content)
    except Exception as e:
        print('Video download failed!', e)

def roopLink(urls):
    '''
    Walk the urls and batch-download the watermark-free images and videos
    '''
    for item in urls:
        html = fetchUrl(item)
        for url, traceId, title in parsing_link(html):
            print(f"download image {url}")
            download(url, traceId, title)

if __name__ == '__main__':
    # Make the automated browser look less like automation
    option = ChromeOptions()
    option.add_experimental_option('excludeSwitches', ['enable-automation', 'enable-logging'])
    option.add_argument("--disable-blink-features")
    option.add_argument("--disable-blink-features=AutomationControlled")
    option.add_experimental_option("detach", True)

    # Create the Chrome browser object
    browser = webdriver.Chrome(options=option)

    # The Xiaohongshu profile page to crawl; replace with your target profile
    browser.get('https://www.xiaohongshu.com/user/profile/5c014959f7e8b935bc3cec68?appuid=5a2025504eacab20fa287e82&apptime=1679472390')

    # Give the page time to render, refresh, then wait for the note grid
    time.sleep(3)
    browser.refresh()
    time.sleep(5)

    pages = browser.page_source
    soup = BeautifulSoup(pages, 'html.parser')

    postId = []

    # Each cover anchor points at a note; rebuild the full note URL from its id
    for span in soup.find_all('a', class_='cover ld'):
        postId.append('https://www.xiaohongshu.com/explore/' + span.get('href').split('/')[4])

    print(postId)

    roopLink(postId)
    print("Finished!🎉")
--------------------------------------------------------------------------------
/srtToTxt.py:
--------------------------------------------------------------------------------
import os

def split2step(alist, step):
    '''
    Split alist into consecutive chunks of length step
    '''
    rs = []
    for i in range(0, len(alist), step):
        rs.append(alist[i:i+step])

    return rs

def mkdir(path):
    '''
    Create a folder if it does not already exist
    '''
    folder = os.path.exists(path)
    if not folder:  # only create the folder when it is missing
        print("--- Creating a new folder 😀 ---")
        os.makedirs(path)  # makedirs also creates any missing parent directories
        print("--- OK 🚩 ---")
    else:
        print("--- ⚠️ The folder already exists! ---")

path = "D:/python-demo/text"              # folder containing the .srt files
targetPath = 'D:/python-demo/targetText'  # folder the generated .txt files go into
files = os.listdir(path)                  # every file name under the source folder

mkdir(targetPath)  # create the target folder

for file in files:  # walk the source folder
    if not os.path.isdir(file):  # only open regular files, skip sub-folders
        with open(path + "/" + file, 'r', encoding='utf-8') as f:
            flines = f.readlines()
        # An .srt block is 4 lines: index, timestamps, text, blank line
        f4 = split2step(flines, 4)
        result = ""
        for item in f4:
            # A trailing partial block (e.g. a stray blank line) has no text line
            if len(item) >= 3:
                result = result + item[2].replace("\n", ",")

        targetFileName = file.split('.')[0] + '.txt'

        # Write into targetPath rather than a hard-coded folder name
        with open(f'{targetPath}/{targetFileName}', 'w', encoding='utf-8') as r:
            print(f'--- {targetFileName} converted successfully 🚩 ---')
            r.write(result)
--------------------------------------------------------------------------------
/text/content.srt:
--------------------------------------------------------------------------------
1
00:00:00,066 --> 00:00:03,733
He built the "underground steel Great Wall" that protects 1.4 billion people

2
00:00:03,766 --> 00:00:05,800
It can withstand any nuclear strike the enemy launches

3
00:00:05,800 --> 00:00:06,766
His name

4
00:00:06,766 --> 00:00:09,600
was kept a top state secret for 60 years

5
00:00:09,633 --> 00:00:10,833
Not until 2018,

6
00:00:10,833 --> 00:00:13,966
when he won the State Preeminent Science and Technology Award, did the name Qian Qihu

7
00:00:13,966 --> 00:00:16,100
become known to the world

8
00:00:16,166 --> 00:00:18,133
Yet Mr. Qian donated the entire 8 million yuan prize

9
00:00:18,133 --> 00:00:20,466
to students in impoverished areas

10
00:00:20,566 --> 00:00:23,200
He said, "Our forebears gave their lives for the country

11
00:00:23,266 --> 00:00:25,000
what could I possibly not give?"

12
00:00:25,033 --> 00:00:28,066
A peerless servant of the state, the backbone of the nation, the greatest of heroes

13
00:00:28,066 --> 00:00:30,500
For the country and the people, salute to Mr. Qian!

--------------------------------------------------------------------------------
/问题锦囊.md:
--------------------------------------------------------------------------------
## 1. `pip install` fails with the following error:


### Solution:


In your user directory on the C: drive, create a `pip` folder and, inside it, a `pip.ini` file with the following content (copy it verbatim):
```
[global]

index-url=http://mirrors.aliyun.com/pypi/simple

[install]

trusted-host=mirrors.aliyun.com
```
This resolves the pip install failures.
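If you prefer not to create the config file, the same mirror can also be passed to a single install on the command line; this one-off form (shown here with `requests` as an example package) is equivalent to the config above:
```
pip install requests -i http://mirrors.aliyun.com/pypi/simple --trusted-host mirrors.aliyun.com
```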

## 2. `json.loads()` raises an error, as shown below:


### Solution
Validate your JSON string at https://jsonlint.com/ ; if it is malformed, preprocess the data in code before calling `json.loads()`.
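For reference, the crawler scripts in this repo hit exactly this case: the page state embedded after `window.__INITIAL_STATE__=` contains JavaScript `undefined`, which is not valid JSON, so they map it to `null` first. A minimal sketch of that preprocessing (the `raw` value is illustrative):
```
import json

raw = 'window.__INITIAL_STATE__={"title":"demo","video":undefined}'

# Keep everything after the first '=' and patch `undefined` to valid JSON `null`
string = raw.split('=', 1)[1].replace('undefined', 'null')
result = json.loads(string, strict=False)
print(result)  # {'title': 'demo', 'video': None}
```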
--------------------------------------------------------------------------------