├── README.md
├── redBook.py
├── redBookPatchAll.py
├── srtToTxt.py
├── text
│   └── content.srt
└── 问题锦囊.md

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------

# python-demo
Crawler demos: download watermark-free images and videos from Xiaohongshu (小红书), convert .srt video-subtitle files to .txt files, and more.

## 1. Crawling Xiaohongshu images and videos
### 1.1 To download the watermark-free images of individual posts, run the corresponding Python file:
```
python .\redBook.py
```
### Tips:
1. Replace the cookie in the file with your own.
2. Collect the Xiaohongshu post links into an array (see the example below).
3. By default the images are saved to the `image` folder under the current directory; you can customize the path.
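
For reference, the post links are just a plain Python list near the bottom of `redBook.py`; these two IDs are the samples already in the script:
```
links = ['https://www.xiaohongshu.com/explore/63f07247000000001300d67b',
         'https://www.xiaohongshu.com/explore/60a5f16f0000000021034cb4']
roopLink(links)
```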

### 1.2 To download all watermark-free images and videos from a user's profile page, run:
```
python .\redBookPatchAll.py
```
### Tips:
1. Replace the cookie in the file with your own.
2. Get the link of the Xiaohongshu user profile page and swap it into the `browser.get(...)` call in the `__main__` block of the script (shown below).
3. By default the downloaded resources are saved to the `image` folder under the current directory; you can customize the path.
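
The profile URL lives in this `browser.get(...)` call in `redBookPatchAll.py`; replace it with the profile you want to crawl:
```
browser.get('https://www.xiaohongshu.com/user/profile/5c014959f7e8b935bc3cec68?appuid=5a2025504eacab20fa287e82&apptime=1679472390')
```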

## 2. Converting .srt subtitle files to .txt files
Run the corresponding Python file:
```
python .\srtToTxt.py
```
### Tips:
1. Customize the `path` / `targetPath` directories to your needs.
2. `path` is the folder holding all the .srt subtitle files; `targetPath` is the folder the converted .txt files are written to. The expected cue format is shown below.
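
The converter assumes every SRT cue spans exactly four lines (index, timestamps, subtitle text, blank line) and keeps only the third one. The first cue of `text/content.srt`, for example:
```
1
00:00:00,066 --> 00:00:03,733
是他建造了保护14亿人民的“地下钢铁长城”

```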

--------------------------------------------------------------------------------
/redBook.py:
--------------------------------------------------------------------------------

import requests
from bs4 import BeautifulSoup
import os
import re
import json

def mkdir(path):
    '''
    Create a folder if it does not exist yet.
    '''
    folder = os.path.exists(path)
    if not folder:  # only create the folder if it does not exist
        print("--- Creating a new folder 😀 ---")
        os.makedirs(path)  # makedirs also creates any missing parent directories
        print("--- OK 🚩 ---")
    else:
        print("--- ⚠️ Folder already exists! ---")

def fetchUrl(url):
    '''
    Send the request and return the page source.
    '''
    headers = {
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'cookie': '',  # replace with your own cookie
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 UBrowser/6.2.4098.3 Safari/537.36',
    }

    r = requests.get(url, headers=headers)
    return r.text

def parsing_link(html):
    '''
    Parse the HTML and yield the watermark-free image URLs.
    '''
    soup = BeautifulSoup(html, 'html.parser')
    script = soup.find('script', string=re.compile(r'window\.__INITIAL_STATE__'))

    # split on the assignment only once, so '=' characters inside the JSON survive
    test = re.split(r'window\.__INITIAL_STATE__\s*=', script.string, maxsplit=1)
    # patch the parts of the string that are not valid JSON
    string = test[1].replace('undefined', 'null')
    # parse into a Python object
    result = json.loads(string, strict=False)
    # pick out the fields we need
    imageList = result['note']['note']['imageList']
    title = result['note']['note']['title']
    print('Title:', title)
    print('Starting download! 🚀')

    # create a folder named after the title; the save path can be customized here
    file = os.path.dirname(__file__) + '/image/' + title
    mkdir(file)

    # yield the images one by one
    for i in imageList:
        picUrl = f"https://sns-img-qc.xhscdn.com/{i['traceId']}"
        yield picUrl, i['traceId'], title

def download(url, filename, folder):
    '''
    Download one image.
    '''
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 UBrowser/6.2.4098.3 Safari/537.36',
    }

    try:
        # request first, so a failed download leaves no empty file behind
        r = requests.get(url, headers=headers)
        with open(f'image/{folder}/{filename}.jpg', 'wb') as v:
            v.write(r.content)
    except Exception as e:
        print('Image download failed!', e)

def roopLink(urls):
    '''
    Loop over the post URLs and batch-download the watermark-free images.
    '''
    for item in urls:
        html = fetchUrl(item)
        for url, traceId, title in parsing_link(html):
            print(f"download image {url}")
            download(url, traceId, title)

if __name__ == '__main__':
    # the Xiaohongshu post links to download
    links = ['https://www.xiaohongshu.com/explore/63f07247000000001300d67b', 'https://www.xiaohongshu.com/explore/60a5f16f0000000021034cb4']
    roopLink(links)
    print("Finished!🎉")

--------------------------------------------------------------------------------
/redBookPatchAll.py:
--------------------------------------------------------------------------------

import requests
from bs4 import BeautifulSoup
import os
import re
import json
import time
from selenium import webdriver
from selenium.webdriver import ChromeOptions

def mkdir(path):
    '''
    Create a folder if it does not exist yet.
    '''
    folder = os.path.exists(path)
    if not folder:  # only create the folder if it does not exist
        print("--- Creating a new folder 😀 ---")
        os.makedirs(path)  # makedirs also creates any missing parent directories
        print("--- OK 🚩 ---")
    else:
        print("--- ⚠️ Folder already exists! ---")

def fetchUrl(url):
    '''
    Send the request and return the page source.
    '''
    headers = {
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'cookie': 'xhsTrackerId=7ff0cee2-d318-4845-a5f1-7f4fb9acb9c3; xhsTrackerId.sig=hrWK6Hw0SUZ8mRCiVj0KO1K4nyb5Rbr3cxcAneVDyJY; a1=186823c9460khhoyfzypkmiw8cprofje475xhyso150000115335; webId=bbb653427b6f8ad0d223ff07d448275f; gid=yYKYJqSjjJ4dyYKYJqSj4WvMK8TxxlviuvUT2ly63EYSUC28FliVdx888yy2qq28j0DDy2dS; gid.sign=SVNqT90rFZJP8Hpu1fDr6lxAv7Q=; web_session=030037a4cb351b0516ff232780244a0727192f; customerClientId=918124472893177; x-user-id-ark.xiaohongshu.com=62b982519a415e00014f6c2f; timestamp2=1677477939195dfe19d311adbcb4966d1bc2c3da33a8ec424ce52a391b32474; timestamp2.sig=RuqNBEIFoHscqX8BtjGLcla6Yn5Z36oIRc4hMvXN1iI; gr_user_id=711f3d34-fd16-4ba2-bc61-f09ca8023256; x-user-id-eva.xiaohongshu.com=62b982519a415e00014f6c2f; xhsTracker=url=user-profile&xhsshare=CopyLink; xhsTracker.sig=WS8d3HYlzoIfhHjyJtY_Y1QP5iYacJ96TpUFr1hgfm4; extra_exp_ids=yamcha_0327_exp,h5_1208_exp3; extra_exp_ids.sig=ANlofVKSDcIxHrXW_rvDettMT1wABiN2baUCClhZnYI; webBuild=2.0.3; websectiga=82e85efc5500b609ac1166aaf086ff8aa4261153a448ef0be5b17417e4512f28; sec_poison_id=18a62e3c-9284-4e5d-a196-c777ed2a4c6a; xsecappid=yamcha',  # replace with your own cookie
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36',
    }

    r = requests.get(url, headers=headers)
    return r.text

def parsing_link(html):
    '''
    Parse the HTML; download the video if the note is a video note,
    otherwise yield the watermark-free image URLs.
    '''
    soup = BeautifulSoup(html, 'html.parser')
    script = soup.find('script', string=re.compile(r'window\.__INITIAL_STATE__'))

    # split on the assignment only once, so '=' characters inside the JSON survive
    test = re.split(r'window\.__INITIAL_STATE__\s*=', script.string, maxsplit=1)
    # patch the parts of the string that are not valid JSON
    string = test[1].replace('undefined', 'null')
    # parse into a Python object
    result = json.loads(string, strict=False)
    # pick out the fields we need: a note holds either a video or an image list
    video = ''
    videoId = ''
    imageList = []
    if 'video' in result['note']['note']:
        video = result['note']['note']['video']['media']['stream']['h264'][0]['masterUrl']
        videoId = result['note']['note']['video']['media']['videoId']
    else:
        imageList = result['note']['note']['imageList']

    title = result['note']['note']['title']
    print('Title:', title)
    print('Starting download! 🚀')

    # create a folder named after the title; the save path can be customized here
    file = os.path.dirname(__file__) + '/image/' + title
    mkdir(file)

    if video:
        downloadVideo(video, videoId, title)
    # yield the images one by one
    for i in imageList:
        picUrl = f"https://sns-img-qc.xhscdn.com/{i['traceId']}"
        yield picUrl, i['traceId'], title

def download(url, filename, folder):
    '''
    Download one image.
    '''
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 UBrowser/6.2.4098.3 Safari/537.36',
    }

    try:
        # request first, so a failed download leaves no empty file behind
        r = requests.get(url, headers=headers)
        with open(f'image/{folder}/{filename}.jpg', 'wb') as v:
            v.write(r.content)
    except Exception as e:
        print('Image download failed!', e)

def downloadVideo(url, filename, folder):
    '''
    Download one video.
    '''
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 UBrowser/6.2.4098.3 Safari/537.36',
    }

    try:
        # request first, so a failed download leaves no empty file behind
        r = requests.get(url, headers=headers)
        with open(f'image/{folder}/{filename}.mp4', 'wb') as v:
            v.write(r.content)
    except Exception as e:
        print('Video download failed!', e)

def roopLink(urls):
    '''
    Loop over the post URLs and batch-download the watermark-free images and videos.
    '''
    for item in urls:
        html = fetchUrl(item)
        for url, traceId, title in parsing_link(html):
            print(f"download image {url}")
            download(url, traceId, title)

if __name__ == '__main__':
    option = ChromeOptions()
    option.add_experimental_option('excludeSwitches', ['enable-automation', 'enable-logging'])
    option.add_argument("--disable-blink-features")
    option.add_argument("--disable-blink-features=AutomationControlled")
    option.add_experimental_option("detach", True)

    # create the Chrome browser instance
    browser = webdriver.Chrome(options=option)

    # the Xiaohongshu profile page to crawl; replace with the profile you want
    browser.get('https://www.xiaohongshu.com/user/profile/5c014959f7e8b935bc3cec68?appuid=5a2025504eacab20fa287e82&apptime=1679472390')

    # wait for the initial load, refresh once, then wait again so the note list is fully rendered
    time.sleep(3)
    browser.refresh()
    time.sleep(5)

    pages = browser.page_source
    soup = BeautifulSoup(pages, 'html.parser')

    postId = []

    # each note card links to /explore/<noteId>; rebuild the full post URLs
    for span in soup.find_all('a', class_='cover ld'):
        postId.append('https://www.xiaohongshu.com/explore/' + span.get('href').split('/')[4])

    print(postId)

    roopLink(postId)
    print("Finished!🎉")

--------------------------------------------------------------------------------
/srtToTxt.py:
--------------------------------------------------------------------------------

import os

def split2step(alist, step):
    '''
    Split a list into chunks of `step` consecutive items.
    '''
    rs = []
    for i in range(0, len(alist), step):
        rs.append(alist[i:i + step])

    return rs

def mkdir(path):
    '''
    Create a folder if it does not exist yet.
    '''
    folder = os.path.exists(path)
    if not folder:  # only create the folder if it does not exist
        print("--- Creating a new folder 😀 ---")
        os.makedirs(path)  # makedirs also creates any missing parent directories
        print("--- OK 🚩 ---")
    else:
        print("--- ⚠️ Folder already exists! ---")
---") 21 | 22 | path = "D:/python-demo/text" #文件夹目录 23 | targetPath = 'D:/python-demo/targetText' #存储的目标文件夹 24 | files= os.listdir(path) #得到文件夹下的所有文件名称 25 | 26 | mkdir(targetPath) # 创建目标结果文件夹 27 | 28 | for file in files: #遍历文件夹 29 | if not os.path.isdir(file): #判断是否是文件夹,不是文件夹才打开 30 | flines = open(path+"/"+file, 'r', encoding='utf-8').readlines() 31 | f4 = split2step(flines,4) 32 | result = "" 33 | for item in f4: 34 | result = result+item[2].replace("\n",",") 35 | 36 | targetFileName = file.split('.')[0]+'.txt' 37 | 38 | with open(f'targetText/{targetFileName}','w',encoding='utf-8') as r: 39 | print(f'---{targetFileName} 转换成功 🚩---') 40 | r.write(result) 41 | -------------------------------------------------------------------------------- /text/content.srt: -------------------------------------------------------------------------------- 1 | 1 2 | 00:00:00,066 --> 00:00:03,733 3 | 是他建造了保护14亿人民的“地下钢铁长城” 4 | 5 | 2 6 | 00:00:03,766 --> 00:00:05,800 7 | 能抵挡敌人的任何核打击 8 | 9 | 3 10 | 00:00:05,800 --> 00:00:06,766 11 | 他的名字 12 | 13 | 4 14 | 00:00:06,766 --> 00:00:09,600 15 | 曾作为国家最高机密学长达60年 16 | 17 | 5 18 | 00:00:09,633 --> 00:00:10,833 19 | 直到2018年 20 | 21 | 6 22 | 00:00:10,833 --> 00:00:13,966 23 | 荣获国家最高科学技术奖,钱七虎 24 | 25 | 7 26 | 00:00:13,966 --> 00:00:16,100 27 | 这个名字才被世人知晓 28 | 29 | 8 30 | 00:00:16,166 --> 00:00:18,133 31 | 然而钱老却把800万奖金 32 | 33 | 9 34 | 00:00:18,133 --> 00:00:20,466 35 | 全数捐给贫困地区的学生 36 | 37 | 10 38 | 00:00:20,566 --> 00:00:23,200 39 | 他说“我们的先辈为国捐躯 40 | 41 | 11 42 | 00:00:23,266 --> 00:00:25,000 43 | 我还有什么不能捐的? 44 | 45 | 12 46 | 00:00:25,033 --> 00:00:28,066 47 | 国士无双, 民族脊梁,侠之大者 48 | 49 | 13 50 | 00:00:28,066 --> 00:00:30,500 51 | 为国为民,向钱老致敬! 52 | 53 | -------------------------------------------------------------------------------- /问题锦囊.md: -------------------------------------------------------------------------------- 1 | ## 一、```pip install ``` 执行pip安装操作以下错误: 2 | ![ebPIA.png](https://i.328888.xyz/2023/02/27/ebPIA.png) 3 | 4 | ### 解决方法如下: 5 | ![ebaqN.png](https://i.328888.xyz/2023/02/27/ebaqN.png) 6 | 7 | 在C盘的用户目录下,新建pip文件夹,然后新建pip.ini文件,内容如下(可复制直接使用): 8 | ``` 9 | [global] 10 | 11 | index-url=http://mirrors.aliyun.com/pypi/simple 12 | 13 | [install] 14 | 15 | trusted-host=mirrors.aliyun.com 16 | ``` 17 | 这样就可以解决pip安装失败的问题了 18 | 19 | ## 二、调用json.loads()方法报错,如图所示: 20 | ![ebTGo.png](https://i.328888.xyz/2023/02/27/ebTGo.png) 21 | 22 | ### 解决方案 23 | 使用这个地址去验证你的json字符串是否合理,https://jsonlint.com/ ,不合理的话,对应需要用代码处理一下数据,然后再调用 24 | --------------------------------------------------------------------------------