├── README.md
├── XhsContent.py
├── XhsTitle.py
└── main.py

/README.md:
--------------------------------------------------------------------------------
# Disclaimer

All content in this repository is for learning and reference only and must not be used commercially. No person or organization may use it for illegal purposes or to infringe the legitimate rights and interests of others. The crawling techniques involved are for study and research only and must not be used for large-scale crawling of other platforms or for any other unlawful activity. The repository accepts no liability for any legal consequences arising from the use of its content; by using it you agree to all terms of this disclaimer.

# xhs_spider

This crawler is aimed at beginners: capture a handful of simple values yourself and it is ready to use. It scrapes Xiaohongshu information and sends the results to you through WeChat Work (企业微信).

The project consists of four Python files. XhsTitle.py, XhsContent.py and wechat.py each work as standalone scripts; main.py imports all three and runs the complete workflow across multiple keywords and multiple pages.

# wechat.py
Sends messages and files through WeChat Work.
## Configuration
The following values need to be filled in by yourself:
self.CORPID = ""  # Enterprise ID; easy to find with any online tutorial
self.CORPSECRET = ""  # Application Secret; easy to find with any online tutorial
self.AGENTID = ""  # Application AgentId; easy to find with any online tutorial
self.TOUSER = userid  # userid(s) that receive the messages, passed in when the class is created; separate multiple ids with "|"
self.ACCESS_TOKEN_PATH = "access_token.conf"  # where the access_token is cached
## Create a WeChat object
# The constructor takes one parameter (the receiving userid(s), separated by "|")
chat = wechat.WeChat(userid)
## Functions
There are two functions:
### send_message(message)
Pass the text to send; it can only send plain text.
### send_file(file)
Pass the path of the file to send.

# XhsTitle.py
Scrapes search results through the Xiaohongshu mini program inside WeChat.
## Create a crawler object
# The constructor takes three parameters
xhs_spider = XhsTitle.XhsTitle(keyName, authorization, sortedWay)
# Keyword to search for
keyName = "Enter the keyword to search"
# Authorization token; it can be captured with Charles (many tutorials online) and is a string starting with wxmp
authorization = "wxmp.XXXXX"
# Sort order, one of three values: general (comprehensive), hot_desc (by popularity), create_time_desc (by publish time)
sortedWay = "general"
## Functions
# Fetch the search result pages from the mini program
idList = xhs_spider.getlist_by_name()
# Parse the results; if you also want the article bodies, the links passed to XhsContent come from this list as well
xhs_title = xhs_spider.get_title_url(idList)
# Collect the article links into a list
links = [d['文章链接'] for d in xhs_title]
# Column headers for the CSV (the keys produced by get_title_url)
fields = ['文章链接', '作者主页', '作者昵称', '文章标题', '获赞数量', '发布时间', '是否认证ID']
# Write to CSV; if no path is given, the file name is built from keyName, sortedWay and today's date
xhs_spider.xhs_to_csv(xhs_title, fields, path='path.csv')

# XhsContent.py
Fetches an article's content from its link.
## Create a content object
# Pass the article URL and the authorization token
xhs_content = XhsContent.XHSContent(url, authorization)
## Functions
# Fetch the article body
xhs = xhs_content.getdata()
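# Putting it together (minimal sketch)

The snippet below is an untested sketch of the flow described above: search with XhsTitle, collect the article links, then fetch each body with XhsContent. The keyword, the wxmp token and the 30-second delay are placeholders to adjust; only classes and methods from this repository are used.

```python
import time
import XhsTitle
import XhsContent

authorization = "wxmp.XXXXX"  # your captured token
spider = XhsTitle.XhsTitle("关键字", authorization, "general")
titles = spider.get_title_url(spider.getlist_by_name())
for item in titles:
    body = XhsContent.XHSContent(item['文章链接'], authorization).getdata()
    print(item['文章标题'], body)
    time.sleep(30)  # stay polite between requests
```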
--------------------------------------------------------------------------------

/XhsContent.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
import hashlib
import json

import pandas as pd
import urllib3
import requests
from bs4 import BeautifulSoup
import time
import re

# Requests below are made with verify=False, so silence the TLS warnings
urllib3.disable_warnings()


class XHSContent:
    """Fetches the body of a single Xiaohongshu note from its explore URL."""

    def __init__(self, url, Authorization):
        self.url = url
        self.Authorization = Authorization

    def header(self):
        # Headers for the mini-program API, carrying the wxmp Authorization token
        # (not used by getdata, which goes through html_header instead)
        headers = {
            'Accept-Encoding': 'gzip, deflate, br',
            'Accept-Language': 'zh-cn',
            'Connection': 'keep-alive',
            'Host': 'www.xiaohongshu.com',
            'Referer': "https://servicewechat.com/wxffc08ac7df482a27/346/page-frame.html",
            'User-Agent': "Mozilla/5.0 (iPhone; CPU iPhone OS 16_3_1 like Mac OS X) AppleWebKit/605.1.15 "
                          "(KHTML, like Gecko) Version/16.3 Mobile/15E148 Safari/604.1",
            'Authorization': self.Authorization
        }
        return headers

    def get_x_sign(self):
        # X-Sign header: "X" followed by md5(url + "WSUDD")
        x_sign = "X"
        m = hashlib.md5()
        m.update((self.url + "WSUDD").encode())
        x_sign = x_sign + m.hexdigest()
        return x_sign

    def html_header(self):
        # Headers for the plain HTML note page, signed with X-Sign
        headers = {
            'Accept-Encoding': 'gzip, deflate, br',
            'Accept-Language': 'zh-cn',
            'Connection': 'keep-alive',
            'Host': 'www.xiaohongshu.com',
            'User-Agent': "Mozilla/5.0 (iPhone; CPU iPhone OS 16_3_1 like Mac OS X) AppleWebKit/605.1.15 "
                          "(KHTML, like Gecko) Version/16.3 Mobile/15E148 Safari/604.1",
            'X-Sign': self.get_x_sign(),
        }
        return headers

    def getHtmlSession(self):
        # Download the note page and return it as a BeautifulSoup document
        ses = requests.session()
        html = ses.get(self.url, headers=self.html_header(), verify=False)
        soup = BeautifulSoup(html.content, 'html.parser')
        return soup

    def getdata(self):
        # Extract the note description from the page's application/ld+json block
        soup = self.getHtmlSession()
        script_tag = soup.find('script', {'type': 'application/ld+json'})
        if script_tag is None:
            print('无内容')
            return '无内容'
        else:
            json_ld_str = script_tag.string
            # Strip control characters and line breaks that would break json.loads
            cleaned_data = re.sub(r'[\x00-\x08\x0b-\x0c\x0e-\x1f\x7f-\x9f]', '', json_ld_str)
            cleaned_data = cleaned_data.replace('\n', '').replace('\r\n', '').replace('\t', '')
            json_ld = json.loads(cleaned_data)
            print(json_ld['description'])
            return json_ld['description']

    # def run(self):
    #     with open('xhs.csv', mode='a') as f:
    #         for url in self.urls:
    #             print(url)
    #             description = self.getdata(url)
    #             print(description)
    #             f.write(description + '\n')
    #             time.sleep(30)
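
# --- Optional helper (not part of the original script) ---
# A minimal sketch of a retry wrapper around getdata(): when a page comes back
# without an application/ld+json block, getdata() returns '无内容', and retrying
# after a pause may recover the content. The function name and defaults here are
# illustrative, not part of the original project.
def get_with_retry(url, authorization, retries=3, wait=30):
    for _ in range(retries):
        data = XHSContent(url, authorization).getdata()
        if data != '无内容':
            return data
        time.sleep(wait)
    return '无内容'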

if __name__ == "__main__":
    urls = ['https://www.xiaohongshu.com/explore/64ac08f0000000000f00c2ea',
            'https://www.xiaohongshu.com/explore/64adc895000000001c00cf3b']

    content = []
    for url in urls:
        # Replace 'Authorization' with your own wxmp token before running
        xhs_content = XHSContent(url, 'Authorization')
        entire_data = xhs_content.getdata()
        a_list = [url, entire_data]
        content.append(a_list)
        print(content)
        time.sleep(15)
    pd_data = pd.DataFrame(content)
    pd_data.to_csv('test.csv', encoding='utf-8-sig', index=False)
    print(pd_data)
--------------------------------------------------------------------------------

/XhsTitle.py:
--------------------------------------------------------------------------------
import hashlib
from urllib import parse

import pandas as pd
import requests
from bs4 import BeautifulSoup
from datetime import datetime
import json
import csv

# Today's date, used in the default CSV file name
today = datetime.today()
today_str = today.strftime('%Y%m%d')


class XhsTitle:
    """Searches the Xiaohongshu mini program and returns note metadata."""

    def __init__(self, key_name, authorization, sorted_way):
        self.key_name = key_name
        self.authorization = authorization
        self.sorted_way = sorted_way
        self.host = 'https://www.xiaohongshu.com'

    @staticmethod
    def get_x_sign(api):
        # X-Sign header: "X" followed by md5(api path + "WSUDD")
        x_sign = "X"
        m = hashlib.md5()
        m.update((api + "WSUDD").encode())
        x_sign = x_sign + m.hexdigest()
        return x_sign

    def spider(self, d_page, sort_by='general'):
        # Request one page (20 notes) of search results from the mini-program API
        url = f'/fe_api/burdock/weixin/v2/search/notes?keyword={parse.quote(self.key_name)}&sortBy={sort_by}' \
              f'&page={d_page + 1}&pageSize=20&prependNoteIds=&needGifCover=true'
        headers = {
            'User-Agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 16_5_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/15E148 MicroMessenger/8.0.38(0x1800262c) NetType/WIFI Language/zh_CN',
            'Referer': 'https://servicewechat.com',
            'Authorization': self.authorization,
            'X-Sign': self.get_x_sign(url)
        }
        resp = requests.get(url=self.host + url, headers=headers, timeout=5)
        if resp.status_code == 200:
            res = json.loads(resp.text)
            return res['data']['notes'], res['data']['totalCount']
        else:
            print(f'Fail:{resp.text}')
            # Return an empty page so the caller can stop cleanly
            return [], 0

    def getlist_by_name(self, page_range=5):
        notes = []
        # A mini-program search currently returns about 100 results, fetched here as 5 pages
        for i in range(0, page_range):
            tmp = self.spider(d_page=i, sort_by=self.sorted_way)
            if len(tmp[0]) <= 0:
                break
            else:
                notes.extend(tmp[0])
                print(tmp[0])
        return notes

    @staticmethod
    def get_info(ids):
        # Fetch each note page and parse its application/ld+json metadata
        # (not used by main.py)
        infolist = []
        for note_id in ids:
            url = f"https://www.xiaohongshu.com/explore/{note_id}"
            headers = {
                "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
                "Accept-Encoding": "gzip, deflate, br",
                "Accept-Language": "zh-CN,zh-Hans;q=0.9",
                "Connection": "keep-alive",
                "Host": "www.xiaohongshu.com",
                "User-Agent": "Mozilla/5.0 (iPhone; CPU iPhone OS 16_3_1 like Mac OS X) AppleWebKit/605.1.15 "
                              "(KHTML, like Gecko) Version/16.3 Mobile/15E148 Safari/604.1"
            }
            resp = requests.get(url, headers=headers)
            resp.encoding = resp.apparent_encoding
            html = resp.text
            soup = BeautifulSoup(html, 'lxml')
            json_str = soup.find(attrs={'type': 'application/ld+json'}).text
            json_str = json_str.replace('\n', '').replace('\r\n', '')
            info_dic = json.loads(json_str, strict=False)
            info_dic['link'] = url
            if info_dic['name'] != '':
                infolist.append(info_dic)
        return infolist

    @staticmethod
    def get_title_url(xhs_data):
        # Turn the raw note records into dictionaries with Chinese column names
        new_data = []
        for item in xhs_data:
            new_data.append({
                '文章链接': f"https://www.xiaohongshu.com/explore/{item['id']}",
                '作者主页': f'https://www.xiaohongshu.com/user/profile/{item["user"]["id"]}',
                '作者昵称': item['user']['nickname'],
                '文章标题': item['title'],
                '获赞数量': item['likes'],
                '发布时间': item['time'],
                '是否认证ID': item['user']['officialVerified']
            })

        return new_data

    def xhs_to_csv(self, data, field, path='x'):
        # Write the parsed records to CSV; without an explicit path the file name
        # is built from today's date, the keyword and the sort order
        if path == 'x':
            with open(f'{today_str}{self.key_name}{self.sorted_way}.csv', 'w', newline='', encoding='utf-8-sig') as f:
                writer = csv.DictWriter(f, fieldnames=field)
                writer.writeheader()
                writer.writerows(data)
            print(f'保存成功,文件名为:{today_str}{self.key_name}{self.sorted_way}.csv')
        else:
            with open(path, 'w', newline='', encoding='utf-8-sig') as f:
                writer = csv.DictWriter(f, fieldnames=field)
                writer.writeheader()
                writer.writerows(data)
            print(f'保存成功,文件名为:{path}')
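
# --- Optional helper (not part of the original script) ---
# spider() also returns the result count reported by the API, which
# getlist_by_name() ignores. A rough sketch that derives the number of pages
# (20 notes per page) from that count instead of a fixed page_range; the helper
# name and the max_pages cap are illustrative. title_spider is an XhsTitle instance.
def getlist_by_count(title_spider, max_pages=10):
    notes, total = title_spider.spider(d_page=0, sort_by=title_spider.sorted_way)
    pages = min(max_pages, -(-total // 20))  # ceil(total / 20), capped at max_pages
    for i in range(1, pages):
        tmp = title_spider.spider(d_page=i, sort_by=title_spider.sorted_way)
        if not tmp[0]:
            break
        notes.extend(tmp[0])
    return notes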

if __name__ == "__main__":
    # The three values below are the constructor parameters of XhsTitle
    # Keyword to search for
    keyName = "请输入要搜索的关键词"
    # Authorization token; it can be captured with Charles (many tutorials online)
    authorization = "wxmp.XXXX"
    # Sort order: general (comprehensive), hot_desc (popularity), create_time_desc (publish time)
    sortedWay = "general"
    fields = ['文章链接', '作者主页', '作者昵称', '文章标题', '获赞数量', '发布时间', '是否认证ID']
    # Create the crawler object
    xhs_spider = XhsTitle(keyName, authorization, sortedWay)
    # Fetch the search result pages from the mini program
    idList = xhs_spider.getlist_by_name()
    # Parse the results; the links used by XhsContent also come from this list
    xhs_title = xhs_spider.get_title_url(idList)
    # Collect the article links into a list
    links = [d['文章链接'] for d in xhs_title]
    # Write to CSV (with no path argument the file name would contain keyName, sortedWay and today's date)
    xhs_spider.xhs_to_csv(xhs_title, fields, path='basic.csv')
--------------------------------------------------------------------------------

/main.py:
--------------------------------------------------------------------------------
import pandas as pd
import time
import random
from datetime import datetime
import wechat
import os
import logging
import XhsContent
import XhsTitle

# fields = ['文章链接', '作者主页', '作者昵称', '文章标题', '获赞数量', '发布时间', '是否认证ID']
# Keywords to search for
key = [""]
# Authorization token; it can be captured with Charles (many tutorials online)
authorization = "wxmp.XXXXX"
# Sort orders: general (comprehensive), hot_desc (popularity), create_time_desc (publish time)
sort = ["create_time_desc", 'general', 'hot_desc']
times = 0
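# The loop below runs the whole pipeline once per keyword × sort-order combination:
#   1. XhsTitle fetches the search results and turns them into title records
#      (link, author, title, likes, publish time, verification flag).
#   2. XhsContent downloads every article body, pausing 20-40 seconds between requests.
#   3. Titles and bodies are merged on '文章链接' and appended to xhs_db.csv,
#      followed by a 500-700 second pause before the next combination.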
# Loop over the keywords and sort orders
for keyName in key:
    for sortedWay in sort:
        times = times + 1
        print(f'一共进行了{times}次循环')
        try:
            # Create the title crawler
            xhs_title = XhsTitle.XhsTitle(key_name=keyName, authorization=authorization, sorted_way=sortedWay)
            # Fetch the search result pages from the mini program
            idList = xhs_title.getlist_by_name()
            # Parse the results
            xhs_basic = xhs_title.get_title_url(idList)
            # Convert to a DataFrame for the merge later on
            pd_basic = pd.DataFrame(xhs_basic)
            # Drop rows with a duplicated article link
            pd_basic.drop_duplicates('文章链接', inplace=True, keep='last')
            # Collect the article links into a list
            links = [d['文章链接'] for d in xhs_basic]
            # Optionally save the titles and links to their own CSV file
            # xhs_title.xhs_to_csv(xhs_basic, field=fields)

            # # If a previously saved title CSV has the wrong encoding, convert it to UTF-8 first:
            # with open('20230707hot_desc.csv', 'r', encoding='GB2312') as f:
            #     data = f.read()
            # with open('20230707hot_desc.csv', 'w', encoding='utf-8') as f:
            #     f.write(data)

            # Fetch every article body through its link
            content = []
            for url in links:
                xhs_content = XhsContent.XHSContent(url, authorization)
                xhs = xhs_content.getdata()
                time.sleep(random.randint(20, 40))
                a_list = [url, xhs]
                content.append(a_list)

            # Convert to a DataFrame and rename the default 0/1 columns
            pd_content = pd.DataFrame(content)
            pd_content = pd_content.rename(columns={0: "文章链接", 1: "文章内容"})
            # Optionally save the bodies to their own CSV file
            # pd_content.to_csv(f'{today_str}{keyName}{sortedWay}content.csv', index=False, encoding='utf-8-sig')
            # Merge titles and bodies on the article link
            xhs_pd = pd.merge(pd_basic, pd_content, on='文章链接', how='left')

            # Record the crawl date, keyword and sort order
            xhs_pd['抓取日期'] = pd.to_datetime('now').date()
            xhs_pd['品牌'] = keyName
            xhs_pd['抓取方式'] = sortedWay

            # Append to the CSV database (write the header only when the file does not exist yet)
            xhs_pd.to_csv('xhs_db.csv', index=False, mode='a', header=not os.path.exists('xhs_db.csv'))
            # Wait 500-700 seconds between runs to avoid anti-crawling measures
            time.sleep(random.randint(500, 700))
        except Exception:
            logging.exception("Error occurred")
            continue


def duplicates(s, t):
    # Random crawls inevitably pick up repeated notes: merge the fresh CSV with the
    # existing Excel database and keep only the latest copy of each article link
    # (the Excel database may not exist yet on the very first run)
    t = pd.read_excel(t) if os.path.exists(t) else pd.DataFrame()
    s = pd.read_csv(s)
    df = pd.concat([t, s], ignore_index=True)
    df.drop_duplicates(subset='文章链接', inplace=True, keep='last')
    df.to_excel('xhs_db.xlsx', index=False)


def sent_file(sent_id, path):
    # Send the result file to the given WeChat Work userid
    chat = wechat.WeChat(sent_id)
    chat.send_file(path)


if __name__ == "__main__":
    source = 'xhs_db.csv'
    target = 'xhs_db.xlsx'
    duplicates(source, target)
    sent_file('12345', 'xhs_db.xlsx')

    if os.path.exists(source):
        os.remove(source)
        print(f"文件 {source} 已成功删除")
    else:
        print(f"文件 {source} 不存在")
--------------------------------------------------------------------------------