├── README.md
├── XhsContent.py
├── XhsTitle.py
└── main.py

/README.md:
--------------------------------------------------------------------------------
# Disclaimer

All content in this repository is for learning and reference only and must not be used commercially. No person or organization may use it for illegal purposes or to infringe the legitimate rights and interests of others. The crawling techniques involved are for study and research only and must not be used for large-scale crawling of other platforms or for any other unlawful activity. The repository accepts no liability for any legal consequences arising from the use of its content; by using it you agree to all terms of this disclaimer.

# xhs_spider

This crawler is aimed at beginners: capture a handful of simple values yourself and it is ready to use. It scrapes Xiaohongshu information and sends the results to you through WeChat Work (企业微信).

The project consists of four Python files. XhsTitle.py, XhsContent.py and wechat.py each work as standalone scripts; main.py imports all three and runs the complete workflow across multiple keywords and multiple pages.

# wechat.py
Sends messages and files through WeChat Work.
## Configuration
The following values need to be filled in by yourself:
self.CORPID = ""  # Enterprise ID; easy to find with any online tutorial
self.CORPSECRET = ""  # Application Secret; easy to find with any online tutorial
self.AGENTID = ""  # Application AgentId; easy to find with any online tutorial
self.TOUSER = userid  # userid(s) that receive the messages, passed in when the class is created; separate multiple ids with "|"
self.ACCESS_TOKEN_PATH = "access_token.conf"  # where the access_token is cached
## Create a WeChat object
# The constructor takes one parameter (the receiving userid(s), separated by "|")
chat = wechat.WeChat(userid)
## Functions
There are two functions:
### send_message(message)
Pass the text to send; it can only send plain text.
### send_file(file)
Pass the path of the file to send.

# XhsTitle.py
Scrapes search results through the Xiaohongshu mini program inside WeChat.
## Create a crawler object
# The constructor takes three parameters
xhs_spider = XhsTitle.XhsTitle(keyName, authorization, sortedWay)
# Keyword to search for
keyName = "Enter the keyword to search"
# Authorization token; it can be captured with Charles (many tutorials online) and is a string starting with wxmp
authorization = "wxmp.XXXXX"
# Sort order, one of three values: general (comprehensive), hot_desc (by popularity), create_time_desc (by publish time)
sortedWay = "general"
## Functions
# Fetch the search result pages from the mini program
idList = xhs_spider.getlist_by_name()
# Parse the results; if you also want the article bodies, the links passed to XhsContent come from this list as well
xhs_title = xhs_spider.get_title_url(idList)
# Collect the article links into a list
links = [d['文章链接'] for d in xhs_title]
# Column headers for the CSV (the keys produced by get_title_url)
fields = ['文章链接', '作者主页', '作者昵称', '文章标题', '获赞数量', '发布时间', '是否认证ID']
# Write to CSV; if no path is given, the file name is built from keyName, sortedWay and today's date
xhs_spider.xhs_to_csv(xhs_title, fields, path='path.csv')

# XhsContent.py
Fetches an article's content from its link.
## Create a content object
# Pass the article URL and the authorization token
xhs_content = XhsContent.XHSContent(url, authorization)
## Functions
# Fetch the article body
xhs = xhs_content.getdata()
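# Putting it together (minimal sketch)

The snippet below is an untested sketch of the flow described above: search with XhsTitle, collect the article links, then fetch each body with XhsContent. The keyword, the wxmp token and the 30-second delay are placeholders to adjust; only classes and methods from this repository are used.

```python
import time
import XhsTitle
import XhsContent

authorization = "wxmp.XXXXX"  # your captured token
spider = XhsTitle.XhsTitle("关键字", authorization, "general")
titles = spider.get_title_url(spider.getlist_by_name())
for item in titles:
    body = XhsContent.XHSContent(item['文章链接'], authorization).getdata()
    print(item['文章标题'], body)
    time.sleep(30)  # stay polite between requests
```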
--------------------------------------------------------------------------------

/XhsContent.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
import hashlib
import json

import pandas as pd
import urllib3
import requests
from bs4 import BeautifulSoup
import time
import re

# Requests below are made with verify=False, so silence the TLS warnings
urllib3.disable_warnings()


class XHSContent:
    """Fetches the body of a single Xiaohongshu note from its explore URL."""

    def __init__(self, url, Authorization):
        self.url = url
        self.Authorization = Authorization

    def header(self):
        # Headers for the mini-program API, carrying the wxmp Authorization token
        # (not used by getdata, which goes through html_header instead)
        headers = {
            'Accept-Encoding': 'gzip, deflate, br',
            'Accept-Language': 'zh-cn',
            'Connection': 'keep-alive',
            'Host': 'www.xiaohongshu.com',
            'Referer': "https://servicewechat.com/wxffc08ac7df482a27/346/page-frame.html",
            'User-Agent': "Mozilla/5.0 (iPhone; CPU iPhone OS 16_3_1 like Mac OS X) AppleWebKit/605.1.15 "
                          "(KHTML, like Gecko) Version/16.3 Mobile/15E148 Safari/604.1",
            'Authorization': self.Authorization
        }
        return headers

    def get_x_sign(self):
        # X-Sign header: "X" followed by md5(url + "WSUDD")
        x_sign = "X"
        m = hashlib.md5()
        m.update((self.url + "WSUDD").encode())
        x_sign = x_sign + m.hexdigest()
        return x_sign

    def html_header(self):
        # Headers for the plain HTML note page, signed with X-Sign
        headers = {
            'Accept-Encoding': 'gzip, deflate, br',
            'Accept-Language': 'zh-cn',
            'Connection': 'keep-alive',
            'Host': 'www.xiaohongshu.com',
            'User-Agent': "Mozilla/5.0 (iPhone; CPU iPhone OS 16_3_1 like Mac OS X) AppleWebKit/605.1.15 "
                          "(KHTML, like Gecko) Version/16.3 Mobile/15E148 Safari/604.1",
            'X-Sign': self.get_x_sign(),
        }
        return headers

    def getHtmlSession(self):
        # Download the note page and return it as a BeautifulSoup document
        ses = requests.session()
        html = ses.get(self.url, headers=self.html_header(), verify=False)
        soup = BeautifulSoup(html.content, 'html.parser')
        return soup

    def getdata(self):
        # Extract the note description from the page's application/ld+json block
        soup = self.getHtmlSession()
        script_tag = soup.find('script', {'type': 'application/ld+json'})
        if script_tag is None:
            print('无内容')
            return '无内容'
        else:
            json_ld_str = script_tag.string
            # Strip control characters and line breaks that would break json.loads
            cleaned_data = re.sub(r'[\x00-\x08\x0b-\x0c\x0e-\x1f\x7f-\x9f]', '', json_ld_str)
            cleaned_data = cleaned_data.replace('\n', '').replace('\r\n', '').replace('\t', '')
            json_ld = json.loads(cleaned_data)
            print(json_ld['description'])
            return json_ld['description']

    # def run(self):
    #     with open('xhs.csv', mode='a') as f:
    #         for url in self.urls:
    #             print(url)
    #             description = self.getdata(url)
    #             print(description)
    #             f.write(description + '\n')
    #             time.sleep(30)
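
# --- Optional helper (not part of the original script) ---
# A minimal sketch of a retry wrapper around getdata(): when a page comes back
# without an application/ld+json block, getdata() returns '无内容', and retrying
# after a pause may recover the content. The function name and defaults here are
# illustrative, not part of the original project.
def get_with_retry(url, authorization, retries=3, wait=30):
    for _ in range(retries):
        data = XHSContent(url, authorization).getdata()
        if data != '无内容':
            return data
        time.sleep(wait)
    return '无内容'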

if __name__ == "__main__":
    urls = ['https://www.xiaohongshu.com/explore/64ac08f0000000000f00c2ea',
            'https://www.xiaohongshu.com/explore/64adc895000000001c00cf3b']

    content = []
    for url in urls:
        # Replace 'Authorization' with your own wxmp token before running
        xhs_content = XHSContent(url, 'Authorization')
        entire_data = xhs_content.getdata()
        a_list = [url, entire_data]
        content.append(a_list)
        print(content)
        time.sleep(15)
    pd_data = pd.DataFrame(content)
    pd_data.to_csv('test.csv', encoding='utf-8-sig', index=False)
    print(pd_data)
--------------------------------------------------------------------------------

/XhsTitle.py:
--------------------------------------------------------------------------------
import hashlib
from urllib import parse

import pandas as pd
import requests
from bs4 import BeautifulSoup
from datetime import datetime
import json
import csv

# Today's date, used in the default CSV file name
today = datetime.today()
today_str = today.strftime('%Y%m%d')


class XhsTitle:
    """Searches the Xiaohongshu mini program and returns note metadata."""

    def __init__(self, key_name, authorization, sorted_way):
        self.key_name = key_name
        self.authorization = authorization
        self.sorted_way = sorted_way
        self.host = 'https://www.xiaohongshu.com'

    @staticmethod
    def get_x_sign(api):
        # X-Sign header: "X" followed by md5(api path + "WSUDD")
        x_sign = "X"
        m = hashlib.md5()
        m.update((api + "WSUDD").encode())
        x_sign = x_sign + m.hexdigest()
        return x_sign

    def spider(self, d_page, sort_by='general'):
        # Request one page (20 notes) of search results from the mini-program API
        url = f'/fe_api/burdock/weixin/v2/search/notes?keyword={parse.quote(self.key_name)}&sortBy={sort_by}' \
              f'&page={d_page + 1}&pageSize=20&prependNoteIds=&needGifCover=true'
        headers = {
            'User-Agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 16_5_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/15E148 MicroMessenger/8.0.38(0x1800262c) NetType/WIFI Language/zh_CN',
            'Referer': 'https://servicewechat.com',
            'Authorization': self.authorization,
            'X-Sign': self.get_x_sign(url)
        }
        resp = requests.get(url=self.host + url, headers=headers, timeout=5)
        if resp.status_code == 200:
            res = json.loads(resp.text)
            return res['data']['notes'], res['data']['totalCount']
        else:
            print(f'Fail:{resp.text}')
            # Return an empty page so the caller can stop cleanly
            return [], 0

    def getlist_by_name(self, page_range=5):
        notes = []
        # A mini-program search currently returns about 100 results, fetched here as 5 pages
        for i in range(0, page_range):
            tmp = self.spider(d_page=i, sort_by=self.sorted_way)
            if len(tmp[0]) <= 0:
                break
            else:
                notes.extend(tmp[0])
                print(tmp[0])
        return notes

    @staticmethod
    def get_info(ids):
        # Fetch each note page and parse its application/ld+json metadata
        # (not used by main.py)
        infolist = []
        for note_id in ids:
            url = f"https://www.xiaohongshu.com/explore/{note_id}"
            headers = {
                "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
                "Accept-Encoding": "gzip, deflate, br",
                "Accept-Language": "zh-CN,zh-Hans;q=0.9",
                "Connection": "keep-alive",
                "Host": "www.xiaohongshu.com",
                "User-Agent": "Mozilla/5.0 (iPhone; CPU iPhone OS 16_3_1 like Mac OS X) AppleWebKit/605.1.15 "
                              "(KHTML, like Gecko) Version/16.3 Mobile/15E148 Safari/604.1"
            }
            resp = requests.get(url, headers=headers)
            resp.encoding = resp.apparent_encoding
            html = resp.text
            soup = BeautifulSoup(html, 'lxml')
            json_str = soup.find(attrs={'type': 'application/ld+json'}).text
            json_str = json_str.replace('\n', '').replace('\r\n', '')
            info_dic = json.loads(json_str, strict=False)
            info_dic['link'] = url
            if info_dic['name'] != '':
                infolist.append(info_dic)
        return infolist

    @staticmethod
    def get_title_url(xhs_data):
        # Turn the raw note records into dictionaries with Chinese column names
        new_data = []
        for item in xhs_data:
            new_data.append({
                '文章链接': f"https://www.xiaohongshu.com/explore/{item['id']}",
                '作者主页': f'https://www.xiaohongshu.com/user/profile/{item["user"]["id"]}',
                '作者昵称': item['user']['nickname'],
                '文章标题': item['title'],
                '获赞数量': item['likes'],
                '发布时间': item['time'],
                '是否认证ID': item['user']['officialVerified']
            })

        return new_data

    def xhs_to_csv(self, data, field, path='x'):
        # Write the parsed records to CSV; without an explicit path the file name
        # is built from today's date, the keyword and the sort order
        if path == 'x':
            with open(f'{today_str}{self.key_name}{self.sorted_way}.csv', 'w', newline='', encoding='utf-8-sig') as f:
                writer = csv.DictWriter(f, fieldnames=field)
                writer.writeheader()
                writer.writerows(data)
            print(f'保存成功,文件名为:{today_str}{self.key_name}{self.sorted_way}.csv')
        else:
            with open(path, 'w', newline='', encoding='utf-8-sig') as f:
                writer = csv.DictWriter(f, fieldnames=field)
                writer.writeheader()
                writer.writerows(data)
            print(f'保存成功,文件名为:{path}')
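
# --- Optional helper (not part of the original script) ---
# spider() also returns the result count reported by the API, which
# getlist_by_name() ignores. A rough sketch that derives the number of pages
# (20 notes per page) from that count instead of a fixed page_range; the helper
# name and the max_pages cap are illustrative. title_spider is an XhsTitle instance.
def getlist_by_count(title_spider, max_pages=10):
    notes, total = title_spider.spider(d_page=0, sort_by=title_spider.sorted_way)
    pages = min(max_pages, -(-total // 20))  # ceil(total / 20), capped at max_pages
    for i in range(1, pages):
        tmp = title_spider.spider(d_page=i, sort_by=title_spider.sorted_way)
        if not tmp[0]:
            break
        notes.extend(tmp[0])
    return notes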

if __name__ == "__main__":
    # The three values below are the constructor parameters of XhsTitle
    # Keyword to search for
    keyName = "请输入要搜索的关键词"
    # Authorization token; it can be captured with Charles (many tutorials online)
    authorization = "wxmp.XXXX"
    # Sort order: general (comprehensive), hot_desc (popularity), create_time_desc (publish time)
    sortedWay = "general"
    fields = ['文章链接', '作者主页', '作者昵称', '文章标题', '获赞数量', '发布时间', '是否认证ID']
    # Create the crawler object
    xhs_spider = XhsTitle(keyName, authorization, sortedWay)
    # Fetch the search result pages from the mini program
    idList = xhs_spider.getlist_by_name()
    # Parse the results; the links used by XhsContent also come from this list
    xhs_title = xhs_spider.get_title_url(idList)
    # Collect the article links into a list
    links = [d['文章链接'] for d in xhs_title]
    # Write to CSV (with no path argument the file name would contain keyName, sortedWay and today's date)
    xhs_spider.xhs_to_csv(xhs_title, fields, path='basic.csv')
--------------------------------------------------------------------------------

/main.py:
--------------------------------------------------------------------------------
import pandas as pd
import time
import random
from datetime import datetime
import wechat
import os
import logging
import XhsContent
import XhsTitle

# fields = ['文章链接', '作者主页', '作者昵称', '文章标题', '获赞数量', '发布时间', '是否认证ID']
# Keywords to search for
key = [""]
# Authorization token; it can be captured with Charles (many tutorials online)
authorization = "wxmp.XXXXX"
# Sort orders: general (comprehensive), hot_desc (popularity), create_time_desc (publish time)
sort = ["create_time_desc", 'general', 'hot_desc']
times = 0
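# The loop below runs the whole pipeline once per keyword × sort-order combination:
#   1. XhsTitle fetches the search results and turns them into title records
#      (link, author, title, likes, publish time, verification flag).
#   2. XhsContent downloads every article body, pausing 20-40 seconds between requests.
#   3. Titles and bodies are merged on '文章链接' and appended to xhs_db.csv,
#      followed by a 500-700 second pause before the next combination.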
# Loop over the keywords and sort orders
for keyName in key:
    for sortedWay in sort:
        times = times + 1
        print(f'一共进行了{times}次循环')
        try:
            # Create the title crawler
            xhs_title = XhsTitle.XhsTitle(key_name=keyName, authorization=authorization, sorted_way=sortedWay)
            # Fetch the search result pages from the mini program
            idList = xhs_title.getlist_by_name()
            # Parse the results
            xhs_basic = xhs_title.get_title_url(idList)
            # Convert to a DataFrame for the merge later on
            pd_basic = pd.DataFrame(xhs_basic)
            # Drop rows with a duplicated article link
            pd_basic.drop_duplicates('文章链接', inplace=True, keep='last')
            # Collect the article links into a list
            links = [d['文章链接'] for d in xhs_basic]
            # Optionally save the titles and links to their own CSV file
            # xhs_title.xhs_to_csv(xhs_basic, field=fields)

            # # If a previously saved title CSV has the wrong encoding, convert it to UTF-8 first:
            # with open('20230707hot_desc.csv', 'r', encoding='GB2312') as f:
            #     data = f.read()
            # with open('20230707hot_desc.csv', 'w', encoding='utf-8') as f:
            #     f.write(data)

            # Fetch every article body through its link
            content = []
            for url in links:
                xhs_content = XhsContent.XHSContent(url, authorization)
                xhs = xhs_content.getdata()
                time.sleep(random.randint(20, 40))
                a_list = [url, xhs]
                content.append(a_list)

            # Convert to a DataFrame and rename the default 0/1 columns
            pd_content = pd.DataFrame(content)
            pd_content = pd_content.rename(columns={0: "文章链接", 1: "文章内容"})
            # Optionally save the bodies to their own CSV file
            # pd_content.to_csv(f'{today_str}{keyName}{sortedWay}content.csv', index=False, encoding='utf-8-sig')
            # Merge titles and bodies on the article link
            xhs_pd = pd.merge(pd_basic, pd_content, on='文章链接', how='left')

            # Record the crawl date, keyword and sort order
            xhs_pd['抓取日期'] = pd.to_datetime('now').date()
            xhs_pd['品牌'] = keyName
            xhs_pd['抓取方式'] = sortedWay

            # Append to the CSV database (write the header only when the file does not exist yet)
            xhs_pd.to_csv('xhs_db.csv', index=False, mode='a', header=not os.path.exists('xhs_db.csv'))
            # Wait 500-700 seconds between runs to avoid anti-crawling measures
            time.sleep(random.randint(500, 700))
        except Exception:
            logging.exception("Error occurred")
            continue


def duplicates(s, t):
    # Random crawls inevitably pick up repeated notes: merge the fresh CSV with the
    # existing Excel database and keep only the latest copy of each article link
    # (the Excel database may not exist yet on the very first run)
    t = pd.read_excel(t) if os.path.exists(t) else pd.DataFrame()
    s = pd.read_csv(s)
    df = pd.concat([t, s], ignore_index=True)
    df.drop_duplicates(subset='文章链接', inplace=True, keep='last')
    df.to_excel('xhs_db.xlsx', index=False)


def sent_file(sent_id, path):
    # Send the result file to the given WeChat Work userid
    chat = wechat.WeChat(sent_id)
    chat.send_file(path)


if __name__ == "__main__":
    source = 'xhs_db.csv'
    target = 'xhs_db.xlsx'
    duplicates(source, target)
    sent_file('12345', 'xhs_db.xlsx')

    if os.path.exists(source):
        os.remove(source)
        print(f"文件 {source} 已成功删除")
    else:
        print(f"文件 {source} 不存在")
--------------------------------------------------------------------------------