├── .gitignore
├── README.md
├── xhs.py
├── xhs_notes_2022-01-20_19:49:51.csv
└── xhs_users_2022-01-20_15:43:58.csv


/.gitignore:
--------------------------------------------------------------------------------
  1 | # Byte-compiled / optimized / DLL files
  2 | __pycache__/
  3 | *.py[cod]
  4 | *$py.class
  5 | 
  6 | # C extensions
  7 | *.so
  8 | 
  9 | # Distribution / packaging
 10 | .Python
 11 | build/
 12 | develop-eggs/
 13 | dist/
 14 | downloads/
 15 | eggs/
 16 | .eggs/
 17 | lib/
 18 | lib64/
 19 | parts/
 20 | sdist/
 21 | var/
 22 | wheels/
 23 | pip-wheel-metadata/
 24 | share/python-wheels/
 25 | *.egg-info/
 26 | .installed.cfg
 27 | *.egg
 28 | MANIFEST
 29 | 
 30 | # PyInstaller
 31 | #  Usually these files are written by a python script from a template
 32 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 33 | *.manifest
 34 | *.spec
 35 | 
 36 | # Installer logs
 37 | pip-log.txt
 38 | pip-delete-this-directory.txt
 39 | 
 40 | # Unit test / coverage reports
 41 | htmlcov/
 42 | .tox/
 43 | .nox/
 44 | .coverage
 45 | .coverage.*
 46 | .cache
 47 | nosetests.xml
 48 | coverage.xml
 49 | *.cover
 50 | *.py,cover
 51 | .hypothesis/
 52 | .pytest_cache/
 53 | 
 54 | # Translations
 55 | *.mo
 56 | *.pot
 57 | 
 58 | # Django stuff:
 59 | *.log
 60 | local_settings.py
 61 | db.sqlite3
 62 | db.sqlite3-journal
 63 | 
 64 | # Flask stuff:
 65 | instance/
 66 | .webassets-cache
 67 | 
 68 | # Scrapy stuff:
 69 | .scrapy
 70 | 
 71 | # Sphinx documentation
 72 | docs/_build/
 73 | 
 74 | # PyBuilder
 75 | target/
 76 | 
 77 | # Jupyter Notebook
 78 | .ipynb_checkpoints
 79 | 
 80 | # IPython
 81 | profile_default/
 82 | ipython_config.py
 83 | 
 84 | # pyenv
 85 | .python-version
 86 | 
 87 | # pipenv
 88 | #   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
 89 | #   However, in case of collaboration, if having platform-specific dependencies or dependencies
 90 | #   having no cross-platform support, pipenv may install dependencies that don't work, or not
 91 | #   install all needed dependencies.
 92 | #Pipfile.lock
 93 | 
 94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow
 95 | __pypackages__/
 96 | 
 97 | # Celery stuff
 98 | celerybeat-schedule
 99 | celerybeat.pid
100 | 
101 | # SageMath parsed files
102 | *.sage.py
103 | 
104 | # Environments
105 | .env
106 | .venv
107 | env/
108 | venv/
109 | ENV/
110 | env.bak/
111 | venv.bak/
112 | 
113 | # Spyder project settings
114 | .spyderproject
115 | .spyproject
116 | 
117 | # Rope project settings
118 | .ropeproject
119 | 
120 | # mkdocs documentation
121 | /site
122 | 
123 | # mypy
124 | .mypy_cache/
125 | .dmypy.json
126 | dmypy.json
127 | 
128 | # Pyre type checker
129 | .pyre/
130 | .DS_Store
131 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # xhs-spider
 2 | 
 3 | 
 4 | 小红书博主信息、笔记信息爬虫（写给雪绒）
 5 | —— kuloud 2022/1/20
 6 | 
 7 | Example:
 8 | 
 9 | 1. 自行抓包小红书小程序，找用户token，替换{$authorizations}（其实1个就够了，但是小红书防刷做了很多限制，请求频次10s一次基本够用）
10 | 2. 需要定期爬的链接列表，自行替换{$URL}
11 | 3. 运行脚本，在当前目录生成对应的csv数据
12 | 
13 |     $ python3 xhs.py
14 | 
15 | 


--------------------------------------------------------------------------------
/xhs.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | """
  3 | 
  4 | 小红书博主信息、笔记信息爬虫（写给雪绒）
  5 | —— kuloud 2022/1/20
  6 | 
  7 | Example:
  8 | 
  9 | 1. 自行抓包小红书小程序，找用户token，替换{$authorizations}（其实1个就够了，但是小红书防刷做了很多限制，请求频次10s一次基本够用）
 10 | 2. 需要定期爬的链接列表，自行替换{$URL}
 11 | 3. 运行脚本，在当前目录生成对应的csv数据
 12 | 
 13 |     $ python3 xhs.py
 14 | 
 15 | 
 16 | """
 17 | import re
 18 | import requests
 19 | import csv
 20 | import hashlib
 21 | import time
 22 | 
 23 | def m_md5(data: str):
 24 |     m = hashlib.md5()
 25 |     m.update(data.encode())
 26 |     return m.hexdigest()
 27 | 
 28 | def get_user(user_id, authorization=''):
 29 |     headers['authorization'] = authorization
 30 |     URI = f'/fe_api/burdock/weixin/v2/user/{user_id}'
 31 |     xsign = 'X' + m_md5(URI + "WSUDD")
 32 |     headers['x-sign'] = xsign
 33 |     return gets(URI)
 34 | 
 35 | def get_note(user_id, authorization=''):
 36 |     headers['authorization'] = authorization
 37 |     URI = f'/fe_api/burdock/weixin/v2/note/{user_id}/single_feed'
 38 |     xsign = 'X' + m_md5(URI + "WSUDD")
 39 |     headers['x-sign'] = xsign
 40 |     return gets(URI)
 41 | 
 42 | def gets(url_path):
 43 |     base_url = 'https://www.xiaohongshu.com'
 44 |     data = requests.get(base_url+url_path, headers=headers,
 45 |                         verify=False).json()
 46 |     return data
 47 | 
 48 | def write_csv_headers(filename, csv_headers):
 49 |     f = open(filename, 'a+', encoding='utf-8')  # a+表示追加
 50 |     csv_writer = csv.writer(f)
 51 | 
 52 |     csv_writer.writerow(csv_headers)
 53 | 
 54 |     f.close()
 55 | 
 56 | def write_user_info(url, filename, data):
 57 |     id = data['id']
 58 |     nickname = data['nickname']
 59 |     gender = '男' if data['gender'] == 0 else '女' if data['gender'] == 1 else '未知'
 60 |     fans = data['fans']
 61 |     follows = data['follows']
 62 |     notes = data['notes']
 63 |     location = data['location']
 64 |     collected = data['collected']
 65 |     desc = data['desc']
 66 |     liked = data['liked']
 67 |     level = data['level']['name']
 68 |     officialVerifyName = data['officialVerifyName']
 69 | 
 70 |     fetch_time = time.strftime("%Y-%m-%d_%H:%M:%S",time.localtime())
 71 | 
 72 |     record = "url: {}, 用户名：{}, 性别：{}, 粉丝数：{}, 关注数：{}, 收藏数：{}, 笔记数: {}, 点赞数：{}, 等级：{}, 位置：{}, 认证名：{}, 主页描述：{}, 数据时间：{}".format(
 73 |         url, nickname,  gender, fans, follows, collected, notes, liked, level, location, officialVerifyName, desc, fetch_time
 74 |     )
 75 | 
 76 |     f = open(filename, 'a+', encoding='utf-8')  # a+表示追加
 77 |     csv_writer = csv.writer(f)
 78 | 
 79 |     print(record)
 80 |     csv_writer.writerow([url, nickname,  gender, fans, follows, collected, notes, liked, level, location, officialVerifyName, desc])
 81 | 
 82 |     f.close()
 83 | 
 84 | def write_note_info(url, filename, data):
 85 |     id = data['id']
 86 |     title = data['title']
 87 |     likes = data['likes']
 88 |     collects = data['collects']
 89 |     comments = data['comments']
 90 |     note_time = data['time']
 91 |     type = data['type']
 92 | 
 93 |     fetch_time = time.strftime("%Y-%m-%d_%H:%M:%S",time.localtime())
 94 | 
 95 |     record = "链接: {}, 标题：{}, 点赞数：{}, 收藏数：{}, 评论数：{}, 类型：{}, 发布日期: {}, 数据时间：{}".format(
 96 |         url, title,  likes, collects, comments, type, note_time, fetch_time
 97 |     )
 98 | 
 99 |     f = open(filename, 'a+', encoding='utf-8')  # a+表示追加
100 |     csv_writer = csv.writer(f)
101 | 
102 |     print(record)
103 |     csv_writer.writerow([url, title,  likes, collects, comments, type, note_time, fetch_time])
104 | 
105 |     f.close()
106 | 
def fetch_users():
    """Fetch every profile in the hard-coded ``users`` list into a timestamped CSV.

    For each URL the user id is taken from the path segment after
    ``profile/``, a token is picked round-robin from the module-level
    ``authorizations`` list, the profile is requested via ``get_user`` and
    the result is appended via ``write_user_info``. A failure on one URL is
    printed and does not stop the remaining URLs.
    """
    # NOTE(review): the timestamp contains ':' which is not allowed in
    # Windows file names; kept unchanged so output names stay the same.
    export_file_name = 'xhs_users_{}.csv'.format(time.strftime("%Y-%m-%d_%H:%M:%S", time.localtime()))
    users = [
        '{$URL}'
    ]
    csv_headers = ['链接', '用户名',  '性别', '粉丝数', '关注数', '收藏数', '笔记数', '点赞数', '等级', '位置', '认证名', '主页描述', '数据时间']
    write_csv_headers(filename=export_file_name, csv_headers=csv_headers)

    for i, u in enumerate(users):
        try:
            # Stop at the first '/', '?' or '#': the id is extracted whether
            # or not the URL has a query string, and a second '?' later in
            # the URL can no longer leak into the id.
            match = re.search(r"profile/([^/?#]+)", u)
            if match is None:
                raise ValueError('no user id found in url')
            uid = match.group(1)
            authorization = authorizations[i % len(authorizations)]
            # NOTE(review): this prints the full token to stdout.
            print("{}/{} uid: {}, authorization: {}".format(i + 1, len(users), uid, authorization))
            userData = get_user(uid, authorization=authorization)
            write_user_info(url=u, filename=export_file_name, data=userData['data'])
        except Exception as e:
            # Best effort: report the failing URL and carry on with the next one.
            print('handle user error: {} \n {}'.format(u, e))
        # Pause after every URL (including the last) to keep requests spaced out.
        time.sleep(10)
126 | 
def fetch_notes():
    """Fetch every note in the hard-coded ``notes`` list into a timestamped CSV.

    Each entry must be an ``xhslink.com`` short link. The short link is
    requested, the note id is taken from the ``/discovery/item/<id>`` part of
    the first redirect's ``Location`` header, a token is picked round-robin
    from the module-level ``authorizations`` list, the note is requested via
    ``get_note`` and the result is appended via ``write_note_info``. A
    failure on one link is printed and does not stop the remaining links.
    """
    # NOTE(review): the timestamp contains ':' which is not allowed in
    # Windows file names; kept unchanged so output names stay the same.
    export_file_name = 'xhs_notes_{}.csv'.format(time.strftime("%Y-%m-%d_%H:%M:%S", time.localtime()))
    notes = [
        '{$URL}'
    ]
    csv_headers = ['链接', '标题', '点赞数',  '收藏数', '评论数', '类型（视频或者图文）', '发布日期', '数据时间']
    write_csv_headers(filename=export_file_name, csv_headers=csv_headers)

    for i, n in enumerate(notes):
        try:
            # Accept both schemes of the short-link host; anything else is
            # skipped (and reported) without issuing a request or sleeping.
            if not n.startswith(('http://xhslink.com', 'https://xhslink.com')):
                print('skip unsupported note url: {}'.format(n))
                continue
            # A timeout keeps a stalled connection from blocking the run forever.
            r = requests.get(n, verify=False, allow_redirects=True, timeout=30)
            if not r.history:
                raise ValueError('short link did not redirect')
            # The first hop's Location header carries the note URL.
            origin = r.history[0].headers['Location']
            # Stop at the first '/', '?' or '#': the id is extracted whether
            # or not the URL has a query string, and a second '?' later in
            # the URL can no longer leak into the id.
            match = re.search(r"/discovery/item/([^/?#]+)", origin)
            if match is None:
                raise ValueError('no note id found in redirect target: {}'.format(origin))
            uid = match.group(1)
            authorization = authorizations[i % len(authorizations)]
            # NOTE(review): this prints the full token to stdout.
            print("{}/{} uid: {}, authorization: {}".format(i + 1, len(notes), uid, authorization))
            noteData = get_note(uid, authorization=authorization)
            write_note_info(url=n, filename=export_file_name, data=noteData['data'])
        except Exception as e:
            # Best effort: report the failing link and carry on with the next one.
            print('handle note error: {} \n {}'.format(n, e))
        # Pause after every processed link to keep requests spaced out.
        time.sleep(10)
152 | 
# Request headers shared by every API call. get_user/get_note add the
# per-request 'authorization' and 'x-sign' entries to this dict. Defined at
# module level (not under the __main__ guard) so the functions above also
# work when this module is imported.
headers = {
    'Host': 'www.xiaohongshu.com',
    'device-fingerprint': 'WHJMrwNw1k/EDrs4qQu7qho7mmYuri0YzaIx1p+ZruHXU5ABFod6r13el9Gk7wXXC5zMfJLxaBMpubNTyqfbLczzvrRRHlcJkdCW1tldyDzmauSxIJm5Txg==1487582755342',
    'user-agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36 MicroMessenger/7.0.9.501 NetType/WIFI MiniProgramEnv/Windows WindowsWechat',
    'content-type': 'application/json'
}

# Tokens used round-robin by fetch_users/fetch_notes; replace the placeholder
# with one or more real values before running.
authorizations = ['{$authorizations}']

if __name__ == '__main__':
    # Requests are sent with verify=False; silence the warning urllib3 would
    # otherwise emit for each of them.
    requests.packages.urllib3.disable_warnings()

    fetch_users()
    fetch_notes()
167 |     
168 |     
169 | 


--------------------------------------------------------------------------------
/xhs_notes_2022-01-20_19:49:51.csv:
--------------------------------------------------------------------------------
1 | 链接,标题,点赞数,收藏数,评论数,类型（视频或者图文）,发布日期,数据时间
2 | http://xhslink.com/ayOtDd,韩系学妹滤镜妆｜牛乳少女～拿来吧你！,1234,242,150,video,2021-07-26 17:25,2022-01-20_19:49:52
3 | http://xhslink.com/WvGiYd,大四近期购物分享！伪素颜眼镜｜口罩｜干发粉,295,89,59,video,2021-08-31 20:28,2022-01-20_19:50:02
4 | http://xhslink.com/iPpSRd,Somi甜心辣妹妆 | 仿妆不仿舞嘿嘿,2238,268,60,video,2021-08-16 17:46,2022-01-20_19:50:23
5 | http://xhslink.com/GATtOd,windcci粉底液🆚mac粉底液丨10小时测评‼️,125,66,99,video,2021-08-16 10:51,2022-01-20_19:50:55
6 | http://xhslink.com/WNvFQd,WINDCCI雾感粉底液｜原相机残酷怼脸全天测评,114,93,21,video,2021-08-19 18:00,2022-01-20_19:51:05
7 | 


--------------------------------------------------------------------------------
/xhs_users_2022-01-20_15:43:58.csv:
--------------------------------------------------------------------------------
 1 | 链接,用户名,性别,粉丝数,关注数,收藏数,笔记数,点赞数,等级,位置,认证名,主页描述
 2 | https://www.xiaohongshu.com/user/profile/5aba62f711be10266011d212?xhsshare=CopyLink&appuid=5a94e4544eacab3e3c4eb104&apptime=1640237341,是小宝鸭,女,88630,48,120384,65,561079,铜冠薯,上海 长宁区,美妆博主,"161/85 
 3 | 🎵：偶尔摘星✨
 4 | 🧣：没写情书_
 5 | 感谢仙女喜欢ଘ(੭ˊ꒳​ˋ)੭✧
 6 | ❤️🧡💛💚💙💜🖤🤍🤎"
 7 | https://www.xiaohongshu.com/user/profile/5732bdf582ec390aff193608?xhsshare=CopyLink&appuid=5a94e4544eacab3e3c4eb104&apptime=1640237428,hello我是沱沱,女,2695282,196,2271870,1040,3586348,金冠薯,四川  成都,时尚博主,"又乖又酷的野路子川妹儿📮cdtaolesi@163.com
 8 | 全部都同名@hello我是沱沱"
 9 | https://www.xiaohongshu.com/user/profile/5a7be8a911be105b172a573a?xhsshare=CopyLink&appuid=5a94e4544eacab3e3c4eb104&apptime=1640237688,芯芯还没睡,女,122131,156,244495,209,628189,金冠薯,地球的某一片红薯地,,"沙漠干皮 敏感肌
10 | 东北人在杭州嘻嘻
11 | 分享一切变美的东西
12 | 各平台同名
13 | 商务v FGDHyyds"
14 | https://www.xiaohongshu.com/user/profile/59b005846a6a697f6a6ed660?xhsshare=CopyLink&appuid=5a94e4544eacab3e3c4eb104&apptime=1640237856,吃掉蓝色,女,114849,200,201082,95,481379,金冠薯,中国  四川  成都,博主,"干敏肌
15 | 普通女孩
16 | 不签约 谢～
17 | 🧣喝点蓝色"
18 | https://www.xiaohongshu.com/user/profile/59f709254eacab3f56cc677b?xhsshare=CopyLink&appuid=5a94e4544eacab3e3c4eb104&apptime=1640237968,奶茶要不要🍼,女,10933,142,19338,65,29299,金冠薯,地球的某一片红薯地,美妆博主,"🎓 学设计的退役空姐
19 | ♠️真诚分享喜欢的妆面&爱用物
20 | ◾️中庭偏长/肉肉脸/鼻子没整
21 | 📷 均相机拍摄：无滤镜（不要连赞噢"


--------------------------------------------------------------------------------