├── .gitignore ├── README.md ├── xhs.py ├── xhs_notes_2022-01-20_19:49:51.csv └── xhs_users_2022-01-20_15:43:58.csv /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | .DS_Store 131 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # xhs-spider 2 | 3 | 4 | 小红书博主信息、笔记信息爬虫(写给雪绒) 5 | —— kuloud 2022/1/20 6 | 7 | Example: 8 | 9 | 1. 自行抓包小红书小程序,找用户token,替换{$authorizations}(其实1个就够了,但是小红书访刷做了很多限制,请求频次10s一次基本够用) 10 | 2. 需要定期爬的链接列表,自行替换{$URL} 11 | 3. 运行脚本,在当前目录生成对应的csv数据 12 | 13 | $ python3 xhs.py 14 | 15 | -------------------------------------------------------------------------------- /xhs.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | 4 | 小红书博主信息、笔记信息爬虫(写给雪绒) 5 | —— kuloud 2022/1/20 6 | 7 | Example: 8 | 9 | 1. 自行抓包小红书小程序,找用户token,替换{$authorizations}(其实1个就够了,但是小红书访刷做了很多限制,请求频次10s一次基本够用) 10 | 2. 需要定期爬的链接列表,自行替换{$URL} 11 | 3. 运行脚本,在当前目录生成对应的csv数据 12 | 13 | $ python3 xhs.py 14 | 15 | 16 | """ 17 | import re 18 | import requests 19 | import csv 20 | import hashlib 21 | import time 22 | 23 | def m_md5(data: str): 24 | m = hashlib.md5() 25 | m.update(data.encode()) 26 | return m.hexdigest() 27 | 28 | def get_user(user_id, authorization=''): 29 | headers['authorization'] = authorization 30 | URI = f'/fe_api/burdock/weixin/v2/user/{user_id}' 31 | xsign = 'X' + m_md5(URI + "WSUDD") 32 | headers['x-sign'] = xsign 33 | return gets(URI) 34 | 35 | def get_note(user_id, authorization=''): 36 | headers['authorization'] = authorization 37 | URI = f'/fe_api/burdock/weixin/v2/note/{user_id}/single_feed' 38 | xsign = 'X' + m_md5(URI + "WSUDD") 39 | headers['x-sign'] = xsign 40 | return gets(URI) 41 | 42 | def gets(url_path): 43 | base_url = 'https://www.xiaohongshu.com' 44 | data = requests.get(base_url+url_path, headers=headers, 45 | verify=False).json() 46 | return data 47 | 48 | def write_csv_headers(filename, csv_headers): 49 | f = open(filename, 'a+', encoding='utf-8') # a+表示追加 50 | csv_writer = csv.writer(f) 51 | 52 | csv_writer.writerow(csv_headers) 53 | 54 | f.close() 55 | 56 | def write_user_info(url, filename, data): 57 | id = data['id'] 58 | nickname = data['nickname'] 59 | gender = '男' if data['gender'] == 0 else '女' if data['gender'] == 1 else '未知' 60 | fans = data['fans'] 61 | follows = data['follows'] 62 | notes = data['notes'] 63 | location = data['location'] 64 | collected = data['collected'] 65 | desc = data['desc'] 66 | liked = data['liked'] 67 | level = data['level']['name'] 68 | officialVerifyName = data['officialVerifyName'] 69 | 70 | fetch_time = time.strftime("%Y-%m-%d_%H:%M:%S",time.localtime()) 71 | 72 | record = "url: {}, 用户名:{}, 性别:{}, 粉丝数:{}, 关注数:{}, 收藏数:{}, 笔记数: {}, 点赞数:{}, 等级:{}, 位置:{}, 认证名:{}, 主页描述:{}, 数据时间:{}".format( 73 | url, nickname, gender, fans, follows, collected, notes, liked, level, location, officialVerifyName, desc, fetch_time 74 | ) 75 | 76 | f = open(filename, 'a+', encoding='utf-8') # a+表示追加 77 | csv_writer = csv.writer(f) 78 | 79 | print(record) 80 | csv_writer.writerow([url, nickname, gender, fans, follows, collected, notes, liked, level, location, officialVerifyName, desc]) 81 | 82 | f.close() 83 | 84 | def write_note_info(url, filename, data): 85 | id = data['id'] 86 | title = data['title'] 87 | likes = data['likes'] 88 | collects = data['collects'] 89 | comments = data['comments'] 90 | note_time = data['time'] 91 | type = data['type'] 92 | 93 | fetch_time = time.strftime("%Y-%m-%d_%H:%M:%S",time.localtime()) 94 | 95 | record = "链接: {}, 标题:{}, 点赞数:{}, 收藏数:{}, 评论数:{}, 类型:{}, 发布日期: {}, 数据时间:{}".format( 96 | url, title, likes, collects, comments, type, note_time, fetch_time 97 | ) 98 | 99 | f = open(filename, 'a+', encoding='utf-8') # a+表示追加 100 | csv_writer = csv.writer(f) 101 | 102 | print(record) 103 | csv_writer.writerow([url, title, likes, collects, comments, type, note_time, fetch_time]) 104 | 105 | f.close() 106 | 107 | def fetch_users(): 108 | export_file_name = 'xhs_users_{}.csv'.format(time.strftime("%Y-%m-%d_%H:%M:%S",time.localtime())) 109 | users = [ 110 | '{$URL}' 111 | ] 112 | csv_headers = ['链接', '用户名', '性别', '粉丝数', '关注数', '收藏数', '笔记数', '点赞数', '等级', '位置', '认证名', '主页描述', '数据时间'] 113 | write_csv_headers(filename=export_file_name, csv_headers=csv_headers) 114 | 115 | for i, u in enumerate(users): 116 | try: 117 | uid = re.findall(r"profile/(.+)\?", u)[0] 118 | authorization = authorizations[i % len(authorizations)] 119 | print("{}/{} uid: {}, authorization: {}".format(i + 1, len(users), uid,authorization)) 120 | userData = get_user(uid,authorization=authorization) 121 | write_user_info(url=u,filename=export_file_name, data=userData['data']) 122 | except Exception as e: 123 | print('handle user error: {} \n {}'.format(u, e)) 124 | pass 125 | time.sleep(10) 126 | 127 | def fetch_notes(): 128 | export_file_name = 'xhs_notes_{}.csv'.format(time.strftime("%Y-%m-%d_%H:%M:%S",time.localtime())) 129 | notes = [ 130 | '{$URL}' 131 | ] 132 | csv_headers = ['链接', '标题', '点赞数', '收藏数', '评论数', '类型(视频或者图文)', '发布日期', '数据时间'] 133 | write_csv_headers(filename=export_file_name, csv_headers=csv_headers) 134 | 135 | for i, n in enumerate(notes): 136 | try: 137 | if not n.startswith('http://xhslink.com'): 138 | # write_note_info(url=n,filename=export_file_name, data={}) 139 | continue 140 | r = requests.get(n, verify=False,allow_redirects=True) 141 | origin = r.history[0].headers['Location'] 142 | uid = re.findall(r"/discovery/item/(.+)\?", origin)[0] 143 | authorization = authorizations[i % len(authorizations)] 144 | print("{}/{} uid: {}, authorization: {}".format(i + 1, len(notes), uid,authorization)) 145 | noteData = get_note(uid,authorization=authorization) 146 | # print(noteData) 147 | write_note_info(url=n,filename=export_file_name, data=noteData['data']) 148 | except Exception as e: 149 | print('handle note error: {} \n {}'.format(n, e)) 150 | pass 151 | time.sleep(10) 152 | 153 | if __name__ == '__main__': 154 | requests.packages.urllib3.disable_warnings() 155 | 156 | headers = { 157 | 'Host': 'www.xiaohongshu.com', 158 | 'device-fingerprint': 'WHJMrwNw1k/EDrs4qQu7qho7mmYuri0YzaIx1p+ZruHXU5ABFod6r13el9Gk7wXXC5zMfJLxaBMpubNTyqfbLczzvrRRHlcJkdCW1tldyDzmauSxIJm5Txg==1487582755342', 159 | 'user-agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36 MicroMessenger/7.0.9.501 NetType/WIFI MiniProgramEnv/Windows WindowsWechat', 160 | 'content-type': 'application/json' 161 | } 162 | 163 | authorizations = ['{$authorizations}'] 164 | 165 | fetch_users() 166 | fetch_notes() 167 | 168 | 169 | -------------------------------------------------------------------------------- /xhs_notes_2022-01-20_19:49:51.csv: -------------------------------------------------------------------------------- 1 | 链接,标题,点赞数,收藏数,评论数,类型(视频或者图文),发布日期,数据时间 2 | http://xhslink.com/ayOtDd,韩系学妹滤镜妆|牛乳少女~拿来吧你!,1234,242,150,video,2021-07-26 17:25,2022-01-20_19:49:52 3 | http://xhslink.com/WvGiYd,大四近期购物分享!伪素颜眼镜|口罩|干发粉,295,89,59,video,2021-08-31 20:28,2022-01-20_19:50:02 4 | http://xhslink.com/iPpSRd,Somi甜心辣妹妆 | 仿妆不仿舞嘿嘿,2238,268,60,video,2021-08-16 17:46,2022-01-20_19:50:23 5 | http://xhslink.com/GATtOd,windcci粉底液🆚mac粉底液丨10小时测评‼️,125,66,99,video,2021-08-16 10:51,2022-01-20_19:50:55 6 | http://xhslink.com/WNvFQd,WINDCCI雾感粉底液|原相机残酷怼脸全天测评,114,93,21,video,2021-08-19 18:00,2022-01-20_19:51:05 7 | -------------------------------------------------------------------------------- /xhs_users_2022-01-20_15:43:58.csv: -------------------------------------------------------------------------------- 1 | 链接,用户名,性别,粉丝数,关注数,收藏数,笔记数,点赞数,等级,位置,认证名,主页描述 2 | https://www.xiaohongshu.com/user/profile/5aba62f711be10266011d212?xhsshare=CopyLink&appuid=5a94e4544eacab3e3c4eb104&apptime=1640237341,是小宝鸭,女,88630,48,120384,65,561079,铜冠薯,上海 长宁区,美妆博主,"161/85 3 | 🎵:偶尔摘星✨ 4 | 🧣:没写情书_ 5 | 感谢仙女喜欢ଘ(੭ˊ꒳​ˋ)੭✧ 6 | ❤️🧡💛💚💙💜🖤🤍🤎" 7 | https://www.xiaohongshu.com/user/profile/5732bdf582ec390aff193608?xhsshare=CopyLink&appuid=5a94e4544eacab3e3c4eb104&apptime=1640237428,hello我是沱沱,女,2695282,196,2271870,1040,3586348,金冠薯,四川 成都,时尚博主,"又乖又酷的野路子川妹儿📮cdtaolesi@163.com 8 | 全部都同名@hello我是沱沱" 9 | https://www.xiaohongshu.com/user/profile/5a7be8a911be105b172a573a?xhsshare=CopyLink&appuid=5a94e4544eacab3e3c4eb104&apptime=1640237688,芯芯还没睡,女,122131,156,244495,209,628189,金冠薯,地球的某一片红薯地,,"沙漠干皮 敏感肌 10 | 东北人在杭州嘻嘻 11 | 分享一切变美的东西 12 | 各平台同名 13 | 商务v FGDHyyds" 14 | https://www.xiaohongshu.com/user/profile/59b005846a6a697f6a6ed660?xhsshare=CopyLink&appuid=5a94e4544eacab3e3c4eb104&apptime=1640237856,吃掉蓝色,女,114849,200,201082,95,481379,金冠薯,中国 四川 成都,博主,"干敏肌 15 | 普通女孩 16 | 不签约 谢~ 17 | 🧣喝点蓝色" 18 | https://www.xiaohongshu.com/user/profile/59f709254eacab3f56cc677b?xhsshare=CopyLink&appuid=5a94e4544eacab3e3c4eb104&apptime=1640237968,奶茶要不要🍼,女,10933,142,19338,65,29299,金冠薯,地球的某一片红薯地,美妆博主,"🎓 学设计的退役空姐 19 | ♠️真诚分享喜欢的妆面&爱用物 20 | ◾️中庭偏长/肉肉脸/鼻子没整 21 | 📷 均相机拍摄:无滤镜(不要连赞噢" --------------------------------------------------------------------------------