├── README.md
└── note.py

/README.md:
--------------------------------------------------------------------------------
# spider-xiaohongshu
[For study and exchange only] A crawler for Xiaohongshu note content.
--------------------------------------------------------------------------------

/note.py:
--------------------------------------------------------------------------------
# coding=UTF-8
from bs4 import BeautifulSoup
from urllib import request
import re
from selenium import webdriver
import time
import csv
from fake_useragent import UserAgent  # note: not used below
import json
import random


# Collect note links from each user's m-site profile page
def get_note_url_list(urls):
    # PhantomJS is deprecated in recent Selenium releases; a headless Chrome/Firefox driver also works
    driver = webdriver.PhantomJS()
    for url in urls:
        driver.get('http://www.xiaohongshu.com/user/profile/' + url)
        # Read the note count shown on the active profile tab
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        num = soup.select('div[class="tab-item active owl-imp-showed"] div small')
        if num == []:
            num = soup.select('div[class="tab-item active"] div small')
        try:
            num = num[0].string[1:]  # drop the leading separator character
            page = int(num) // 10 + 3
        except:
            page = 10
        # Scroll to the bottom repeatedly so lazy-loaded notes are rendered
        js = "document.documentElement.scrollTop=1000000"
        for _ in range(page):
            driver.execute_script(js)
            time.sleep(3)
        # Parse the note links
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        url_list = soup.select('div[class="note-item note-item"] a')
        url_list = url_list + soup.select('div[class="note-item note-item owl-imp-showed"] a')
        for item in url_list:
            note_url = item.get('href')
            if note_url[:10] == '/discovery':
                print(note_url)
                get_note('https://www.xiaohongshu.com' + note_url)
    driver.close()


# Fetch and parse a single note page
def get_note(url):
    # Request the page, retrying once before giving up
    try:
        req = request.Request(url)
        response = request.urlopen(req)
    except:
        try:
            req = request.Request(url)
            response = request.urlopen(req)
        except:
            return

    # Read and parse the HTML
    data = response.read()
    soup = BeautifulSoup(data, 'html.parser', from_encoding="UTF-8")

    try:
        nickname = soup.select('h3[class="nickname"] a')[0].string  # author nickname
    except:
        nickname = ''
    try:
        avatar = soup.select('img[class="avatar-img"]')[0].get('src')  # author avatar
    except:
        avatar = ''
    try:
        title = soup.select('h1[class="title"]')[0].string  # note title
        print(title)
        time.sleep(16)  # slow down between notes
        title = str(title).encode('utf-8', 'ignore').decode('utf-8', 'ignore')
    except:
        title = ''
    publish_date = soup.select('div[class="publish-date"]')[0].string[4:]  # publish date, leading label stripped
    tags = soup.select('a[class="hash-tag topic"]')  # hashtags
    if tags != []:
        tag_list = []
        for tag in tags:
            tag = str(tag)
            # The tag text sits between the closing </svg> of the icon and the closing </a>
            svg = tag.index('</svg>')
            a = tag.index('</a>')
            tag_list.append(tag[svg + 6:a])
        tag_str = ','.join(tag_list)
    else:
        tag_str = ''

    try:
        cover = soup.select('div[class="multi-note-cover cube-image normal-image"] img')[0].get('src')  # cover image
    except:
        cover = ''

    # Carousel images
    note_img = soup.select('img[class="note-image"]')
    img_list = ''
    for img in note_img:
        img_list = img_list + ',' + img.get('src')
    img_list = img_list[1:]

    # Fall back to the first carousel image when no cover was found
    try:
        if cover == '':
            cover = note_img[0].get('src')
    except:
        cover = ''

    # Rich-text body
    content_list = soup.select('div[class="content"]')
    content_str = ''
    dr_a = re.compile(r'<a.*?>(.*?)</a>', re.S)  # strips the hashtag hyperlinks from the body
    for content in content_list:
        content = str(content).replace(' data-v-57ee69ec=""', '').replace(' data-v-798decb0=""', '').replace(' alt="小红书"', '').replace(' data-v-52254b4c=""', '').replace(' data-v-4b7a01f4="" ', '').replace(' data-v-0ffb6d22=""', '').replace(u"\u200b", '').replace(u"\u2022", '')
        content = re.sub('[\r\n\t]', '', content)
        content = dr_a.sub('', str(content))
        content_str = content_str + content
    # Truncate the body at a marker and re-close the wrapping <div>.
    # NOTE: the literal passed to index() was lost from this copy of the file;
    # restore the original marker string before relying on the truncation.
    try:
        index = content_str.index('')
    except:
        index = 0
    if index > 0:
        content_str = content_str[:index] + '</div>'

    # Write one row to the CSV (notes without a cover image are skipped)
    if cover != '':
        with open('note.csv', 'a+', newline='', encoding='gb18030') as csvfile:
            spamwriter = csv.writer(csvfile)
            data = [str(nickname), str(avatar), str(title), str(cover), str(img_list), str(content_str), str(tag_str), str(publish_date), str(url)]
            spamwriter.writerow(data)


# Pick a random proxy from a local proxy-pool service (defined but not wired into the requests above)
def proxy_get():
    url = 'http://127.0.0.1:8000/select?name=httpbin&order=speed&sort=asc&count=20'
    req = request.Request(url)
    response = request.urlopen(req)
    data = response.read()
    try:
        json_str = json.loads(data)
        proxy = random.sample(json_str, 1)
    except:
        # Retry once if the first response could not be parsed
        req = request.Request(url)
        response = request.urlopen(req)
        data = response.read()
        json_str = json.loads(data)
        proxy = random.sample(json_str, 1)
    return proxy[0]


# m-site profile IDs of the users to crawl
get_note_url_list([
    '554d98baa46e9626b84ebe39',
    '57367ccf50c4b4528a90f723',
    '54b0f079b4c4d65f85e76bdc',
    '5756621a82ec39663c40c45d',
    '59a65a295e87e75966563793',
    '561f967641a2b3550b12fd4c',
    '54f697fb4fac6379b6f3bc95',
    '596c805750c4b4218d929a59',
])
--------------------------------------------------------------------------------
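
Note: proxy_get() queries a local proxy pool but is never called by get_note(). If one wanted to route the note requests through such a proxy, a minimal sketch using urllib's ProxyHandler might look like the following. The open_with_proxy name and the assumption that a pool entry can be reduced to an "ip:port" string are illustrative; the actual JSON schema of the pool service is not shown in this repository.

```python
from urllib import request

def open_with_proxy(url, proxy_addr, timeout=10):
    # proxy_addr is assumed to be an "ip:port" string derived from proxy_get();
    # adapt the extraction to whatever JSON the local pool actually returns.
    handler = request.ProxyHandler({
        'http': 'http://' + proxy_addr,
        'https': 'http://' + proxy_addr,
    })
    opener = request.build_opener(handler)
    return opener.open(url, timeout=timeout)
```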