├── README.md
└── note.py

/README.md:
--------------------------------------------------------------------------------
# spider-xiaohongshu
[For study and exchange only] A crawler for Xiaohongshu note content.
--------------------------------------------------------------------------------

/note.py:
--------------------------------------------------------------------------------
# coding=UTF-8
from bs4 import BeautifulSoup
from urllib import request
import re
from selenium import webdriver
import time
import csv
from fake_useragent import UserAgent  # note: not used below
import json
import random


# Collect note links from each user's m-site profile page
def get_note_url_list(urls):
    # PhantomJS is deprecated in recent Selenium releases; a headless Chrome/Firefox driver also works
    driver = webdriver.PhantomJS()
    for url in urls:
        driver.get('http://www.xiaohongshu.com/user/profile/' + url)
        # Read the note count shown on the active profile tab
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        num = soup.select('div[class="tab-item active owl-imp-showed"] div small')
        if num == []:
            num = soup.select('div[class="tab-item active"] div small')
        try:
            num = num[0].string[1:]  # drop the leading separator character
            page = int(num) // 10 + 3
        except:
            page = 10
        # Scroll to the bottom repeatedly so lazy-loaded notes are rendered
        js = "document.documentElement.scrollTop=1000000"
        for _ in range(page):
            driver.execute_script(js)
            time.sleep(3)
        # Parse the note links
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        url_list = soup.select('div[class="note-item note-item"] a')
        url_list = url_list + soup.select('div[class="note-item note-item owl-imp-showed"] a')
        for item in url_list:
            note_url = item.get('href')
            if note_url[:10] == '/discovery':
                print(note_url)
                get_note('https://www.xiaohongshu.com' + note_url)
    driver.close()


# Fetch and parse a single note page
def get_note(url):
    # Request the page, retrying once before giving up
    try:
        req = request.Request(url)
        response = request.urlopen(req)
    except:
        try:
            req = request.Request(url)
            response = request.urlopen(req)
        except:
            return

    # Read and parse the HTML
    data = response.read()
    soup = BeautifulSoup(data, 'html.parser', from_encoding="UTF-8")

    try:
        nickname = soup.select('h3[class="nickname"] a')[0].string  # author nickname
    except:
        nickname = ''
    try:
        avatar = soup.select('img[class="avatar-img"]')[0].get('src')  # author avatar
    except:
        avatar = ''
    try:
        title = soup.select('h1[class="title"]')[0].string  # note title
        print(title)
        time.sleep(16)  # slow down between notes
        title = str(title).encode('utf-8', 'ignore').decode('utf-8', 'ignore')
    except:
        title = ''
    publish_date = soup.select('div[class="publish-date"]')[0].string[4:]  # publish date, leading label stripped
    tags = soup.select('a[class="hash-tag topic"]')  # hashtags
    if tags != []:
        tag_list = []
        for tag in tags:
            tag = str(tag)
            # The tag text sits between the closing </svg> of the icon and the closing </a>
            svg = tag.index('</svg>')
            a = tag.index('</a>')
            tag_list.append(tag[svg + 6:a])
        tag_str = ','.join(tag_list)
    else:
        tag_str = ''

    try:
        cover = soup.select('div[class="multi-note-cover cube-image normal-image"] img')[0].get('src')  # cover image
    except:
        cover = ''

    # Carousel images
    note_img = soup.select('img[class="note-image"]')
    img_list = ''
    for img in note_img:
        img_list = img_list + ',' + img.get('src')
    img_list = img_list[1:]

    # Fall back to the first carousel image when no cover was found
    try:
        if cover == '':
            cover = note_img[0].get('src')
    except:
        cover = ''

    # Rich-text body
    content_list = soup.select('div[class="content"]')
    content_str = ''
    dr_a = re.compile(r'<a.*?>(.*?)</a>', re.S)  # strips the hashtag hyperlinks from the body
    for content in content_list:
        content = str(content).replace(' data-v-57ee69ec=""', '').replace(' data-v-798decb0=""', '').replace(' alt="小红书"', '').replace(' data-v-52254b4c=""', '').replace(' data-v-4b7a01f4="" ', '').replace(' data-v-0ffb6d22=""', '').replace(u"\u200b", '').replace(u"\u2022", '')
        content = re.sub('[\r\n\t]', '', content)
        content = dr_a.sub('', str(content))
        content_str = content_str + content
    # Truncate the body at a marker and re-close the wrapping <div>.
    # NOTE: the literal passed to index() was lost from this copy of the file;
    # restore the original marker string before relying on the truncation.
    try:
        index = content_str.index('')
    except:
        index = 0
    if index > 0:
        content_str = content_str[:index] + '</div>'

    # Write one row to the CSV (notes without a cover image are skipped)
    if cover != '':
        with open('note.csv', 'a+', newline='', encoding='gb18030') as csvfile:
            spamwriter = csv.writer(csvfile)
            data = [str(nickname), str(avatar), str(title), str(cover), str(img_list), str(content_str), str(tag_str), str(publish_date), str(url)]
            spamwriter.writerow(data)


# Pick a random proxy from a local proxy-pool service (defined but not wired into the requests above)
def proxy_get():
    url = 'http://127.0.0.1:8000/select?name=httpbin&order=speed&sort=asc&count=20'
    req = request.Request(url)
    response = request.urlopen(req)
    data = response.read()
    try:
        json_str = json.loads(data)
        proxy = random.sample(json_str, 1)
    except:
        # Retry once if the first response could not be parsed
        req = request.Request(url)
        response = request.urlopen(req)
        data = response.read()
        json_str = json.loads(data)
        proxy = random.sample(json_str, 1)
    return proxy[0]


# m-site profile IDs of the users to crawl
get_note_url_list([
    '554d98baa46e9626b84ebe39',
    '57367ccf50c4b4528a90f723',
    '54b0f079b4c4d65f85e76bdc',
    '5756621a82ec39663c40c45d',
    '59a65a295e87e75966563793',
    '561f967641a2b3550b12fd4c',
    '54f697fb4fac6379b6f3bc95',
    '596c805750c4b4218d929a59',
])
--------------------------------------------------------------------------------
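
Note: proxy_get() queries a local proxy pool but is never called by get_note(). If one wanted to route the note requests through such a proxy, a minimal sketch using urllib's ProxyHandler might look like the following. The open_with_proxy name and the assumption that a pool entry can be reduced to an "ip:port" string are illustrative; the actual JSON schema of the pool service is not shown in this repository.

```python
from urllib import request

def open_with_proxy(url, proxy_addr, timeout=10):
    # proxy_addr is assumed to be an "ip:port" string derived from proxy_get();
    # adapt the extraction to whatever JSON the local pool actually returns.
    handler = request.ProxyHandler({
        'http': 'http://' + proxy_addr,
        'https': 'http://' + proxy_addr,
    })
    opener = request.build_opener(handler)
    return opener.open(url, timeout=timeout)
```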