├── LICENSE
├── README.md
├── code
│   ├── .gitignore
│   ├── bookv2.py
│   ├── celebreties.py
│   ├── doubanUtils.py
│   ├── douban_critique.py
│   ├── douban_diary.py
│   ├── doubanboardcast.py
│   ├── doubanbook.py
│   ├── doubanmovie.py
│   ├── doubanmusic.py
│   ├── doulist.py
│   ├── moviev2.py
│   ├── personalCrawler.py
│   ├── series.py
│   └── simple_crawler.py
├── draft
│   └── develop_path.md
└── requirements.txt

/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 | 
3 | Copyright (c) [2020] [douban_clawer]
4 | 
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Douban Crawler
2 | 
3 | 
4 | ## What is it?
5 | 
6 | A very simple crawler for backing up one's DouBan account, since Douban accounts can be blocked without any notification.
7 | 
8 | 为了防止被豆瓣未告知便封禁账号,写了这个简单的爬虫脚本。
9 | 
10 | 
11 | ## Features
12 | 
13 | * DouBan Movie Back-up
14 | * DouBan Reading Back-up
15 | * DouBan Music Back-up
16 | * DouBan Broadcast crawler
17 | * DouBan Diary Back-up
18 | * DouBan Critique Back-up
19 | * Movie label feature
20 | * Book label feature
21 | * DouBan Dou-List back-up
22 | 
23 | 
24 | ## Install
25 | 
26 | - 安装python、pip
27 | - 切换项目目录
28 | 
29 | ```shell
30 | cd {project_path}/douban_crawler
31 | pip install virtualenv
32 | virtualenv venv
33 | source venv/bin/activate
34 | pip install -r requirements.txt
35 | python personalCrawler.py
36 | 
37 | ```
38 | 
39 | ## Application
40 | 
41 | ### [DouBan Run Away Plan](https://www.notion.so/jimsun6428/for-Share-26945cf67a2a407cb9f381109dd438a1)
42 | Use `personalCrawler.py` to back up Douban book and movie markings, then import the CSV into Notion.
43 | 
44 | ### Book, Movie, Music [Planning System](https://www.notion.so/jimsun6428/for-Share-9248be8af2144960858de9cb9a3e75c2) based on Douban & Notion
45 | Use `doulist.py` to download a doulist and import it into a Notion calendar template to create a planning system.
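The backup scripts (`moviev2.py`, `bookv2.py`, invoked through `personalCrawler.py`) write their CSV files with `utf-8_sig` encoding, so they open cleanly in Excel and can be imported into Notion directly. A minimal sketch of inspecting a backup before importing it (the filename below is a placeholder; use whatever file the script actually generated):

```python
import csv

# placeholder filename; real backups are named like "<douban id>-<timestamp>看过plus.csv"
with open('myid-2020-01-01 12-00-00看过plus.csv', encoding='utf-8_sig', newline='') as f:
    for row in csv.DictReader(f):
        # every row carries subjectId, 标记日期 and 豆瓣链接, plus the movie- or book-specific fields
        print(row['subjectId'], row.get('电影名') or row.get('书名'), row['标记日期'])
```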
46 | 47 | 48 | ## About 49 | 50 | * 这个小小的项目结束~~ 51 | -------------------------------------------------------------------------------- /code/.gitignore: -------------------------------------------------------------------------------- 1 | *.spec 2 | *.zip 3 | *.exe 4 | *.csv 5 | *.pyc 6 | .DS_Store 7 | 8 | build/ 9 | venv/**/ 10 | __pycache__/ 11 | dist/ 12 | 13 | -------------------------------------------------------------------------------- /code/bookv2.py: -------------------------------------------------------------------------------- 1 | import requests, traceback 2 | from bs4 import BeautifulSoup 3 | from time import sleep 4 | from random import uniform,choice 5 | from doubanUtils import * 6 | 7 | headers0 = {'User-Agent':getAgent()} 8 | 9 | class Douban_Book: 10 | def __init__(self,doubanid): 11 | self.s=requests.Session() 12 | #加上头部 13 | self.s.headers.update(headers0) 14 | self.id=doubanid 15 | #wish dict format: {bookid:[书名,作者,译者,原作名,出版社,出版年,页数,ISBN,评分,评分人数]} 16 | self.wish_dict={} 17 | self.itemKeys=['subjectId','书名','封面','作者','译者','原作名','丛书',\ 18 | '出版社','出版年','页数','ISBN','评分','评分人数','标记日期','短评们'] 19 | #saw dict format: {bookid:[书名,作者,译者,出版社,出版年,页数,ISBN,评分,评分人数,用户评分,评论,标记日期]} 20 | self.sawKeys = self.itemKeys + ['用户标签','用户评分','短评'] 21 | self.saw_dict={} 22 | self.head='https://book.douban.com/subject/' 23 | 24 | def get_soup(self,url): 25 | req = self.s.get(url) 26 | return BeautifulSoup(req.text,'html.parser'), req.status_code 27 | 28 | def wish_get(self,item): 29 | date = item(class_=re.compile('date'))[0].get_text(strip=True) 30 | name = item(href=re.compile('subject'))[0].get_text(strip=True) 31 | url = item.find(href=re.compile('subject')).get('href') 32 | bid = url.split('/')[-2] 33 | return date,name,url,bid 34 | 35 | def wish_store(self,wishes,lastBid): 36 | for item in wishes: 37 | date,name,url,bid = self.wish_get(item) 38 | if (lastBid == str(bid)): 39 | return -1 40 | self.wish_dict[bid]={'书名':name,'豆瓣链接':url,\ 41 | '标记日期':date,'subjectId':bid} 42 | 43 | def Wish(self): 44 | # 豆瓣图书反爬机制 45 | homepage='https://book.douban.com/people/'+self.id 46 | self.s.get(homepage) 47 | self.s.get(homepage+'/wish') 48 | 49 | print('\n开始爬取'+self.id+'的想读列表') 50 | beg,end = pageControl(10) 51 | page=beg 52 | firstpage='https://book.douban.com/people/'+self.id+\ 53 | '/wish?sort=time&start='+str((beg-1)*30)\ 54 | +'&filter=all&mode=list&tags_sort=count' 55 | soup, status = self.get_soup(firstpage) 56 | print(f'第{page}页',status) 57 | 58 | lastBid = getLastBackUpItem(self.id,"想读") 59 | 60 | #get book name and id 61 | if (self.wish_store(soup.find_all(class_='item'),lastBid) == -1): 62 | self.feature_helper(self.wish_dict) 63 | return self.wish_dict 64 | next_ = hasNextPage(soup) 65 | 66 | #get all wish list 67 | while (next_!=False) and (page < end): 68 | NextPage = 'https://book.douban.com'+next_ 69 | soup, status = self.get_soup(NextPage) 70 | page += 1 71 | print(f'第{page}页',status) 72 | if (self.wish_store(soup.find_all(class_='item'),lastBid) == -1): 73 | self.feature_helper(self.wish_dict) 74 | return self.wish_dict 75 | next_ = hasNextPage(soup) 76 | 77 | #add feature for every book 78 | self.feature_helper(self.wish_dict) 79 | return self.wish_dict 80 | 81 | def feature_helper(self, dic): 82 | #add feature for every book 83 | print('一共有{}本书'.format(len(dic.keys()))) 84 | count=0 85 | st=perf_counter() 86 | total=len(dic) 87 | fail=[] 88 | for bid in dic.keys(): 89 | count+=1 90 | if count%50==0: 91 | sleep(10) 92 | sleep(uniform(1,2)) 93 | timebar(30,st,count/total) 94 | 
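# get_feature returns the book id when its page could not be fetched (None on success); failed ids are retried below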
fail.append(self.get_feature(bid,dic)) 95 | print('\n再次尝试打开失败的书籍页') 96 | # sleep(10) 97 | for fbid in fail: 98 | if fbid!=None: 99 | sleep(2) 100 | print() 101 | self.get_feature(fbid,dic) 102 | 103 | def get_feature(self,bid,dic): 104 | head=self.head 105 | try: 106 | req2=self.s.get(head+bid) 107 | print(' '+dic[bid]['书名']+' 状态:',req2.status_code,end=' ') 108 | if req2.status_code == requests.codes.ok: 109 | soup2=BeautifulSoup(req2.text,'html.parser') 110 | c=soup2.find(id='info').text.replace('\xa0','').replace('\n ','') 111 | intro=c.split('\n') 112 | for i in intro: 113 | if ':' in i : 114 | i=i.replace(' ','') 115 | key,value=i.split(':',1) 116 | dic[bid][key]=value 117 | dic[bid]['封面']=soup2.find('img').get('src') 118 | dic[bid]['出版年']=getYear(dic[bid]['出版年']) 119 | try: 120 | dic[bid]['短评们']=getShortComments(soup2.findAll(class_="comment")) 121 | except: 122 | dic[bid]['短评们']='...' 123 | try: 124 | dic[bid]['评分']=soup2.find(property=re.compile('average')).text.strip(' ') 125 | except: 126 | dic[bid]['评分']='' 127 | try: 128 | dic[bid]['评分人数']=soup2.find(class_="rating_people").span.text.strip(' ') 129 | except: 130 | dic[bid]['评分人数']='0' 131 | except Exception as e: 132 | print('\r打开书籍页失败,失败的书籍链接:'+head+bid) 133 | print(e) 134 | self.switch_header() 135 | return bid 136 | 137 | def saw_store(self,saw,lastBid): 138 | for item in saw: 139 | date,star,comment,owntag,name,bid=self.saw_get(item) 140 | if (lastBid == str(bid)): 141 | return -1 142 | self.saw_dict[bid]={'书名':name,'封面':'','豆瓣链接':self.head+bid,\ 143 | '标记日期':date,'用户评分':star,'短评':comment,\ 144 | '用户标签':owntag,'subjectId':bid} 145 | 146 | def saw_get(self,saw): 147 | date=saw(class_=re.compile('date'))[0].get_text(strip=True) 148 | try: 149 | star=saw(class_=re.compile('rat'))[0]['class'][0][6] 150 | except: 151 | star='' 152 | try: 153 | comment=saw(class_=re.compile('comment'))[0].get_text(strip=True) 154 | except: 155 | comment='' 156 | try: 157 | owntag_list=saw.find(class_='tags').get_text(strip=True).split(': ',1)[1].split(' ') 158 | owntag='/'.join(owntag_list) 159 | except: 160 | owntag='' 161 | name=saw.find(href=re.compile('subject')).get_text(strip=True) 162 | bid=saw.find(href=re.compile('subject')).get('href').split('/')[-2] 163 | return date,star,comment,owntag,name,bid 164 | 165 | def Saw(self): 166 | # 豆瓣图书反爬机制 167 | homepage='https://book.douban.com/people/'+self.id 168 | self.s.get(homepage) 169 | 170 | print('\n开始爬取'+self.id+'的读过列表') 171 | beg, end = pageControl(10) 172 | page=beg 173 | 174 | Sfirstpage='https://book.douban.com/people/'+self.id+\ 175 | '/collect?&sort=time&start='+str((beg-1)*30)+\ 176 | '&filter=all&mode=list' 177 | soup, status = self.get_soup(Sfirstpage) 178 | print(f'第{page}页',status) 179 | 180 | lastBid = getLastBackUpItem(self.id,"读过") 181 | 182 | #get book name and id 183 | if (self.saw_store(soup.find_all(class_='item'),lastBid) == -1): 184 | self.feature_helper(self.saw_dict) 185 | return self.saw_dict 186 | next_ = hasNextPage(soup) 187 | 188 | #get all saw list 189 | while (next_ != False) and (page < end): 190 | sleep(1.3) 191 | NextPage='https://book.douban.com'+next_ 192 | soup, status = self.get_soup(NextPage) 193 | page += 1 194 | print(f'第{page}页',status) 195 | if (self.saw_store(soup.find_all(class_='item'),lastBid) == -1): 196 | self.feature_helper(self.saw_dict) 197 | return self.saw_dict 198 | next_ = hasNextPage(soup) 199 | 200 | #add feature for every book 201 | self.feature_helper(self.saw_dict) 202 | return self.saw_dict 203 | 204 | def save_helper(self, dic, 
Type): 205 | with open(fn(self.id+'-'+getFormatTime()+Type+'plus.csv'),\ 206 | 'a',encoding='utf-8_sig') as f: 207 | fieldNames = self.sawKeys if Type == '读过' else self.itemKeys 208 | writer = csv.DictWriter(f, fieldnames=fieldNames, restval="...", extrasaction='ignore') 209 | writer.writeheader() 210 | for bid in dic.keys(): 211 | writer.writerow(dic[bid]) 212 | 213 | def save_as_csv(self,choice): 214 | if choice in ['a','c']: 215 | self.save_helper(self.wish_dict, '想读') 216 | if choice in ['b','c']: 217 | self.save_helper(self.saw_dict, '读过') 218 | 219 | def switch_header(self): 220 | headers0['User-Agent']=choice(user_agent_list) 221 | self.s.headers.update(headers0) 222 | 223 | def add_cookies(self,raw_cookies): 224 | cookies=getCookie(raw_cookies) 225 | self.s.cookies.update(cookies) 226 | 227 | 228 | def main(self): 229 | print(''' 230 | 以下为选项 231 | A:想读列表 232 | B:读过列表 233 | C:想读+读过''') 234 | ans2=input('请输入你需要爬取的内容:') 235 | ans2=ans2.lower() 236 | if ans2=='a': 237 | self.Wish() 238 | elif ans2=='b': 239 | self.Saw() 240 | elif ans2=='c': 241 | self.Wish() 242 | self.Saw() 243 | self.save_as_csv(choice=ans2) 244 | 245 | def main(): 246 | print('嘿,据说你想要备份你的豆瓣书籍记录?') 247 | print('''你需要知道: 248 | 1. 本程序是一个爬虫程序,在爬取书籍条目特征时会产生大量的网页访问,爬完后你的ip也许会被豆瓣封一段时间(登陆账号还是可以用啦)。 249 | 2. 大量的网页访问意味着需要大量的流量。 250 | 3. 爬取成功后,你的文件(csv)会被存储在该exe目录下,请不要在压缩包内使用该程序,解压后再使用。 251 | 4. 可能会比较耗时。''') 252 | ans1=input('请确定你要开始备份(yes/no): ') 253 | if ans1=='yes': 254 | Douid=input('请输入你的豆瓣id: ') 255 | clawer=Douban_Book(doubanid=Douid) 256 | # book.douban.com 有反爬,需要cookies 257 | print("由于豆瓣图书的防爬虫机制,需要你提供cookies") 258 | raw_cookies=input('输入cookies: ') 259 | clawer.add_cookies(raw_cookies) 260 | clawer.main() 261 | print('\n问题反馈:jimsun6428@gmail.com | https://github.com/JimSunJing/douban_clawer') 262 | 263 | 264 | if __name__ == '__main__': 265 | try: 266 | main() 267 | except Exception as e: 268 | traceback.print_exc() 269 | finally: 270 | sleep(10) 271 | over=input('按任意键退出') -------------------------------------------------------------------------------- /code/celebreties.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import re 3 | from bs4 import BeautifulSoup 4 | from time import sleep 5 | from doubanUtils import * 6 | 7 | user_agent_list = ["Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36", 8 | "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36", 9 | "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.62 Safari/537.36", 10 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36" 11 | ] 12 | headers0 = {'User-Agent':user_agent_list[3]} 13 | 14 | 15 | 16 | class Celebreties_Crawler: 17 | 18 | def __init__(self, url): 19 | self.s = requests.Session() 20 | self.s.headers.update(headers0) 21 | self.url = url 22 | self.list_name='' 23 | self.finalOutput = '' 24 | self.urlHead = url.split('?')[0] 25 | 26 | def saveItems(self,items): 27 | for item in items: 28 | title = item.h6.a.get_text(strip=True) 29 | link = item.h6.a.get('href') 30 | try: 31 | picList = item.img.get('src').split('.')[:-1] 32 | picList.append('jpg') 33 | picUrl = '.'.join(picList) 34 | except: 35 | picUrl = '' 36 | try: 37 | year = item.h6.span.get_text(strip=True).replace('(','').replace(')','') 38 | except: 39 | year = "" 40 | 41 | try: 42 | info1 = 
item.dl.dl.find_all('dd')[0].get_text(strip=True) 43 | info2 = item.dl.dl.find_all('dd')[1].get_text(strip=True) 44 | except: 45 | info1 = '' 46 | info2 = '' 47 | 48 | try: 49 | star = item.find(class_=re.compile('star')).find_all('span')[1].get_text(strip=True) 50 | except: 51 | star = '' 52 | 53 | res = [title,link,picUrl,star,year,info1,info2] 54 | row = '"' + '","'.join(res) + '"\n' 55 | self.finalOutput += row 56 | print(row) 57 | 58 | def walk_through(self): 59 | url = self.url 60 | req = self.s.get(url) 61 | page = 1 62 | print(f'第{page}页:',req.status_code) 63 | soup=BeautifulSoup(req.text,'html.parser') 64 | self.list_name=soup.h1.text 65 | print('开始爬取豆列:'+self.list_name) 66 | items = soup.find(class_='grid_view').find('ul',class_='').find_all('li') 67 | self.saveItems(items) 68 | # 对每一页爬取 69 | while 1: 70 | sleep(2) 71 | try: 72 | page+=1 73 | soup=doubanUtils.nextPageLink(self.s,soup,page,self.urlHead) 74 | except: 75 | break 76 | else: 77 | items=soup.find(class_='grid_view').find('ul',class_='').find_all('li') 78 | self.saveItems(items) 79 | # 输出 80 | with open(fn(self.list_name)+'_subjects.csv','a',encoding='utf-8_sig') as f: 81 | f.write('电影名,豆瓣链接,封面,豆瓣评分,年份,导演,主演\n') 82 | f.write(self.finalOutput) 83 | 84 | if __name__ == '__main__': 85 | print('这是一个爬取豆瓣导演/明星作品的程序,取决于作品的数量和内容可能会发出大量的请求,甚至可能被豆瓣屏蔽ip一段时间。') 86 | ch0=input('请确定你要备份豆列(yes/no):') 87 | if ch0.lower()=='yes': 88 | URL=input('请输入需要备份的【网页地址】:') 89 | crawler=Celebreties_Crawler(URL) 90 | crawler.walk_through() 91 | print('\n问题反馈:jimsun6428@gmail.com | https://github.com/JimSunJing/douban_clawer') 92 | sleep(8) 93 | over=input('按任意键退出') -------------------------------------------------------------------------------- /code/doubanUtils.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import csv, os, os.path, re 3 | from functools import reduce 4 | from bs4 import BeautifulSoup 5 | from time import localtime,strftime,perf_counter,strptime 6 | 7 | user_agent_list = ["Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36", 8 | "Mozilla/5.0 (Windows NT 10.0; …) Gecko/20100101 Firefox/61.0", 9 | "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36", 10 | "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.135 Safari/537.36 Edg/84.0.522.63", 11 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36", 12 | "Mozilla/5.0 (Macintosh; U; PPC Mac OS X 10.5; en-US; rv:1.9.2.15) Gecko/20110303 Firefox/3.6.15", 13 | ] 14 | 15 | def getAgent(n=3): 16 | return user_agent_list[n] 17 | 18 | def hasNextPage(soup): 19 | try: 20 | NextPage=soup.find(class_='next').link.get('href') 21 | return NextPage 22 | except: 23 | return False 24 | 25 | def nextPageLink(sess,soup,page,head=""): 26 | NextPage=soup.find(class_='next').link.get('href') 27 | req=sess.get(head + NextPage) 28 | print(f'第{page}页:',req.status_code) 29 | return BeautifulSoup(req.text,'html.parser') 30 | 31 | # file name 32 | def fn(name): 33 | return name.replace('\\','-').replace('/','-')\ 34 | .replace(':','-').replace('*','-').replace('"','“')\ 35 | .replace('<','《').replace('>','》').replace('|','-').replace('?','?') 36 | 37 | # page control 38 | def pageControl(limit=50): 39 | beg=eval(input('请输入你要爬取的起始页码(比如1):')) 40 | end=eval(input('请输入终止页码(建议一次爬取{}页以下):'.format(limit))) 41 | return beg, end 42 | 43 
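# timebar below renders a one-line console progress bar showing the percentage done and the elapsed minutes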
| 44 | def timebar(scale,start,p): 45 | a='※'*round(p*scale) 46 | b='.'*(scale-round(p*scale)) 47 | dur=(perf_counter()-start)/60 48 | print("\r{:^3.0f}%[{}->{}]已运行{:.2f}分钟"\ 49 | .format(p*100,a,b,dur),end=' ') 50 | 51 | def noco(txt): 52 | if len(txt)==0: return '...' 53 | return txt.replace(',','、').replace(',','、').replace('\n',' ') 54 | 55 | 56 | def getFormatTime(): 57 | return strftime("%Y-%m-%d %H-%M-%S", localtime()) 58 | 59 | def string2Time(s): 60 | return strptime(s, '%Y-%m-%d %H-%M-%S') 61 | 62 | def fileTimeCompare(fn1, fn2): 63 | fn1 = fn1.replace(".csv","").split('-',1)[1][:-6] 64 | fn2 = fn2.replace(".csv","").split('-',1)[1][:-6] 65 | return string2Time(fn1) > string2Time(fn2) 66 | 67 | def getLastBackUpItem(douId,Type): 68 | # 获取上次文件 69 | matchFiles = [] 70 | # 文件名 71 | fnMatch = r"iiid-\d{4}-\d{2}-\d{2} \d{2}-\d{2}-\d{2}tttypeplus.csv"\ 72 | .replace('iiid',douId).replace('tttype',Type) 73 | for _, _, files in os.walk("."): 74 | for file in files: 75 | # print(file) 76 | if re.match(fnMatch,file): 77 | matchFiles.append(file) 78 | ## 得到最新的电影名 79 | if len(matchFiles) != 0: 80 | latest = reduce(lambda x,y: x if fileTimeCompare(x,y) else y,\ 81 | matchFiles) 82 | with open(latest, 'r', encoding='utf-8_sig') as f: 83 | reader = csv.DictReader(f) 84 | # 获取第一行电影的id 85 | try: 86 | row = reader.__next__() 87 | return row['subjectId'] 88 | except: 89 | return None 90 | else: 91 | return None 92 | 93 | def getCookie(raw_cookies): 94 | cookies={} 95 | for line in raw_cookies.split(';'): 96 | key,value=line.split('=',1) 97 | cookies[key]=value 98 | return cookies 99 | 100 | def getYear(raw): 101 | yearRex = r'([1|2][9|0]\d{2})' 102 | res = re.match(yearRex,raw) 103 | try: 104 | return res.group(1) 105 | except: 106 | return '' 107 | 108 | def getShortComments(comments): 109 | res = '' 110 | for com in comments: 111 | # 先得到评价用户名 112 | user = com.find(class_="comment-info").get_text(strip=True).replace('\xa0','').replace('\n','') 113 | res += user 114 | res += ':' 115 | short = com.find(class_="short").get_text(strip=True).replace('\xa0','').replace('\n','') 116 | res += short 117 | res += '; | ' 118 | return res.replace("看过"," ") -------------------------------------------------------------------------------- /code/douban_critique.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import re 3 | from bs4 import BeautifulSoup 4 | import time 5 | 6 | user_agent_list = ["Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36", 7 | "Mozilla/5.0 (Windows NT 10.0; …) Gecko/20100101 Firefox/61.0", 8 | "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36", 9 | "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.62 Safari/537.36", 10 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36", 11 | "Mozilla/5.0 (Macintosh; U; PPC Mac OS X 10.5; en-US; rv:1.9.2.15) Gecko/20110303 Firefox/3.6.15", 12 | ] 13 | 14 | headers0 = {'User-Agent':user_agent_list[3]} 15 | 16 | 17 | class critique: 18 | def __init__(self,doubanid): 19 | self.id=doubanid 20 | self.s=requests.Session() 21 | self.s.headers.update(headers0) 22 | self.CUs=[] 23 | self.CRs=[] 24 | 25 | def critique_list(self): 26 | review_url='https://www.douban.com/people/'+self.id+'/reviews' 27 | res0=self.s.get(review_url) 28 | soup=BeautifulSoup(res0.text,'html.parser') 29 | 
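# each review on the list page sits in an <h2> whose <a> points to the full review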
critique_list=soup.find_all('h2') 30 | self.CUs=[i.a.get('href') for i in critique_list] 31 | while 1: 32 | time.sleep(1.5) 33 | try: 34 | next_url='https://www.douban.com/people/'+self.id+'/'+soup.find(class_="next").link.get('href') 35 | except: 36 | break 37 | else: 38 | res=self.s.get(next_url) 39 | soup=BeautifulSoup(res.text,'html.parser') 40 | critique_list=soup.find_all('h2') 41 | for i in critique_list: 42 | self.CUs.append(i.a.get('href')) 43 | print('评论url如下:') 44 | print(self.CUs) 45 | print('开始访问每个评论...') 46 | for i in range(len(self.CUs)): 47 | if (i+1)%50==0: 48 | print('已经爬了50个评论,停2分钟') 49 | time.sleep(120) 50 | time.sleep(1.5) 51 | try: 52 | res=self.s.get(self.CUs[i]) 53 | self.CRs.append(res.text) 54 | print(f'打开第{i}篇评论',res.status_code) 55 | except Exception as e: 56 | print(f'打开第{i}篇评论失败。\n',e) 57 | 58 | def deal_with_text(self,h): 59 | html=BeautifulSoup(h,'html.parser') 60 | time=html.find(class_='main-meta').text+'\n' 61 | nf=html.find(id='link-report') 62 | words=str(nf(['p','blockquote','h2','h1','h3'])) 63 | if words=='[]': 64 | note=nf.text 65 | else: 66 | note=words.replace('
,','\n\n').replace('
, ','\n\n').replace('','')\ 67 | .replace('','**').replace(', ','\n\n').replace('','')\ 68 | .replace('
','>').replace(', ','\n\n').replace('','\n').replace('<','<').replace('>','>').replace('[','').replace(']','')\ 72 | .replace('rel=“nofollow” target="_blank">','') 73 | return time+note 74 | 75 | def save_html(self): 76 | file_name=self.id+"'s_Critique" 77 | count=1 78 | with open (file_name.replace('/','_')+".html","wb") as f: 79 | for file_content in self.CRs: 80 | #写文件用bytes而不是str,所以要转码 81 | f.write(bytes(file_content+'\n',encoding='utf-8')) 82 | print(f'第{count}页HTML完成') 83 | count+=1 84 | 85 | def save_text(self): 86 | count=0 87 | for n in self.CRs: 88 | count+=1 89 | with open(self.id+f'Critique_No.{count}.txt','w',encoding='utf-8_sig') as f: 90 | txt=self.deal_with_text(n) 91 | f.write(str(txt)) 92 | print(f"第{count}个评论已保存") 93 | 94 | def critique_main(): 95 | print('hello,这是一个备份豆瓣评论的程序。\n需要你自己的cookie用来爬取评论。') 96 | choice=input('该过程有风险,请确定你要开始备份(yes/no):') 97 | if choice=='yes': 98 | doubanid=input('请输入你的豆瓣id:') 99 | Cclaw=critique(doubanid) 100 | Cclaw.critique_list() 101 | choice2=input('请选择你要输出html结果(a)还是文本txt结果(b)或者我全都要(all):') 102 | choice2=choice2.lower() 103 | if choice2 == 'a': 104 | try: 105 | Cclaw.save_html() 106 | except Exception as e: 107 | print(e) 108 | print('储存html文件出错') 109 | else: 110 | print('成功') 111 | elif choice2 == 'b': 112 | try: 113 | Cclaw.save_text() 114 | except Exception as e: 115 | print(e) 116 | print('储存txt文件出错') 117 | else: 118 | print('成功') 119 | elif choice2 == 'all': 120 | try: 121 | Cclaw.save_html() 122 | Cclaw.save_text() 123 | except Exception as e: 124 | print(e) 125 | print('出错') 126 | print('程序结束,文件存在该exe目录中') 127 | print('问题反馈:jimsun6428@gmail.com | https://github.com/JimSunJing/douban_clawer') 128 | input('按任意键退出') 129 | 130 | critique_main() -------------------------------------------------------------------------------- /code/douban_diary.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import re 3 | from bs4 import BeautifulSoup 4 | import time 5 | 6 | user_agent_list = ["Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36", 7 | "Mozilla/5.0 (Windows NT 10.0; …) Gecko/20100101 Firefox/61.0", 8 | "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36", 9 | "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.62 Safari/537.36", 10 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36", 11 | "Mozilla/5.0 (Macintosh; U; PPC Mac OS X 10.5; en-US; rv:1.9.2.15) Gecko/20110303 Firefox/3.6.15", 12 | ] 13 | 14 | headers0 = {'User-Agent':user_agent_list[3]} 15 | 16 | class diary: 17 | def __init__(self,doubanid,cookie=None): 18 | self.id=doubanid 19 | self.s=requests.Session() 20 | self.s.headers.update(headers0) 21 | self.NUs=[] 22 | self.NRs=[] 23 | 24 | def add_cookie(self,cookie): 25 | cookies=getCookie(cookie) 26 | self.s.cookies.update(cookies) 27 | 28 | def note_list(self): 29 | diary_url='https://www.douban.com/people/'+self.id+'/notes' 30 | res0=self.s.get(diary_url) 31 | soup=BeautifulSoup(res0.text,'html.parser') 32 | page=1 33 | print(f'第{page}页',res0.status_code) 34 | note_list=soup.find_all(id=re.compile('naf-')) 35 | self.NUs=[i.get('href') for i in note_list] 36 | while 1: 37 | time.sleep(1.5) 38 | try: 39 | next_url=soup.find(class_="next").link.get('href') 40 | except: 41 | break 42 | else: 43 | page+=1 44 | res=self.s.get(next_url) 45 | 
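# log the HTTP status of the next notes page, then collect its note links (element ids matching 'naf-')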
print(f'第{page}页',res.status_code) 46 | soup=BeautifulSoup(res.text,'html.parser') 47 | note_list=soup.find_all(id=re.compile('naf-')) 48 | for i in note_list: 49 | self.NUs.append(i.get('href')) 50 | print('日记url如下:') 51 | print(self.NUs) 52 | print('开始访问每个日记...') 53 | for i in range(len(self.NUs)): 54 | if (i+1)%50==0: 55 | print('已经爬了50个日记,停2分钟') 56 | time.sleep(120) 57 | time.sleep(1.5) 58 | try: 59 | res=self.s.get(self.NUs[i]) 60 | self.NRs.append(res.text) 61 | print(f'打开第{i}篇日记',res.status_code) 62 | except Exception as e: 63 | print(f'打开第{i}篇日记失败。\n',e) 64 | 65 | def deal_with_text(self,h): 66 | html=BeautifulSoup(h,'html.parser') 67 | time=html.find(class_='pub-date').text+'\n' 68 | nf=html.find(id=re.compile('full')).find(class_='note') 69 | words=str(nf(['p','blockquote','h2','h1','h3'])) 70 | if words=='[]': 71 | note=nf.text 72 | else: 73 | note=words.replace(',
','\n\n').replace('
, ','\n\n').replace('','')\ 74 | .replace('','**').replace(', ','\n\n').replace('','')\ 75 | .replace('
','>').replace(', ','\n\n').replace('','\n').replace('<','<').replace('>','>').replace('[','').replace(']','')\ 79 | .replace('rel=“nofollow” target="_blank">','') 80 | return time+note 81 | 82 | def save_html(self): 83 | file_name=self.id+"'s_Diary" 84 | count=1 85 | with open (file_name.replace('/','_')+".html","wb") as f: 86 | for file_content in self.NRs: 87 | #写文件用bytes而不是str,所以要转码 88 | f.write(bytes(file_content+'\n',encoding='utf-8')) 89 | print(f'第{count}页HTML完成') 90 | count+=1 91 | 92 | def save_text(self): 93 | count=0 94 | for n in self.NRs: 95 | count+=1 96 | with open(self.id+f'Diary_No.{count}.txt','w',encoding='utf-8_sig') as f: 97 | txt=self.deal_with_text(n) 98 | f.write(str(txt)) 99 | print(f"第{count}个日记已保存") 100 | 101 | def getCookie(raw_cookies): 102 | cookies={} 103 | for line in raw_cookies.split(';'): 104 | key,value=line.split('=',1) #1代表只分一次,得到两个数据 105 | cookies[key]=value 106 | return cookies 107 | 108 | 109 | def main(): 110 | print('hello,这是一个备份豆瓣日记的程序。\n需要你自己的cookie用来爬取日记。') 111 | choice=input('该过程有风险,请确定你要开始备份(yes/no):') 112 | if choice=='yes': 113 | doubanid=input('请输入你的豆瓣id:') 114 | private=input('请选择:\nA.只需要备份他人可见的日记\nB.需要备份包括自己可见的日记(需要你提供自己的cookie)\n') 115 | dclaw=diary(doubanid) 116 | if private.lower()=='b': 117 | raw_cookies=input('请输入你的cookie(最后不要带空格):') 118 | dclaw.add_cookie(cookie=raw_cookies) 119 | dclaw.note_list() 120 | choice2=input('请选择你要输出html结果(a)还是文本txt结果(b)或者我全都要(all):') 121 | choice2=choice2.lower() 122 | if choice2 == 'a': 123 | try: 124 | dclaw.save_html() 125 | except Exception as e: 126 | print(e) 127 | print('储存html文件出错') 128 | else: 129 | print('成功') 130 | elif choice2 == 'b': 131 | try: 132 | dclaw.save_text() 133 | except Exception as e: 134 | print(e) 135 | print('储存txt文件出错') 136 | else: 137 | print('成功') 138 | elif choice2 == 'all': 139 | try: 140 | dclaw.save_html() 141 | dclaw.save_text() 142 | except Exception as e: 143 | print(e) 144 | print('出错') 145 | print('程序结束,文件存在该exe目录中') 146 | print('问题反馈:jimsun6428@gmail.com | https://github.com/JimSunJing/douban_clawer') 147 | input('按任意键退出') 148 | 149 | main() -------------------------------------------------------------------------------- /code/doubanboardcast.py: -------------------------------------------------------------------------------- 1 | from bs4 import BeautifulSoup 2 | import re 3 | import time 4 | import requests 5 | import random 6 | 7 | def getwords(item): 8 | txt=item.get_text(strip=False).replace(' ','').replace('\r','\n')\ 9 | .replace('\n\n\n\n\n\n','\$#n').replace('\n\n\n\n',' ').replace('\n','')\ 10 | .replace('$#','').replace('\xa0','').replace('\\n','\n').replace('+','') 11 | try: 12 | pic=item(class_=re.compile('view-large'))[0]['href'] 13 | except: 14 | pic='' 15 | return txt+pic 16 | 17 | def madeBox(txt): 18 | box='\t------------------------------------------------------------------------------------------\n'+'\t'+\ 19 | txt+'\n\t'+'------------------------------------------------------------------------------------------\n' 20 | return box 21 | 22 | def dealwithshare(txt1,txt2): 23 | li=txt1.split('\n') 24 | li[-4]=li[-4]+' @'+li[-3] 25 | for word in li: 26 | word.replace(' ','') 27 | li.remove(li[-3]) 28 | li2=txt2.split('\n') 29 | li.insert(-2,madeBox(''.join(li2[0:-3]))) 30 | return '\n'.join(li) 31 | 32 | 33 | 34 | headers = { 35 | 'Uesr-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.181 Safari/537.36', 36 | } 37 | rawji_america='ll="108297"; bid=TFLDE9t44mY; _pk_ses.100001.8cb4=*; 
__utmc=30149280; __utma=30149280.995009761.1561041533.1561041533.1561042634.2; __utmz=30149280.1561042634.2.2.utmcsr=google|utmccn=(organic)|utmcmd=organic|utmctr=(not%20provided); __utmt=1; dbcl2="198268851:+gNufdpzGAw"; ck=AnNe; ap_v=0,6.0; douban-profile-remind=1; push_noty_num=0; push_doumail_num=0; __utmv=30149280.19826; __yadk_uid=nFY3eG607ZoqEtBaMMWYVuqNCXZIycd6; douban-fav-remind=1; __utmb=30149280.12.10.1561042634; _pk_id.100001.8cb4=9f8810e4b7a61874.1561041531.1.1561043477.1561041531.' 38 | rawji='ll="108297"; bid=BQLi_2UIMh8; __utmc=30149280; __yadk_uid=Fl0aRuIUatWP1JCilVDTUzW1h2R71qWN; push_noty_num=0; push_doumail_num=0; __utmv=30149280.19826; ps=y; _vwo_uuid_v2=DD4476A9DC58A854DCFFF0D91547908DA|534c6354fc5886543fd8704a8eb02aeb; _pk_ref.100001.8cb4=%5B%22%22%2C%22%22%2C1561087673%2C%22https%3A%2F%2Faccounts.douban.com%2Faccounts%2Fsafety%2Funlock_phone%22%5D; _pk_ses.100001.8cb4=*; __utma=30149280.1984600914.1561080464.1561080464.1561087675.2; __utmz=30149280.1561087675.2.2.utmcsr=accounts.douban.com|utmccn=(referral)|utmcmd=referral|utmcct=/accounts/safety/unlock_phone; __utmt=1; dbcl2="198268851:Co+RFApa9xQ"; ck=g0Bz; douban-profile-remind=1; ap_v=0,6.0; douban-fav-remind=1; __gads=ID=750f05eb1a424666:T=1561087843:S=ALNI_MaWUrys775-4HBWVFaGDarZgSJRCA; _pk_id.100001.8cb4=3db8de030f64f76f.1561080462.2.1561087888.1561080727.; __utmb=30149280.21.10.1561087675' 39 | 40 | def getCookie(raw_cookies): 41 | cookies={} 42 | for line in raw_cookies.split(';'): 43 | key,value=line.split('=',1) #1代表只分一次,得到两个数据 44 | cookies[key]=value 45 | return cookies 46 | 47 | 48 | def getHtml(douid,raw_cookies=rawji,beg=1,end=10): 49 | html_list=[] 50 | cookies={} 51 | cookies=getCookie(raw_cookies) 52 | firstpage='https://www.douban.com/people/'+douid+'/statuses?p='+str(beg) 53 | s=requests.Session() 54 | res=s.get(firstpage,headers=headers,cookies=cookies) 55 | html_list.append(res.text) 56 | print(f'第{beg}页',res.status_code,res.reason) 57 | while beg
','\n\n').replace('
, ','\n\n').replace('','')\ 146 | .replace('','**').replace(', ','\n\n').replace('','')\ 147 | .replace('
','>').replace(', ','\n\n').replace('','\n').replace('<','<').replace('>','>').replace('[','').replace(']','')\ 151 | .replace('rel=“nofollow” target="_blank">','') 152 | return time+note 153 | 154 | def save_html(self): 155 | file_name=self.title 156 | with open (file_name+".html","wb") as f: 157 | #写文件用bytes而不是str,所以要转码 158 | f.write(bytes(self.NR.text,encoding='utf-8_sig')) 159 | print(self.title+' HTML完成') 160 | 161 | def save_text(self): 162 | with open(self.title+'.txt','w',encoding='utf-8_sig') as f: 163 | txt=self.deal_with_text() 164 | f.write(str(txt)) 165 | print(self.title+" TXT已保存") 166 | 167 | def claw(self): 168 | self.save_html() 169 | try: 170 | self.save_text() 171 | print(self.title+' OK') 172 | except: 173 | print(self.title+' 纯文字爬取失败') 174 | 175 | class DG: 176 | def __init__(self,S,title,url): 177 | self.s=S 178 | self.title=title 179 | self.url=url 180 | self.req=self.s.get(self.url) 181 | 182 | def save_html(self): 183 | file_name=self.title 184 | with open (file_name+".html","wb") as f: 185 | #写文件用bytes而不是str,所以要转码 186 | f.write(bytes(self.req.text+'\n',encoding='utf-8')) 187 | print(self.title+' HTML完成') 188 | 189 | def save_url(self): 190 | open(self.title+'.txt','w',encoding='utf-8_sig').write('链接:'+self.url) 191 | 192 | def claw(self): 193 | self.save_html() 194 | self.save_url() 195 | 196 | class Album: 197 | def __init__(self,S,title,url): 198 | self.s=S 199 | self.title=title 200 | self.url=url 201 | self.req=self.s.get(url) 202 | 203 | def save_album(self): 204 | try: 205 | mkdir(self.title) 206 | except: 207 | sleep(1) 208 | sloc=getcwd()+'\\\\'+self.title 209 | soup=BeautifulSoup(self.req.text,'html.parser') 210 | imgs=soup.find_all(class_='photo_wrap') 211 | for IMG in imgs: 212 | pu=IMG.img.get('src').replace('/m/','/l/') 213 | self.save_pic(pu,loc=sloc) 214 | while 1: 215 | sleep(2) 216 | try: 217 | NextPage=soup.find(class_='next').a.get('href') 218 | req=self.s.get(NextPage) 219 | soup=BeautifulSoup(req.text,'html.parser') 220 | except: 221 | break 222 | else: 223 | imgs=soup.find_all(class_='photo_wrap') 224 | for IMG in imgs: 225 | pu=IMG.img.get('src').replace('/m/','/l/') 226 | self.save_pic(pu,loc=sloc) 227 | 228 | def save_pic(self,purl,loc=''): 229 | sleep(2) 230 | preq=self.s.get(purl) 231 | name=purl.split('/')[-1].replace('.jpg','') 232 | if loc!='': 233 | name=loc+'\\\\'+name 234 | open(name+'.jpg','wb').write(preq.content) 235 | print(name+' Saved') 236 | 237 | def claw(self): 238 | if self.title=='one_pic': 239 | self.save_pic(self.url) 240 | else: 241 | choice=input(self.title+' 相册需要爬取吗(yes/no):') 242 | if choice=='yes': 243 | self.save_album() 244 | 245 | class Boardcast: 246 | def __init__(self,S,title,url): 247 | self.s=S 248 | self.title=title 249 | self.url=url 250 | self.req=self.s.get(url) 251 | 252 | def save_html(self): 253 | file_name=self.title 254 | with open (file_name+".html","ab") as f: 255 | #写文件用bytes而不是str,所以要转码 256 | f.write(bytes(self.req.text,encoding='utf-8')) 257 | print(self.title+'HTML完成') 258 | 259 | def save_text(self): 260 | soup=BeautifulSoup(self.req.text,'html.parser') 261 | try: 262 | t=soup.find_all('blockquote')[0].get_text(strip=True) 263 | except: 264 | return 265 | with open(self.title+'.txt','a',encoding='utf-8_sig') as f: 266 | f.write(str(t)) 267 | print(self.title+"TXT已保存") 268 | 269 | def claw(self): 270 | self.save_text() 271 | self.save_html() 272 | 273 | 274 | def main(): 275 | print('这是一个备份豆列的程序,取决于豆列的大小和内容会发出大量的请求,甚至可能被豆瓣屏蔽ip一段时间。') 276 | ch0=input('请确定你要备份豆列(yes/no)(备份相册请按0):') 277 | if 
ch0.lower()=='yes': 278 | listid=input('请输入需要备份豆列的id:') 279 | clawer=Douban_List(listid) 280 | skip=input('是否需要跳过“东西”条目?(yes/no):') 281 | if skip=='yes': 282 | clawer.skip_douxi=True 283 | clawer.get_urls() 284 | for item in clawer.notes: 285 | if path.exists(item[0]+'.html'): 286 | continue 287 | sleep(2) 288 | Note(clawer.s,item[0],item[1]).claw() 289 | clawer.switch_headers() 290 | #for item in clawer.album: 291 | #sleep(2) 292 | #Album(clawer.s,item[0],item[1]).claw() 293 | clawer.switch_headers() 294 | for item in clawer.boardcast: 295 | sleep(2) 296 | Boardcast(clawer.s,item[0],item[1]).claw() 297 | clawer.switch_headers() 298 | for item in clawer.groups: 299 | sleep(2) 300 | DG(clawer.s,item[0],item[1]).claw() 301 | clawer.switch_headers() 302 | for item in clawer.others: 303 | sleep(2) 304 | DG(clawer.s,item[0],item[1]).claw() 305 | elif ch0=='0': 306 | S=requests.Session() 307 | S.headers.update({'User-Agent':choice(user_agent_list)}) 308 | aid=input('请输入相册id:') 309 | aurl='https://www.douban.com/photos/album/'+aid 310 | r=S.get(aurl) 311 | soup=BeautifulSoup(r.text,'html.parser') 312 | title=soup.find('h1').get_text(strip=True) 313 | Album(S,title,aurl).claw() 314 | print('\n问题反馈:jimsun6428@gmail.com | https://github.com/JimSunJing/douban_clawer') 315 | 316 | 317 | main() 318 | sleep(10) 319 | over=input('按任意键退出') -------------------------------------------------------------------------------- /code/moviev2.py: -------------------------------------------------------------------------------- 1 | import requests, traceback 2 | from bs4 import BeautifulSoup 3 | from time import sleep 4 | from random import uniform,choice 5 | from doubanUtils import * 6 | 7 | headers0 = {'User-Agent':getAgent()} 8 | 9 | subject_head = 'https://movie.douban.com/subject/' 10 | movie_ppl_head = 'https://movie.douban.com/people/' 11 | 12 | class Douban_Movie: 13 | def __init__(self,doubanid): 14 | self.s=requests.Session() 15 | #加上头部 16 | self.s.headers.update(headers0) 17 | self.id=doubanid 18 | #wish dict format: {movieid:[电影名,上映日期,导演,编剧,主演,制片国家/地区,片长,评分,评分人数,标记日期,豆瓣链接]} 19 | self.wish_dict={} 20 | self.itemKeys=['subjectId','电影名','年份','豆瓣链接','封面','上映日期','导演','编剧',\ 21 | '主演','制片国家/地区','片长','豆瓣评分','评分人数','标记日期','IMDb链接',\ 22 | '语言','又名','类型','短评们'] 23 | self.sawKeys = self.itemKeys + ['用户标签','用户评分','短评'] 24 | #saw dict format: {movieid:[电影名,上映日期,导演,编剧,主演,制片国家/地区,片长,评分,评分人数,用户评分,评论,标记日期,豆瓣链接]} 25 | self.saw_dict={} 26 | self.proxies = { 27 | 'https': "http://95.179.219.61:8080", 28 | 'http': "http://61.7.138.240:8080" 29 | } 30 | 31 | def get_soup(self,url): 32 | req = self.s.get(url) 33 | return BeautifulSoup(req.text,'html.parser'), req.status_code 34 | 35 | def wish_get(self, wish): 36 | date=wish(class_=re.compile('date'))[0].get_text(strip=True) 37 | name=wish.find(href=re.compile('subject')).get_text(strip=True) 38 | mid=wish.find(href=re.compile('subject')).get('href').split('/')[-2] 39 | return date,name,mid 40 | 41 | def wish_store(self,wish,lastMid): 42 | for i in range(len(wish)): 43 | date,name,mid = self.wish_get(wish[i]) 44 | if (lastMid == str(mid)): 45 | return -1 46 | self.wish_dict[mid]=\ 47 | {'subjectId': mid,'电影名':name,'豆瓣链接':subject_head+mid,\ 48 | '标记日期':date} 49 | 50 | def Wish(self): 51 | print('\n开始爬取'+self.id+'的想看列表') 52 | beg, end = pageControl(10) 53 | page=beg 54 | firstpage= movie_ppl_head +\ 55 | self.id+'/wish?start='+str((beg-1)*30)+\ 56 | '&sort=time&rating=all&filter=all&mode=list' 57 | soup,status=self.get_soup(firstpage) 58 | print(f'第{page}页',status) 59 | 60 | # 
添加新特性,可以根据上次爬取历史中断重复爬取 61 | ## 要求上次爬取文件在当前脚本目录中 62 | lastMid = getLastBackUpItem(self.id,'想看') 63 | 64 | # get movie name and id 65 | if (self.wish_store(soup.find_all(class_=['item']), lastMid) == -1): 66 | # 爬到上次的条目了,可以结束爬取并存储新的 67 | self.feature_helper(self.wish_dict) 68 | return self.wish_dict 69 | next_ = hasNextPage(soup) 70 | 71 | #get all wish list 72 | while (next_!=False) and (page < end): 73 | sleep(1.3) 74 | NextPage='https://movie.douban.com'+next_ 75 | soup,status = self.get_soup(NextPage) 76 | page+=1 77 | print(f'第{page}页',status) 78 | if (self.wish_store(soup.find_all(class_=['item']), lastMid) == -1): 79 | # 爬到上次的条目了,可以结束爬取并存储新的 80 | self.feature_helper(self.wish_dict) 81 | return self.wish_dict 82 | next_ = hasNextPage(soup) 83 | 84 | #add feature for every movie 85 | self.feature_helper(self.wish_dict) 86 | return self.wish_dict 87 | 88 | def feature_helper(self, dic): 89 | count=0 90 | st=perf_counter() 91 | total=len(dic) 92 | fail=[] 93 | for mid in dic.keys(): 94 | count+=1 95 | if count%50==0: 96 | sleep(15) 97 | sleep(uniform(1,2)) 98 | timebar(30,st,count/total) 99 | fail.append(self.get_feature(mid,dic)) 100 | print('\n再次尝试打开失败的电影页') 101 | for fmid in fail: 102 | if fmid!=None: 103 | sleep(1.5) 104 | print() 105 | self.get_feature(fmid,dic) 106 | 107 | def get_feature(self,mid,dic): 108 | try: 109 | req2=self.s.get(subject_head+mid) 110 | print(' '+dic[mid]['电影名']+' 状态:',req2.status_code,end=' ') 111 | if req2.status_code == requests.codes.ok: 112 | soup2=BeautifulSoup(req2.text,'html.parser') 113 | c=soup2.find(id='info').text 114 | intro=c.split('\n') 115 | for i in intro: 116 | if ':' in i : 117 | key,value=i.split(':',1) 118 | dic[mid][key]=value.strip(' ') 119 | dic[mid]['封面']=soup2.find('img').get('src') 120 | try: 121 | dic[mid]['豆瓣评分']=soup2.find(property=re.compile('average')).text 122 | except: 123 | dic[mid]['豆瓣评分']='' 124 | try: 125 | dic[mid]['评分人数']=soup2.find(class_="rating_people").span.text 126 | except: 127 | dic[mid]['评分人数']='0' 128 | try: 129 | dic[mid]['年份']=getYear(dic[mid]['上映日期']) 130 | except: 131 | try: 132 | dic[mid]['年份']=getYear(dic[mid]['首播']) 133 | except: 134 | dic[mid]['年份']='...' 
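# also keep the short comments shown on the subject page, flattened into one string by getShortComments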
135 | dic[mid]['短评们']=getShortComments(soup2.findAll(class_="comment")) 136 | except: 137 | print('\r打开电影页失败,失败的电影链接:'+subject_head+mid) 138 | self.switch_header() 139 | return mid 140 | 141 | def saw_get(self,saw): 142 | date=saw(class_=re.compile('date'))[0].get_text(strip=True) 143 | try: 144 | star=saw(class_=re.compile('rat'))[0]['class'][0][6] 145 | except: 146 | star='' 147 | try: 148 | comment=saw(class_=re.compile('comment'))[0].get_text(strip=True) 149 | except: 150 | comment='' 151 | try: 152 | owntag_list=saw.find(class_='tags').get_text(strip=True).split(': ',1)[1].split(' ') 153 | owntag='/'.join(owntag_list) 154 | except: 155 | owntag='' 156 | name=saw.find(href=re.compile('subject')).get_text(strip=True) 157 | mid=saw.find(href=re.compile('subject')).get('href').split('/')[-2] 158 | return date,star,comment,owntag,name,mid 159 | 160 | def saw_store(self,saw,lastMid): 161 | for i in range(len(saw)): 162 | date,star,comment,owntag,name,mid=self.saw_get(saw[i]) 163 | if (lastMid == str(mid)): 164 | return -1 165 | self.saw_dict[mid]=\ 166 | {'subjectId': mid,'电影名':name,'豆瓣链接':subject_head+mid,\ 167 | '用户评分':star,'短评':comment,'用户标签':owntag,'标记日期':date,} 168 | 169 | def Saw(self): 170 | print('\n开始爬取'+self.id+'的看过列表') 171 | beg, end = pageControl(10) 172 | page=beg 173 | Sfirstpage = movie_ppl_head+self.id+'/collect?start='+\ 174 | str((beg-1)*30)+'&sort=time&rating=all&filter=all&mode=list' 175 | soup,status = self.get_soup(Sfirstpage) 176 | print(f'第{page}页',status) 177 | 178 | # 添加新特性,可以根据上次爬取历史中断重复爬取 179 | ## 要求上次爬取文件在当前脚本目录中 180 | lastMid = getLastBackUpItem(self.id,'看过') 181 | 182 | #get movie name and id 183 | if (self.saw_store(soup.find_all(class_=['item']), lastMid) == -1): 184 | #add feature for every movie 185 | self.feature_helper(self.saw_dict) 186 | return self.saw_dict 187 | 188 | next_ = hasNextPage(soup) 189 | #get all saw list 190 | while (next_ != False) and (page < end): 191 | sleep(1.3) 192 | NextPage='https://movie.douban.com' + next_ 193 | soup,status = self.get_soup(NextPage) 194 | page+=1 195 | print(f'第{page}页',status) 196 | if (self.saw_store(soup.find_all(class_=['item']), lastMid) == -1): 197 | #add feature for every movie 198 | self.feature_helper(self.saw_dict) 199 | return self.saw_dict 200 | next_ = hasNextPage(soup) 201 | 202 | #add feature for every movie 203 | self.feature_helper(self.saw_dict) 204 | return self.saw_dict 205 | 206 | def save_helper(self, dic, Type): 207 | with open(fn(self.id+'-'+getFormatTime()+Type+'plus.csv'),\ 208 | 'a',encoding='utf-8_sig') as f: 209 | fieldNames = self.sawKeys if Type == '看过' else self.itemKeys 210 | writer = csv.DictWriter(f, fieldnames=fieldNames, restval="...", extrasaction='ignore') 211 | writer.writeheader() 212 | for mid in dic.keys(): 213 | writer.writerow(dic[mid]) 214 | dic = {} 215 | 216 | def save_as_csv(self,choice): 217 | if choice in ['a','c']: 218 | #保存想看 219 | self.save_helper(self.wish_dict,'想看') 220 | if choice in ['b','c']: 221 | #保存看过 222 | self.save_helper(self.saw_dict,'看过') 223 | 224 | def switch_header(self): 225 | headers0['User-Agent']=choice(user_agent_list) 226 | self.s.headers.update(headers0) 227 | 228 | def add_cookies(self,raw_cookies): 229 | cookies=getCookie(raw_cookies) 230 | self.s.cookies.update(cookies) 231 | 232 | def movieMain(): 233 | print('嘿,据说你想要备份你的豆瓣电影记录?') 234 | print('''你需要知道: 235 | 1. 本程序是一个爬虫程序,在爬取电影条目特征时会产生大量的网页访问,爬完后你的ip也许会被豆瓣封一段时间(登陆账号还是可以用啦)。 236 | 2. 大量的网页访问意味着需要大量的流量。 237 | 3. 爬取成功后,你的文件(csv)会被存储在该exe目录下,请不要在压缩包内使用该程序,解压后再使用。 238 | 4. 
可能会比较耗时。''') 239 | ans1=input('请确定你要开始备份(yes/no): ') 240 | if ans1=='yes': 241 | Douid=input('请输入你的豆瓣id: ') 242 | clawer=Douban_Movie(doubanid=Douid) 243 | # 想要加cookies 244 | if (input('想要添加cookies(爬取豆瓣隐藏条目)可以添加cookie,输入c: ').lower()=='c'): 245 | raw_cookies = input("请输入cookies: ") 246 | clawer.add_cookies(raw_cookies) 247 | print(''' 248 | 以下为选项 249 | A:想看列表 250 | B:看过列表 251 | C:想看+看过''') 252 | ans2=input('请输入你需要爬取的内容:') 253 | ans2=ans2.lower() 254 | if ans2=='a': 255 | clawer.Wish() 256 | elif ans2=='b': 257 | clawer.Saw() 258 | elif ans2=='c': 259 | clawer.Wish() 260 | clawer.Saw() 261 | clawer.save_as_csv(choice=ans2) 262 | print('\n问题反馈:jimsun6428@gmail.com | https://github.com/JimSunJing/douban_clawer') 263 | 264 | if __name__ == '__main__': 265 | try: 266 | movieMain() 267 | except Exception as e: 268 | traceback.print_exc() 269 | finally: 270 | sleep(10) 271 | over=input('按任意键退出') -------------------------------------------------------------------------------- /code/personalCrawler.py: -------------------------------------------------------------------------------- 1 | import bookv2, moviev2, traceback 2 | from time import sleep 3 | 4 | if __name__ == '__main__': 5 | try: 6 | choice = input("图书备份请输入[b],电影备份输入[m]: ") 7 | if (choice.lower() == 'b'): 8 | bookv2.main() 9 | elif (choice.lower() == 'm'): 10 | moviev2.movieMain() 11 | except Exception as e: 12 | traceback.print_exc() 13 | sleep(10) 14 | finally: 15 | over=input('按任意键退出') -------------------------------------------------------------------------------- /code/series.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import re 3 | from bs4 import BeautifulSoup 4 | from time import sleep 5 | user_agent_list = ["Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36", 6 | "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36", 7 | "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.62 Safari/537.36", 8 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36" 9 | ] 10 | headers0 = {'User-Agent':user_agent_list[3]} 11 | 12 | 13 | # file name 14 | def fn(name): 15 | return name.replace('\\','-').replace('/','-')\ 16 | .replace(':','-').replace('*','-').replace('"','“')\ 17 | .replace('<','《').replace('>','》').replace('|','-').replace('?','?') 18 | 19 | 20 | class Series_Crawler: 21 | 22 | def __init__(self,seriesId): 23 | self.id=seriesId 24 | self.s=requests.Session() 25 | self.s.headers.update(headers0) 26 | self.list_name='' 27 | self.finalOutput = '' 28 | 29 | def get_urls(self): 30 | url='https://book.douban.com/series/'+self.id 31 | req=self.s.get(url) 32 | page=1 33 | print(f'第{page}页:',req.status_code) 34 | soup=BeautifulSoup(req.text,'html.parser') 35 | self.list_name=soup.h1.text 36 | print('开始爬取豆列:'+self.list_name) 37 | items=soup.find_all(class_='subject-item') 38 | for i in items: 39 | self.saveItem(i) 40 | while 1: 41 | sleep(2) 42 | try: 43 | NextPage=soup.find(class_='next').link.get('href') 44 | req=self.s.get(NextPage) 45 | page+=1 46 | print(f'第{page}页:',req.status_code) 47 | soup=BeautifulSoup(req.text,'html.parser') 48 | except: 49 | break 50 | else: 51 | items=soup.find_all(class_='subject-item') 52 | for i in items: 53 | self.saveItem(i) 54 | # 输出 55 | with open(fn(self.list_name)+'_subjects.csv','a',encoding='utf-8_sig') as f: 56 | 
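# write the CSV header first, then every row accumulated in finalOutput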
f.write('书名,豆瓣链接,封面,豆瓣评分,作者,出版信息,简介\n') 57 | f.write(self.finalOutput) 58 | 59 | def saveItem(self,item): 60 | title = item.h2.get_text(strip=True) 61 | link = item.h2.a.get('href') 62 | picList = item.img.get('src').split('.')[:-1] 63 | picList.append('jpg') 64 | picUrl = '.'.join(picList) 65 | # 获取评分 66 | try: 67 | rating = item.find(class_='rating_nums').get_text(strip=True) 68 | except: 69 | rating = 'nan' 70 | # 获取出版信息 71 | try: 72 | info = item.find(class_='pub').get_text(strip=True) 73 | # 将第一个信息,一般是作者名分离出来 74 | info = '","'.join(info.split('/',1)) 75 | except: 76 | info = 'nan","nan' 77 | # 获得简介 78 | try: 79 | desc = item.find('p').get_text(strip=True).replace('\n',' ') 80 | except: 81 | desc = 'nan' 82 | li = [title,link,picUrl,rating,info,desc] 83 | row = '"' + '","'.join(li) + '"' + '\n' 84 | self.finalOutput += row 85 | print(row) 86 | 87 | 88 | if __name__ == '__main__': 89 | print('这是一个爬取豆瓣丛书信息的程序,取决于丛书的大小和内容会发出大量的请求,甚至可能被豆瓣屏蔽ip一段时间。') 90 | ch0=input('请确定你要备份豆列(yes/no)') 91 | if ch0.lower()=='yes': 92 | listid=input('请输入需要备份丛书的id:') 93 | crawler=Series_Crawler(listid) 94 | crawler.get_urls() 95 | print('\n问题反馈:jimsun6428@gmail.com | https://github.com/JimSunJing/douban_clawer') 96 | sleep(10) 97 | over=input('按任意键退出') -------------------------------------------------------------------------------- /code/simple_crawler.py: -------------------------------------------------------------------------------- 1 | from bs4 import BeautifulSoup 2 | import re 3 | from time import sleep, ctime 4 | import requests 5 | import csv 6 | 7 | headers0 = {'User-Agent':"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.62 Safari/537.36"} 8 | 9 | def getList(doubanid,Type,subType,pageLimit=50,pageStart='0'): 10 | # 准备好session进行爬取 11 | sess = requests.Session() 12 | sess.headers.update(headers0) 13 | if (Type=='b'): sess.get('https://book.douban.com/people/'+doubanid) 14 | # 根据输入准备好URL模板 15 | prefix='' 16 | suffix=subType 17 | if Type=='f': 18 | prefix='movie' 19 | elif Type=='b': 20 | prefix='book' 21 | elif Type=='m': 22 | prefix='music' 23 | else: 24 | print("输入类型错误") 25 | return -1 26 | Module='https://'+prefix+'.douban.com/people/'+doubanid+'/'+\ 27 | suffix+'?start={start}&sort=time&rating=all&filter=all&mode=list' 28 | # 获取第一页 29 | request=sess.get(Module.format(start=pageStart)) 30 | page=1 31 | print(f'第{page}页',request.reason) 32 | List=[] 33 | soup=BeautifulSoup(request.text,'html.parser') 34 | dealWithSubjects(soup.find_all(class_='item-show'),List,subType) 35 | # 根据页面中的“下一页”判断是否继续爬取 36 | while page < pageLimit: 37 | try: 38 | if Type=='m': 39 | NextPage=soup.find(class_='next').link.get('href') 40 | else: 41 | NextPage='https://'+prefix+'.douban.com'+soup.find(class_='next').link.get('href') 42 | except: 43 | break 44 | else: 45 | request=sess.get(NextPage) 46 | page+=1 47 | print(f'第{page}页',request.reason) 48 | soup=BeautifulSoup(request.text,'html.parser') 49 | dealWithSubjects(soup.find_all(class_='item-show'),List,subType) 50 | sleep(1.3) 51 | fileName = '_'.join([doubanid,prefix,suffix,str(ctime())])+'.csv' 52 | with open(fileName.replace(':','-').replace(' ','_'),'w',encoding='UTF-8') as f: 53 | fieldName = list(List[0].keys()) 54 | fieldName.append('Type') 55 | writer = csv.DictWriter(f,fieldnames = fieldName) 56 | writer.writeheader() 57 | for dic in List: 58 | dic.update({'Type':Type}) 59 | writer.writerow(dic) 60 | 61 | 62 | def dealWithSubjects(itemList,container,Type): 63 | # 负责将subject内容装入List 64 | for item in itemList: 65 | doubanId 
= item.find('a',href=re.compile("subject")).get('href').split('/')[-2] 66 | title = item.find('a',href=re.compile("subject")).get_text(strip=True) 67 | if (Type!='collect'): 68 | dic = {'doubanId':doubanId,'title':title} 69 | else: 70 | try: 71 | star = item.find_all('span')[-1]['class'][0][-3] 72 | if not str.isdigit(star): star = '' 73 | except: 74 | star = '' 75 | date = item.find(class_='date').get_text(strip=True) 76 | dic = {'doubanId':doubanId,'title':title,'date':date,'star':star} 77 | container.append(dic) 78 | 79 | 80 | 81 | def main(): 82 | if input("这是一个爬取豆瓣个人书影音记录的程序,如果要开始请输入y: ")!='y': return 83 | douid=input('请输入你的豆瓣id: ') 84 | Type=input('输入你要爬取的类型(图书=b,电影=f,音乐=m): ') 85 | subType=input('输入想要爬取的内容(想看/想读/想听=w,看过/听过=c): ') 86 | pageStart=input('有想从固定页面开始吗?如果有请输入(默认1): ') 87 | pageLimit=input('限制爬取页数(默认50): ') 88 | if pageLimit!='': 89 | pageLimit = int(pageLimit) 90 | else: 91 | pageLimit = 50 92 | if pageStart=='': 93 | pageStart = '0' 94 | else: 95 | pageStart = str((int(pageStart)-1)*30) 96 | if subType=='w': 97 | subType='wish' 98 | elif subType=='c': 99 | subType='collect' 100 | else: 101 | print("爬取内容(想看,看过...)输入错误") 102 | return 103 | 104 | # try: 105 | getList(douid,Type,subType,pageLimit,pageStart) 106 | print("应该保存成功,请在程序所在文件夹查找") 107 | # except Exception as e: 108 | # print(e) 109 | # print('出错,请联系开发者') 110 | 111 | input("程序结束,按任意键退出") 112 | 113 | 114 | if __name__ == '__main__': 115 | main() -------------------------------------------------------------------------------- /draft/develop_path.md: -------------------------------------------------------------------------------- 1 | 2 | ## 想看列表爬取 3 | 4 | 5 | ```python 6 | #豆瓣电影主页:https://movie.douban.com/people/91835006/ 7 | #想看首页url:https://movie.douban.com/people/91835006/wish 8 | #按列表形式第一页:https://movie.douban.com/people/91835006/wish?start=0&sort=time&rating=all&filter=all&mode=list 9 | #第二页: https://movie.douban.com/people/91835006/wish?start=30&sort=time&rating=all&filter=all&mode=list 10 | #第三页: https://movie.douban.com/people/91835006/wish?start=60&sort=time&rating=all&filter=all&mode=list 11 | import urllib.request 12 | from bs4 import BeautifulSoup 13 | import re 14 | import time 15 | import random 16 | from selenium import webdriver 17 | ``` 18 | 19 | 20 | ```python 21 | url='https://movie.douban.com/people/91835006/wish?start=0&sort=time&rating=all&filter=all&mode=list' 22 | request=urllib.request.urlopen(url=url) 23 | data=request.read() 24 | print('Status:',request.status,request.reason) 25 | for k,v in request.getheaders(): 26 | print('{}: {}'.format(k,v)) 27 | ``` 28 | 29 | Status: 200 OK 30 | Date: Tue, 18 Jun 2019 05:27:32 GMT 31 | Content-Type: text/html; charset=utf-8 32 | Transfer-Encoding: chunked 33 | Connection: close 34 | Vary: Accept-Encoding 35 | X-Xss-Protection: 1; mode=block 36 | X-Douban-Mobileapp: 0 37 | Expires: Sun, 1 Jan 2006 01:00:00 GMT 38 | Pragma: no-cache 39 | Cache-Control: must-revalidate, no-cache, private 40 | Set-Cookie: ll="118161"; domain=.douban.com; path=/; expires=Wed, 17-Jun-2020 05:27:32 GMT 41 | Set-Cookie: bid=7_GMh7XrBRc; Expires=Wed, 17-Jun-20 05:27:32 GMT; Domain=.douban.com; Path=/ 42 | X-DOUBAN-NEWBID: 7_GMh7XrBRc 43 | X-DAE-Node: anson77 44 | X-DAE-App: movie 45 | Server: dae 46 | X-Content-Type-Options: nosniff 47 | 48 | 49 | 50 | ```python 51 | html=BeautifulSoup(data) 52 | items=html.find_all('a',href=re.compile("subject")) 53 | wish=[] 54 | for item in items: 55 | wish.append(item.string.replace(' ','').strip('\n')) 56 | ``` 57 | 58 | 59 | ```python 60 | 
str(wish[1]) 61 | ``` 62 | 63 | 64 | 65 | 66 | '纸月亮/PaperMoon' 67 | 68 | 69 | 70 | 71 | ```python 72 | NextPage='https://movie.douban.com'+html.find(class_='next').link.get('href') 73 | ``` 74 | 75 | 76 | ```python 77 | lastp='https://movie.douban.com/people/91835006/wish?start=390&sort=time&rating=all&filter=all&mode=list' 78 | lrequest=urllib.request.urlopen(url=lastp) 79 | ldata=lrequest.read() 80 | lasoup=BeautifulSoup(ldata) 81 | ``` 82 | 83 | 84 | ```python 85 | try: 86 | lasoup.find(class_='next').link.get('href') 87 | except: 88 | print('No Next') 89 | ``` 90 | 91 | No Next 92 | 93 | 94 | 95 | ```python 96 | 97 | def getWishList(doubanid='91835006'): 98 | firstpage='https://movie.douban.com/people/'+doubanid+'/wish?start=0&sort=time&rating=all&filter=all&mode=list' 99 | request=urllib.request.urlopen(url=firstpage) 100 | print('Status:',request.status,request.reason) 101 | wish_list=[] 102 | soup=BeautifulSoup(request.read()) 103 | for item in soup.find_all('a',href=re.compile("subject")): 104 | wish_list.append(item.string.replace(' ','').strip('\n')) 105 | while 1: 106 | try: 107 | NextPage='https://movie.douban.com'+soup.find(class_='next').link.get('href') 108 | except: 109 | break 110 | else: 111 | request=urllib.request.urlopen(url=NextPage) 112 | print('Status:',request.status,request.reason) 113 | soup=BeautifulSoup(request.read()) 114 | for item in soup.find_all('a',href=re.compile("subject")): 115 | wish_list.append(item.string.replace(' ','').strip('\n')) 116 | time.sleep(0.5) 117 | fw=open(doubanid+'_Wish_List.txt','w',encoding='utf-8_sig') 118 | fw.write('中文名 / 原名 \n') 119 | for item in wish_list: 120 | fw.write(str(item)+'\n') 121 | return wish_list 122 | ``` 123 | 124 | 125 | ```python 126 | wishl=getWishList() 127 | ``` 128 | 129 | Status: 200 OK 130 | Status: 200 OK 131 | Status: 200 OK 132 | Status: 200 OK 133 | Status: 200 OK 134 | Status: 200 OK 135 | Status: 200 OK 136 | Status: 200 OK 137 | Status: 200 OK 138 | Status: 200 OK 139 | Status: 200 OK 140 | Status: 200 OK 141 | Status: 200 OK 142 | Status: 200 OK 143 | 144 | 145 | 看过LIST 146 | 147 | 148 | ```python 149 | url='https://movie.douban.com/people/91835006/collect' 150 | request=urllib.request.urlopen(url=url) 151 | print('Status:',request.status,request.reason) 152 | soup=BeautifulSoup(request.read()) 153 | ``` 154 | 155 | Status: 200 OK 156 | 157 | 158 | 159 | ```python 160 | titandcom=soup.find_all(class_=['item']) 161 | ``` 162 | 163 | 164 | ```python 165 | def TCappend(TC,titandcom): 166 | for i in range(len(titandcom)): 167 | title=titandcom[i].em.text 168 | date=titandcom[i](class_=re.compile('date'))[0].text 169 | try: 170 | star=titandcom[i](class_=re.compile('rat'))[0]['class'][0][6] 171 | except: 172 | star='Nah' 173 | try: 174 | comment=titandcom[i](class_=re.compile('comment'))[0].text 175 | except: 176 | comment='Nah' 177 | TC[title]=[date,star,comment] 178 | ``` 179 | 180 | 181 | ```python 182 | def getSawList(doubanid='91835006'): 183 | firstpage='https://movie.douban.com/people/'+doubanid+'/collect' 184 | request=urllib.request.urlopen(url=firstpage) 185 | print('Status:',request.status,request.reason) 186 | saw_dic={} 187 | soup=BeautifulSoup(request.read()) 188 | tandc=soup.find_all(class_=['item']) 189 | TCappend(TC=saw_dic,titandcom=tandc) 190 | while 1: 191 | try: 192 | NextPage='https://movie.douban.com'+soup.find(class_='next').link.get('href') 193 | except: 194 | break 195 | else: 196 | request=urllib.request.urlopen(url=NextPage) 197 | 
print('Status:',request.status,request.reason) 198 | soup=BeautifulSoup(request.read()) 199 | tandc=soup.find_all(class_=['item']) 200 | TCappend(saw_dic,titandcom=tandc) 201 | time.sleep(0.5) 202 | fw=open(doubanid+'_Watched_List.csv','w',encoding='utf-8_sig') 203 | fw.write('中文名/原名,标记日期,评分,短评\n') 204 | for title in saw_dic.keys(): 205 | fw.write(title.replace(',','、').replace(',','、')+','+saw_dic[title][0]+\ 206 | ','+saw_dic[title][1]+','+saw_dic[title][2].replace(',','、').replace(',','、')+'\n') 207 | return saw_dic 208 | ``` 209 | 210 | 211 | ```python 212 | saw=getSawList() 213 | ``` 214 | 215 | Status: 200 OK 216 | Status: 200 OK 217 | Status: 200 OK 218 | Status: 200 OK 219 | Status: 200 OK 220 | Status: 200 OK 221 | Status: 200 OK 222 | Status: 200 OK 223 | Status: 200 OK 224 | Status: 200 OK 225 | Status: 200 OK 226 | Status: 200 OK 227 | Status: 200 OK 228 | Status: 200 OK 229 | Status: 200 OK 230 | Status: 200 OK 231 | Status: 200 OK 232 | Status: 200 OK 233 | Status: 200 OK 234 | Status: 200 OK 235 | Status: 200 OK 236 | Status: 200 OK 237 | Status: 200 OK 238 | Status: 200 OK 239 | Status: 200 OK 240 | Status: 200 OK 241 | Status: 200 OK 242 | Status: 200 OK 243 | Status: 200 OK 244 | Status: 200 OK 245 | Status: 200 OK 246 | Status: 200 OK 247 | Status: 200 OK 248 | Status: 200 OK 249 | Status: 200 OK 250 | Status: 200 OK 251 | Status: 200 OK 252 | 253 | 254 | 255 | ```python 256 | fw=open('91835006_Watched_List.csv','w',encoding='utf-8') 257 | fw.write('中文名/原名,标记日期,评分,短评\n') 258 | for title in saw.keys(): 259 | fw.write(title.replace(',','、').replace(',','、')+','+saw[title][0]+','+\ 260 | saw[title][1]+','+saw[title][2].replace(',','、').replace(',','、')+'\n') 261 | ``` 262 | 263 | 264 | ```python 265 | import pandas as pd 266 | ``` 267 | 268 | 269 | ```python 270 | data=pd.read_csv('_Watched_List.csv') 271 | ``` 272 | 273 | 274 | ```python 275 | data.head(10) 276 | ``` 277 | 278 | 279 | 280 | 281 |
|    | 中文名/原名 | 标记日期 | 评分 | 短评 |
|---|---|---|---|---|
| 0 | 来自未来的故事 / Stories from Our Future | 2019-06-16 | 4/5 | 第一集挺有趣 |
| 1 | 办公室 第二季 / The Office Season 2 | 2019-06-15 | 4/5 | 「当你入聘一家公司、发现大部分工作是babysitting你的小老板」 |
| 2 | 黑镜 第五季 / Black Mirror Season 5 | 2019-06-07 | 4/5 | 麦粒这集很棒呀、有种童话故事的感觉、善恶分明的故事果然非常满足观众。但转念一想、麦粒这集是很... |
| 3 | 切尔诺贝利 / Chernobyl | 2019-06-05 | 5/5 | 向敢于直面真相的人致敬!鄙视那些掩盖真相的国家机器。 |
| 4 | 杀死伊芙 第二季 / Killing Eve Season 2 | 2019-05-31 | 4/5 | Nah |
| 5 | 女巫 / The VVitch: A New-England Folktale | 2019-05-21 | 4/5 | 弟弟caleb好可爱啊、大女儿也美的发光。 |
| 6 | 海蒂和爷爷 / Heidi | 2019-05-19 | 4/5 | 真的是童话一般啊~淳朴的民风和善良的有钱人。希望世界真的如此美好、灯光一亮、梦醒了。 |
| 7 | 绑定 第一季 / Bonding Season 1 | 2019-05-16 | 4/5 | 警察一般把我们看作是妓女女主的这句话同时也描述了在大众眼光中、BDSM是怎样的形象。导演以如... |
| 8 | 大侦探皮卡丘 / Pokémon Detective Pikachu | 2019-05-12 | 2/5 | Nah |
| 9 | 性爱自修室 第一季 / Sex Education Season 1 | 2019-05-09 | 5/5 | 该死的校长!成年人的欺诈、偏执都在他身上淋漓尽致体现了。编剧第二季给我搞他! |
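Where this exploration ended up: the released scripts (`moviev2.py` together with `doubanUtils.py`) swap `urllib` for a `requests.Session` that carries a User-Agent header and reuse the same `class_='next'` pagination check. A condensed sketch of that loop, with the douban id and UA string as placeholders:

```python
import re
import requests
from bs4 import BeautifulSoup

def crawl_wish_titles(douban_id, max_pages=10):
    """Fetch wish-list titles page by page, following the 'next' link until it disappears."""
    s = requests.Session()
    s.headers.update({'User-Agent': 'Mozilla/5.0'})  # placeholder UA; the scripts pick one from user_agent_list
    url = (f'https://movie.douban.com/people/{douban_id}/wish'
           '?start=0&sort=time&rating=all&filter=all&mode=list')
    titles, page = [], 0
    while url and page < max_pages:
        soup = BeautifulSoup(s.get(url).text, 'html.parser')
        page += 1
        for a in soup.find_all('a', href=re.compile('subject')):
            titles.append(a.get_text(strip=True))
        nxt = soup.find(class_='next')
        url = 'https://movie.douban.com' + nxt.link.get('href') if nxt and nxt.link else None
    return titles
```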