├── .gitignore
├── README.md
├── crawler_learn.py
├── crawler.py
├── crawler_mail.py
└── thu_learn.py

/.gitignore:
--------------------------------------------------------------------------------
*.pyc
__pycache__/
learn/
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Memory_fetcher
**Save Tsinghua mailbox and Web Learning data to local storage in one step.**

---
## Usage
### 1. Installation
**Basic requirements:** a Python 3 environment on your machine (installing python3 and pip3 is not covered here). Python 2 might work in theory, but it has not been tested.

Three more things are needed:
#### 1.1 BeautifulSoup4
`pip3 install BeautifulSoup4`
#### 1.2 requests
`pip3 install requests`
#### 1.3 This project
Download the source code to a local folder and unpack it.

### 2. Usage
- The `crawler_learn.py` script downloads Web Learning data; it creates a `learn` folder in the current directory and stores the data per course.
- The `crawler_mail.py` script downloads Tsinghua mailbox data into a `mail` folder created in the current directory.

- Open a terminal in the project directory (preferably on a drive with plenty of free space) and run:
`python3 crawler_learn.py`
or
`python3 crawler_mail.py`

- Enter your username and password at the prompts and the download begins.

---
## Notes:
- While crawler_learn.py runs, the terminal prints a lot of debug output. Don't worry: that is the program auto-detecting the encoding of the Web Learning data.
- Mail downloaded from the Tsinghua mailbox may contain garbled text; this is a known bug that will hopefully be fixed when time allows.
- Special thanks to kehao95 for the [API](https://github.com/kehao95/thu_learn)
- **Happy graduation!**
--------------------------------------------------------------------------------
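
A minimal sketch of driving the `thu_learn` API directly, using the same calls that `crawler_learn.py` below makes; it assumes `login()` succeeds and only lists what it finds instead of downloading:

```python
# Illustrative sketch, not a repository file: list courses and their files
# using login() and Semester from thu_learn.py below.
from thu_learn import login, Semester

if login():  # prompts for a Tsinghua id and password when called without arguments
    for course in Semester(current=True).courses:
        print(course.name, course.id)
        for f in course.files:
            print('  %s  %.2f MB' % (f.name, f.size))
```
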
/crawler_learn.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
from thu_learn import *
import os

sizeThre = 1000  # download size limit, in MB

login()  # log in and load course data

semester = Semester(current=True)  # current=True: current semester; False: past semesters

for course in semester.courses:
    path = 'learn/' + course.name
    if not os.path.exists(path):
        os.makedirs(path)
    materialPath = path + '/materials/'
    if not os.path.exists(materialPath):
        os.makedirs(materialPath)
    workPath = path + '/works/'
    if not os.path.exists(workPath):
        os.makedirs(workPath)

    for file in course.files:
        if file.size < sizeThre:  # size limit in MB; course files rarely exceed 50 MB
            file.save(materialPath)

    for work in course.works:
        if work.answer is not None:
            work.answer.save(workPath)
        if work.file is not None:
            work.file.save(workPath)
--------------------------------------------------------------------------------
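
crawler_learn.py re-downloads everything on every run. A hedged sketch of a resume-friendly variant of its materials loop; it assumes a file is stored as `materialPath` plus `file.name`, which matches what `File.save` in thu_learn.py does:

```python
# Sketch only: skip files already fetched on an earlier run.
# Assumption: file.save(materialPath) writes to materialPath + file.name,
# as thu_learn.File.save does.
import os

for file in course.files:
    if os.path.exists(os.path.join(materialPath, file.name)):
        continue  # already downloaded
    if file.size < sizeThre:  # same size limit as above, in MB
        file.save(materialPath)
```
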
/crawler.py:
--------------------------------------------------------------------------------
#!/usr/bin/python3
# -*- coding: utf-8 -*-
import requests
import bs4
import re
import os

path = input("Enter directory:") or "./"  # fall back to the current directory on empty input
web = "http://www.cs.jhu.edu/~phi/ai/"
nameLenLimit = 100
count = 0
cnt = 0

session = requests.session()
r = session.get(web)
soup = bs4.BeautifulSoup(r.content, "html.parser")
p1 = re.compile(r'/[\w\-\.]*\.pdf')  # find the pdf files
p2 = re.compile(r'week\d*')  # not strictly necessary; only used to build nicer names
p3 = re.compile(r'lecture\d*')  # not strictly necessary; only used to build nicer names

for link in soup.find_all('a'):
    if link.get('href') is not None:
        result = p1.search(link.get('href'))
        if result is not None:  # found a target pdf file
            tmp = result.group()
            name = tmp[1:]
            if p2.match(name) is not None:
                name = name[:p2.match(name).end()] + '_' + link.string + name[p2.match(name).end():]
                print(name)
            elif p3.match(name) is not None:
                name = name[:p3.match(name).end()] + '_' + link.string + name[p3.match(name).end():]
                print(name)
            else:
                name = link.string + '.pdf'
                print(name)

            if len(name) > nameLenLimit:  # fall back to a numeric name when the title is too long
                print(count, name)
                name = str(count) + '.pdf'
                count = count + 1
            url = web + link.get('href')
            print(url)
            r = requests.get(url, stream=True)
            if not r.ok:
                continue  # skip failed downloads instead of writing an empty file
            if not os.path.exists(path):
                os.makedirs(path)
            with open(path + '/' + name, 'wb') as handle:
                cnt = cnt + 1
                print('start downloading No.', cnt)
                for block in r.iter_content(1024):
                    handle.write(block)
--------------------------------------------------------------------------------
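
crawler.py builds download URLs with plain string concatenation (`web + link.get('href')`), which only works while every href on that page is relative. A hedged alternative using `urllib.parse.urljoin` from the standard library resolves relative and absolute links uniformly (the paths below are hypothetical):

```python
# Sketch only: urljoin resolves an href against the page URL, whatever its form.
from urllib.parse import urljoin

web = "http://www.cs.jhu.edu/~phi/ai/"
print(urljoin(web, "notes/lecture1.pdf"))        # relative: http://www.cs.jhu.edu/~phi/ai/notes/lecture1.pdf
print(urljoin(web, "/~phi/other.pdf"))           # root-relative: http://www.cs.jhu.edu/~phi/other.pdf
print(urljoin(web, "http://example.org/x.pdf"))  # absolute URLs pass through unchanged
```
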
/crawler_mail.py:
--------------------------------------------------------------------------------
# coding:utf-8
import imaplib
import os
import getpass


class GetMail(object):
    @classmethod
    def mail_login(cls, mail_type, mail_ssl, mail_username, mail_password):
        """Log in to the mailbox and save the messages found in the inbox."""
        get_server = imaplib.IMAP4_SSL(mail_type, mail_ssl)
        get_server.login(mail_username, mail_password)
        get_server.select("INBOX")  # the default folder is INBOX
        typ, data = get_server.search(None, 'ALL')  # SEEN: read mail, UNSEEN: unread mail, ALL: everything
        # data is a one-element list holding all message numbers
        if data[0]:
            number_list = data[0].split()  # message numbers; higher numbers are more recent
            print(number_list[-1])
            for the_mail_number in number_list:
                # full message content; str() of the raw fetch result is what
                # produces the garbled text mentioned in the README
                mail_data = str(get_server.fetch(the_mail_number, '(RFC822)')[1])
                # skip messages that look like they carry image attachments
                if '.png' in mail_data:
                    continue
                if '.gif' in mail_data:
                    continue
                if '.jpg' in mail_data:
                    continue
                if '._' in mail_data:
                    continue
                if '@.' in mail_data:
                    continue
                print(the_mail_number)
                tmp1 = mail_data.find('Subject:')
                print(mail_data[tmp1:tmp1 + 30])
                file_name = 'mail/' + str(int(the_mail_number)) + '.html'
                with open(file_name, 'w') as f:
                    f.write(mail_data)
        else:
            print("No messages found")


def process_start(user_id, user_pass):
    GetMail.mail_login(mail_type='mails.tsinghua.edu.cn',
                       mail_ssl=993,
                       mail_username=user_id,
                       mail_password=user_pass)


if __name__ == "__main__":
    user_id = input('Please input user email address:')
    user_pass = getpass.getpass('Please input user password:')
    os.makedirs('mail', exist_ok=True)  # do not crash if the folder already exists
    process_start(user_id, user_pass)
--------------------------------------------------------------------------------
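
The garbled mailbox output noted in the README comes from `str()`-ing the raw IMAP fetch result instead of parsing it. A hedged sketch of decoding one message with the standard `email` module, reusing `get_server` and `the_mail_number` from `mail_login` above:

```python
# Sketch only: parse one fetched message properly instead of str()-ing the
# raw IMAP response.
import email
from email.header import decode_header

raw = get_server.fetch(the_mail_number, '(RFC822)')[1][0][1]  # raw RFC822 bytes
msg = email.message_from_bytes(raw)

# Subject lines are MIME-encoded; decode_header unwraps them.
subject, charset = decode_header(msg['Subject'] or '')[0]
if isinstance(subject, bytes):
    subject = subject.decode(charset or 'utf-8', errors='replace')
print(subject)

# Walk the parts and decode text bodies with their declared charsets.
for part in msg.walk():
    if part.get_content_type() in ('text/plain', 'text/html'):
        payload = part.get_payload(decode=True)  # undo base64/quoted-printable
        text = payload.decode(part.get_content_charset() or 'utf-8', errors='replace')
```
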
/thu_learn.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
__author__ = 'kehao'
import requests
from bs4 import BeautifulSoup, Comment
import re
import os
import getpass
import logging

_DebugLevel = logging.INFO
logging.basicConfig(level=_DebugLevel)

# global vars
_session = requests.session()
_URL_BASE = 'https://learn.tsinghua.edu.cn'
_URL_LOGIN = _URL_BASE + '/MultiLanguage/lesson/teacher/loginteacher.jsp'

# semesters
_URL_CURRENT_SEMESTER = 'http://learn.tsinghua.edu.cn/MultiLanguage/lesson/student/MyCourse.jsp?typepage=1'
_URL_PAST_SEMESTER = 'http://learn.tsinghua.edu.cn/MultiLanguage/lesson/student/MyCourse.jsp?typepage=2'
# personal information
_URL_PERSONAL_INFO = 'http://learn.tsinghua.edu.cn/MultiLanguage/vspace/vspace_userinfo1.jsp'

# URL prefixes for the different course sections
# course announcements
_PREF_MSG = 'http://learn.tsinghua.edu.cn/MultiLanguage/public/bbs/getnoteid_student.jsp?course_id='
# course information
_PREF_INFO = 'http://learn.tsinghua.edu.cn/MultiLanguage/lesson/student/course_info.jsp?course_id='
# course files
_PREF_FILES = 'http://learn.tsinghua.edu.cn/MultiLanguage/lesson/student/download.jsp?course_id='
# teaching resources
_PREF_LIST = 'http://learn.tsinghua.edu.cn/MultiLanguage/lesson/student/ware_list.jsp?course_id='
# course assignments
_PREF_WORK = 'http://learn.tsinghua.edu.cn/MultiLanguage/lesson/student/hom_wk_brw.jsp?course_id='


def login(user_id=None, user_pass=None):
    """
    login to get cookies in _session
    :param user_id: your Tsinghua id, e.g. "keh13"
    :param user_pass: your password
    :return: True if login succeeds
    """
    if user_id is None or user_pass is None:
        user_id = input("TsinghuaId:")
        user_pass = getpass.getpass("Password:")
    data = dict(
        userid=user_id,
        userpass=user_pass,
    )
    r = _session.post(_URL_LOGIN, data)
    # the server answers 200 even on failure, so tell success apart by the response length
    if len(r.content) > 120:
        logging.warning("login failed")
        return False
    else:
        logging.info("login success")
        return True


def make_soup(url):
    """
    _session.GET the page, handle the encoding and return the BeautifulSoup
    :param url: page url
    :return: BeautifulSoup
    """
    r = _session.get(url)
    r.encoding = 'gbk'  # the pages are served as GBK
    soup = BeautifulSoup(r.content, "html.parser")
    return soup


class Semester:
    """
    a Semester holds all the courses in it
    """

    def __init__(self, current=True):
        """
        set the current flag to pick the current/past semester
        :param current: True for the current semester, False for past semesters
        :return: None
        """
        if _session is None:
            raise RuntimeError("Call login(userid, userpass) before anything else")
        if current:
            self.url = _URL_CURRENT_SEMESTER
        else:
            self.url = _URL_PAST_SEMESTER
        self._courses = list(self.courses)

    @property
    def courses(self):
        """
        return all the courses under the semester
        :return: Course generator
        """
        soup = make_soup(self.url)
        for j in soup.find_all('tr', class_=['info_tr', 'info_tr2']):
            i = j.find('a')
            url = i['href']
            if url.startswith('/Mult'):
                url = _URL_BASE + url
            else:
                # !!important!! ignore the new Web Learning courses for now
                continue
            name = i.contents[0]
            name = re.sub(r'[\n\r\t ]', '', name)
            name = re.sub(r'\([^\(\)]+\)$', '', name)
            id = url[-6:]
            yield Course(name=name, url=url, id=id)


class Course:
    """
    this is the Course class
    """

    def __init__(self, id, url=None, name=None):
        self._id = id
        self._url = url
        self._name = name
        self._works = list(self.works)
        self._files = list(self.files)
        self._messages = list(self.messages)
        logging.info(name)

    @property
    def url(self):
        """course url"""
        return self._url

    @property
    def name(self):
        """course name"""
        return self._name

    @property
    def id(self):
        """course id"""
        return self._id

    @property
    def works(self):
        """
        get all the works in the course
        :return: Work generator
        """
        url = _PREF_WORK + self._id
        soup = make_soup(url)
        for i in soup.find_all('tr', class_=['tr1', 'tr2']):
            tds = i.find_all('td')
            url = 'http://learn.tsinghua.edu.cn/MultiLanguage/lesson/student/' + i.find('a')['href']
            id = re.search(r'(\d+)', url).group(0)
            title = i.find('a').contents[0]
            start_time = tds[1].contents[0]
            end_time = tds[2].contents[0]
            submitted = ("已经提交" in tds[3].contents[0])  # "已经提交" means "already submitted"
            yield Work(id=id, title=title, url=url, start_time=start_time, end_time=end_time, submitted=submitted)

    @property
    def messages(self):
        """
        get all messages in the course
        :return: Message generator
        """
        url = _PREF_MSG + self.id
        soup = make_soup(url)
        for m in soup.find_all('tr', class_=['tr1', 'tr2']):
            tds = m.find_all('td')
            title = tds[1].contents[1].text
            url = 'http://learn.tsinghua.edu.cn/MultiLanguage/public/bbs/' + tds[1].contents[1]['href']
            id = re.search(r"id=(\d+)", url).group(1)
            date = tds[3].text
            yield Message(title=title, url=url, date=date, id=id)

    @property
    def files(self):
        """
        get all files in the course
        :return: File generator
        """

        def file_size_M(s):
            """convert a size string like '300K' / '1.2M' / '1G' to MB"""
            digitals = s[:-1]
            if s.endswith('K'):
                return float(digitals) / 1024
            elif s.endswith('M'):
                return float(digitals)
            else:
                # assume 'G'
                return 1024 * float(digitals)

        url = _PREF_FILES + self.id
        soup = make_soup(url)
        for j in soup.find_all('tr', class_=['tr1', 'tr2']):
            # the real file name is hidden inside an HTML comment in the row
            name = re.search(r'getfilelink=([^&]+)&', str(j.find(text=lambda text: isinstance(text, Comment)))).group(1)
            a = j.find('a')
            url = 'http://learn.tsinghua.edu.cn/kejian/data/%s/download/%s' % (self.id, name)
            title = re.sub(r'[\n\r\t ]', '', a.contents[0])
            name = re.sub(r'_[^_]+\.', '.', name)
            size = file_size_M(j.find_all('td')[-3].text)  # unit: MB
            yield File(size=size, name=name, url=url)

    @property
    def info(self):
        url = _PREF_INFO + self.id
        return Info(url)


class Work:
    """
    the homework class
    """

    def __init__(self, url=None, id=None, title=None, start_time=None, end_time=None, submitted=None):
        self._url = url
        self._id = id
        self._title = title
        self._details = self.details
        self._file = self.file
        self._start_time = start_time
        self._end_time = end_time
        self._submitted = submitted
        logging.info(title)

    @property
    def url(self):
        """work url"""
        return self._url

    @property
    def id(self):
        """work id"""
        return self._id

    @property
    def title(self):
        """work title"""
        return self._title

    @property
    def start_time(self):
        """
        start date of the work
        :return: str time 'yyyy-mm-dd'
        """
        return self._start_time

    @property
    def end_time(self):
        """
        end date of the work
        :return: str time 'yyyy-mm-dd'
        """
        return self._end_time

    @property
    def submitted(self):
        """
        whether the work has been handed in
        :return: bool
        """
        return self._submitted

    @property
    def details(self):
        """
        the description of the work
        :return: str details, empty string if none exists
        """
        soup = make_soup(self.url)
        try:
            _details = soup.find_all('td', class_='tr_2')[1].textarea.contents[0]
        except (AttributeError, IndexError):
            _details = ""
        return _details

    @property
    def file(self):
        """
        the file attached to the work
        :return: instance of File, or None if none exists
        """
        soup = make_soup(self.url)
        try:
            fname = soup.find_all('td', class_='tr_2')[2].a.contents[0]
            furl = 'http://learn.tsinghua.edu.cn' + soup.find_all('td', class_='tr_2')[2].a['href']
            _file = File(url=furl, name=fname)
        except AttributeError:
            _file = None
        return _file

    @property
    def answer(self):
        """
        the answer file attached to the work
        :return: instance of File, or None if none exists
        """
        soup = make_soup(self.url)
        try:
            fname = soup.find_all('td', class_='tr_2')[4].a.contents[0]
            furl = 'http://learn.tsinghua.edu.cn' + soup.find_all('td', class_='tr_2')[4].a['href']
            _file = File(url=furl, name=fname)
        except AttributeError:
            _file = None
        return _file
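

# Usage sketch (illustrative comments only, not part of the API): this mirrors
# what crawler_learn.py does with the classes above.
#
#   login()
#   for course in Semester(current=True).courses:
#       for work in course.works:
#           print(work.title, work.end_time, work.submitted)
#           if work.file is not None:
#               work.file.save('works')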


class File:
    def __init__(self, url, name, size=0, note=None):
        self._name = name
        self._url = url
        self._note = note
        self._size = size

    def save(self, path='.'):
        """download the file into path, creating the directory if needed"""
        r = requests.get(self.url, stream=True)
        if not os.path.exists(path):
            os.makedirs(path)
        with open(path + '/' + self.name, 'wb') as handle:
            if not r.ok:
                raise ValueError('failed in saving file', self.name, self.url)
            for block in r.iter_content(1024):
                handle.write(block)

    @property
    def name(self):
        """file name
        Note: this is the name shown on the web page, not the name in the download link
        """
        return self._name

    @property
    def url(self):
        """download url"""
        return self._url

    @property
    def note(self):
        """the description of the file
        this exists under the course-files area but not in the work area
        # consider taking course.details as the note
        """
        return self._note

    @property
    def size(self):
        """file size in MB"""
        return self._size


class Message:
    def __init__(self, url, title, date, id):
        self._id = id
        self._url = url
        self._title = title
        self._date = date
        self._details = self.details
        logging.info(title)

    @property
    def id(self):
        return self._id

    @property
    def url(self):
        return self._url

    @property
    def title(self):
        return self._title

    @property
    def date(self):
        return self._date

    @property
    def details(self):
        soup = make_soup(self.url)
        _details = soup.find_all('td', class_='tr_l2')[1].text.replace('\xa0', ' ')
        _details = re.sub(r'(\xa0)+', ' ', _details)
        _details = re.sub(r'\n+', '\n', _details)
        return _details


class Info:
    class Teacher:
        def __init__(self, name, email, phone, intro):
            self.name = name
            self.email = email
            self.phone = phone
            self.intro = intro

    def __init__(self, url):
        self.soup = make_soup(url)
        tds = self.soup.find_all('td')
        self._classId = tds[4].text.replace(" ", "")  # course number
        self._classSeq = tds[6].text.replace(" ", "")  # course sequence number
        self._className = tds[8].text.replace(" ", "")  # course name
        self._credit = tds[10].text.replace(" ", "")  # credits
        self._learnHour = tds[12].text.replace(" ", "")  # class hours
        self._material = tds[27].text.replace(" ", "")  # assigned textbook
        self._reference = tds[29].text.replace(" ", "")  # reference books
        self._testMethod = tds[31].text.replace(" ", "")  # assessment method
        self._classIntro = tds[33].text.replace(" ", "")  # course introduction
        self._teacher = self.Teacher(
            name=tds[19].text.replace("\xa0", ""),
            email=tds[21].text.replace("\xa0", ""),
            phone=tds[23].text[1:].split(";"),
            intro=re.sub(r'[\r\t ]', '', tds[25].text),
        )


def test():
    pass


def main():
    test()


if __name__ == '__main__':
    main()
--------------------------------------------------------------------------------