├── .gitignore
├── README.md
├── crawler_learn.py
├── crawler.py
├── crawler_mail.py
└── thu_learn.py

/.gitignore:
--------------------------------------------------------------------------------
*.pyc
__pycache__/
learn/
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Memory_fetcher
**Save Tsinghua mailbox and Web Learning data to local storage in one step.**

---
## Usage
### 1. Installation
**Basic requirements:** a Python 3 environment on your machine (installing python3 and pip3 is not covered here). Python 2 might work in theory, but it has not been tested.

Three more things are needed:
#### 1.1 BeautifulSoup4
`pip3 install BeautifulSoup4`
#### 1.2 requests
`pip3 install requests`
#### 1.3 This project
Download the source code to a local folder and unpack it.

### 2. Usage
- The `crawler_learn.py` script downloads Web Learning data; it creates a `learn` folder in the current directory and stores the data per course.
- The `crawler_mail.py` script downloads Tsinghua mailbox data into a `mail` folder created in the current directory.

- Open a terminal in the project directory (preferably on a drive with plenty of free space) and run:
`python3 crawler_learn.py`
or
`python3 crawler_mail.py`

- Enter your username and password at the prompts and the download begins.

---
## Notes:
- While crawler_learn.py runs, the terminal prints a lot of debug output. Don't worry: that is the program auto-detecting the encoding of the Web Learning data.
- Mail downloaded from the Tsinghua mailbox may contain garbled text; this is a known bug that will hopefully be fixed when time allows.
- Special thanks to kehao95 for the [API](https://github.com/kehao95/thu_learn)
- **Happy graduation!**
--------------------------------------------------------------------------------
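
A minimal sketch of driving the `thu_learn` API directly, using the same calls that `crawler_learn.py` below makes; it assumes `login()` succeeds and only lists what it finds instead of downloading:

```python
# Illustrative sketch, not a repository file: list courses and their files
# using login() and Semester from thu_learn.py below.
from thu_learn import login, Semester

if login():  # prompts for a Tsinghua id and password when called without arguments
    for course in Semester(current=True).courses:
        print(course.name, course.id)
        for f in course.files:
            print('  %s  %.2f MB' % (f.name, f.size))
```
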
/crawler_learn.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
from thu_learn import *
import os

sizeThre = 1000  # download size limit, in MB

login()  # log in and load course data

semester = Semester(current=True)  # current=True: current semester; False: past semesters

for course in semester.courses:
    path = 'learn/' + course.name
    if not os.path.exists(path):
        os.makedirs(path)
    materialPath = path + '/materials/'
    if not os.path.exists(materialPath):
        os.makedirs(materialPath)
    workPath = path + '/works/'
    if not os.path.exists(workPath):
        os.makedirs(workPath)

    for file in course.files:
        if file.size < sizeThre:  # size limit in MB; course files rarely exceed 50 MB
            file.save(materialPath)

    for work in course.works:
        if work.answer is not None:
            work.answer.save(workPath)
        if work.file is not None:
            work.file.save(workPath)
--------------------------------------------------------------------------------
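
crawler_learn.py re-downloads everything on every run. A hedged sketch of a resume-friendly variant of its materials loop; it assumes a file is stored as `materialPath` plus `file.name`, which matches what `File.save` in thu_learn.py does:

```python
# Sketch only: skip files already fetched on an earlier run.
# Assumption: file.save(materialPath) writes to materialPath + file.name,
# as thu_learn.File.save does.
import os

for file in course.files:
    if os.path.exists(os.path.join(materialPath, file.name)):
        continue  # already downloaded
    if file.size < sizeThre:  # same size limit as above, in MB
        file.save(materialPath)
```
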
/crawler.py:
--------------------------------------------------------------------------------
#!/usr/bin/python3
# -*- coding: utf-8 -*-
import requests
import bs4
import re
import os

path = input("Enter directory:") or "./"  # fall back to the current directory on empty input
web = "http://www.cs.jhu.edu/~phi/ai/"
nameLenLimit = 100
count = 0
cnt = 0

session = requests.session()
r = session.get(web)
soup = bs4.BeautifulSoup(r.content, "html.parser")
p1 = re.compile(r'/[\w\-\.]*\.pdf')  # find the pdf files
p2 = re.compile(r'week\d*')  # not strictly necessary; only used to build nicer names
p3 = re.compile(r'lecture\d*')  # not strictly necessary; only used to build nicer names

for link in soup.find_all('a'):
    if link.get('href') is not None:
        result = p1.search(link.get('href'))
        if result is not None:  # found a target pdf file
            tmp = result.group()
            name = tmp[1:]
            if p2.match(name) is not None:
                name = name[:p2.match(name).end()] + '_' + link.string + name[p2.match(name).end():]
                print(name)
            elif p3.match(name) is not None:
                name = name[:p3.match(name).end()] + '_' + link.string + name[p3.match(name).end():]
                print(name)
            else:
                name = link.string + '.pdf'
                print(name)

            if len(name) > nameLenLimit:  # fall back to a numeric name when the title is too long
                print(count, name)
                name = str(count) + '.pdf'
                count = count + 1
            url = web + link.get('href')
            print(url)
            r = requests.get(url, stream=True)
            if not r.ok:
                continue  # skip failed downloads instead of writing an empty file
            if not os.path.exists(path):
                os.makedirs(path)
            with open(path + '/' + name, 'wb') as handle:
                cnt = cnt + 1
                print('start downloading No.', cnt)
                for block in r.iter_content(1024):
                    handle.write(block)
--------------------------------------------------------------------------------
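
crawler.py builds download URLs with plain string concatenation (`web + link.get('href')`), which only works while every href on that page is relative. A hedged alternative using `urllib.parse.urljoin` from the standard library resolves relative and absolute links uniformly (the paths below are hypothetical):

```python
# Sketch only: urljoin resolves an href against the page URL, whatever its form.
from urllib.parse import urljoin

web = "http://www.cs.jhu.edu/~phi/ai/"
print(urljoin(web, "notes/lecture1.pdf"))        # relative: http://www.cs.jhu.edu/~phi/ai/notes/lecture1.pdf
print(urljoin(web, "/~phi/other.pdf"))           # root-relative: http://www.cs.jhu.edu/~phi/other.pdf
print(urljoin(web, "http://example.org/x.pdf"))  # absolute URLs pass through unchanged
```
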
/crawler_mail.py:
--------------------------------------------------------------------------------
# coding:utf-8
import imaplib
import os
import getpass


class GetMail(object):
    @classmethod
    def mail_login(cls, mail_type, mail_ssl, mail_username, mail_password):
        """Log in to the mailbox and save the messages found in the inbox."""
        get_server = imaplib.IMAP4_SSL(mail_type, mail_ssl)
        get_server.login(mail_username, mail_password)
        get_server.select("INBOX")  # the default folder is INBOX
        typ, data = get_server.search(None, 'ALL')  # SEEN: read mail, UNSEEN: unread mail, ALL: everything
        # data is a one-element list holding all message numbers
        if data[0]:
            number_list = data[0].split()  # message numbers; higher numbers are more recent
            print(number_list[-1])
            for the_mail_number in number_list:
                # full message content; str() of the raw fetch result is what
                # produces the garbled text mentioned in the README
                mail_data = str(get_server.fetch(the_mail_number, '(RFC822)')[1])
                # skip messages that look like they carry image attachments
                if '.png' in mail_data:
                    continue
                if '.gif' in mail_data:
                    continue
                if '.jpg' in mail_data:
                    continue
                if '._' in mail_data:
                    continue
                if '@.' in mail_data:
                    continue
                print(the_mail_number)
                tmp1 = mail_data.find('Subject:')
                print(mail_data[tmp1:tmp1 + 30])
                file_name = 'mail/' + str(int(the_mail_number)) + '.html'
                with open(file_name, 'w') as f:
                    f.write(mail_data)
        else:
            print("No messages found")


def process_start(user_id, user_pass):
    GetMail.mail_login(mail_type='mails.tsinghua.edu.cn',
                       mail_ssl=993,
                       mail_username=user_id,
                       mail_password=user_pass)


if __name__ == "__main__":
    user_id = input('Please input user email address:')
    user_pass = getpass.getpass('Please input user password:')
    os.makedirs('mail', exist_ok=True)  # do not crash if the folder already exists
    process_start(user_id, user_pass)
--------------------------------------------------------------------------------
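
The garbled mailbox output noted in the README comes from `str()`-ing the raw IMAP fetch result instead of parsing it. A hedged sketch of decoding one message with the standard `email` module, reusing `get_server` and `the_mail_number` from `mail_login` above:

```python
# Sketch only: parse one fetched message properly instead of str()-ing the
# raw IMAP response.
import email
from email.header import decode_header

raw = get_server.fetch(the_mail_number, '(RFC822)')[1][0][1]  # raw RFC822 bytes
msg = email.message_from_bytes(raw)

# Subject lines are MIME-encoded; decode_header unwraps them.
subject, charset = decode_header(msg['Subject'] or '')[0]
if isinstance(subject, bytes):
    subject = subject.decode(charset or 'utf-8', errors='replace')
print(subject)

# Walk the parts and decode text bodies with their declared charsets.
for part in msg.walk():
    if part.get_content_type() in ('text/plain', 'text/html'):
        payload = part.get_payload(decode=True)  # undo base64/quoted-printable
        text = payload.decode(part.get_content_charset() or 'utf-8', errors='replace')
```
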
/thu_learn.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
__author__ = 'kehao'
import requests
from bs4 import BeautifulSoup, Comment
import re
import os
import getpass
import logging

_DebugLevel = logging.INFO
logging.basicConfig(level=_DebugLevel)

# global vars
_session = requests.session()
_URL_BASE = 'https://learn.tsinghua.edu.cn'
_URL_LOGIN = _URL_BASE + '/MultiLanguage/lesson/teacher/loginteacher.jsp'

# semesters
_URL_CURRENT_SEMESTER = 'http://learn.tsinghua.edu.cn/MultiLanguage/lesson/student/MyCourse.jsp?typepage=1'
_URL_PAST_SEMESTER = 'http://learn.tsinghua.edu.cn/MultiLanguage/lesson/student/MyCourse.jsp?typepage=2'
# personal information
_URL_PERSONAL_INFO = 'http://learn.tsinghua.edu.cn/MultiLanguage/vspace/vspace_userinfo1.jsp'

# URL prefixes for the different course sections
# course announcements
_PREF_MSG = 'http://learn.tsinghua.edu.cn/MultiLanguage/public/bbs/getnoteid_student.jsp?course_id='
# course information
_PREF_INFO = 'http://learn.tsinghua.edu.cn/MultiLanguage/lesson/student/course_info.jsp?course_id='
# course files
_PREF_FILES = 'http://learn.tsinghua.edu.cn/MultiLanguage/lesson/student/download.jsp?course_id='
# teaching resources
_PREF_LIST = 'http://learn.tsinghua.edu.cn/MultiLanguage/lesson/student/ware_list.jsp?course_id='
# course assignments
_PREF_WORK = 'http://learn.tsinghua.edu.cn/MultiLanguage/lesson/student/hom_wk_brw.jsp?course_id='


def login(user_id=None, user_pass=None):
    """
    login to get cookies in _session
    :param user_id: your Tsinghua id, e.g. "keh13"
    :param user_pass: your password
    :return: True if login succeeds
    """
    if user_id is None or user_pass is None:
        user_id = input("TsinghuaId:")
        user_pass = getpass.getpass("Password:")
    data = dict(
        userid=user_id,
        userpass=user_pass,
    )
    r = _session.post(_URL_LOGIN, data)
    # the server answers 200 even on failure, so tell success apart by the response length
    if len(r.content) > 120:
        logging.warning("login failed")
        return False
    else:
        logging.info("login success")
        return True


def make_soup(url):
    """
    _session.GET the page, handle the encoding and return the BeautifulSoup
    :param url: page url
    :return: BeautifulSoup
    """
    r = _session.get(url)
    r.encoding = 'gbk'  # the pages are served as GBK
    soup = BeautifulSoup(r.content, "html.parser")
    return soup


class Semester:
    """
    a Semester holds all the courses in it
    """

    def __init__(self, current=True):
        """
        set the current flag to pick the current/past semester
        :param current: True for the current semester, False for past semesters
        :return: None
        """
        if _session is None:
            raise RuntimeError("Call login(userid, userpass) before anything else")
        if current:
            self.url = _URL_CURRENT_SEMESTER
        else:
            self.url = _URL_PAST_SEMESTER
        self._courses = list(self.courses)

    @property
    def courses(self):
        """
        return all the courses under the semester
        :return: Course generator
        """
        soup = make_soup(self.url)
        for j in soup.find_all('tr', class_=['info_tr', 'info_tr2']):
            i = j.find('a')
            url = i['href']
            if url.startswith('/Mult'):
                url = _URL_BASE + url
            else:
                # !!important!! ignore the new Web Learning courses for now
                continue
            name = i.contents[0]
            name = re.sub(r'[\n\r\t ]', '', name)
            name = re.sub(r'\([^\(\)]+\)$', '', name)
            id = url[-6:]
            yield Course(name=name, url=url, id=id)


class Course:
    """
    this is the Course class
    """

    def __init__(self, id, url=None, name=None):
        self._id = id
        self._url = url
        self._name = name
        self._works = list(self.works)
        self._files = list(self.files)
        self._messages = list(self.messages)
        logging.info(name)

    @property
    def url(self):
        """course url"""
        return self._url

    @property
    def name(self):
        """course name"""
        return self._name

    @property
    def id(self):
        """course id"""
        return self._id

    @property
    def works(self):
        """
        get all the works in the course
        :return: Work generator
        """
        url = _PREF_WORK + self._id
        soup = make_soup(url)
        for i in soup.find_all('tr', class_=['tr1', 'tr2']):
            tds = i.find_all('td')
            url = 'http://learn.tsinghua.edu.cn/MultiLanguage/lesson/student/' + i.find('a')['href']
            id = re.search(r'(\d+)', url).group(0)
            title = i.find('a').contents[0]
            start_time = tds[1].contents[0]
            end_time = tds[2].contents[0]
            submitted = ("已经提交" in tds[3].contents[0])  # "已经提交" means "already submitted"
            yield Work(id=id, title=title, url=url, start_time=start_time, end_time=end_time, submitted=submitted)

    @property
    def messages(self):
        """
        get all messages in the course
        :return: Message generator
        """
        url = _PREF_MSG + self.id
        soup = make_soup(url)
        for m in soup.find_all('tr', class_=['tr1', 'tr2']):
            tds = m.find_all('td')
            title = tds[1].contents[1].text
            url = 'http://learn.tsinghua.edu.cn/MultiLanguage/public/bbs/' + tds[1].contents[1]['href']
            id = re.search(r"id=(\d+)", url).group(1)
            date = tds[3].text
            yield Message(title=title, url=url, date=date, id=id)

    @property
    def files(self):
        """
        get all files in the course
        :return: File generator
        """

        def file_size_M(s):
            """convert a size string like '300K' / '1.2M' / '1G' to MB"""
            digitals = s[:-1]
            if s.endswith('K'):
                return float(digitals) / 1024
            elif s.endswith('M'):
                return float(digitals)
            else:
                # assume 'G'
                return 1024 * float(digitals)

        url = _PREF_FILES + self.id
        soup = make_soup(url)
        for j in soup.find_all('tr', class_=['tr1', 'tr2']):
            # the real file name is hidden inside an HTML comment in the row
            name = re.search(r'getfilelink=([^&]+)&', str(j.find(text=lambda text: isinstance(text, Comment)))).group(1)
            a = j.find('a')
            url = 'http://learn.tsinghua.edu.cn/kejian/data/%s/download/%s' % (self.id, name)
            title = re.sub(r'[\n\r\t ]', '', a.contents[0])
            name = re.sub(r'_[^_]+\.', '.', name)
            size = file_size_M(j.find_all('td')[-3].text)  # unit: MB
            yield File(size=size, name=name, url=url)

    @property
    def info(self):
        url = _PREF_INFO + self.id
        return Info(url)


class Work:
    """
    the homework class
    """

    def __init__(self, url=None, id=None, title=None, start_time=None, end_time=None, submitted=None):
        self._url = url
        self._id = id
        self._title = title
        self._details = self.details
        self._file = self.file
        self._start_time = start_time
        self._end_time = end_time
        self._submitted = submitted
        logging.info(title)

    @property
    def url(self):
        """work url"""
        return self._url

    @property
    def id(self):
        """work id"""
        return self._id

    @property
    def title(self):
        """work title"""
        return self._title

    @property
    def start_time(self):
        """
        start date of the work
        :return: str time 'yyyy-mm-dd'
        """
        return self._start_time

    @property
    def end_time(self):
        """
        end date of the work
        :return: str time 'yyyy-mm-dd'
        """
        return self._end_time

    @property
    def submitted(self):
        """
        whether the work has been handed in
        :return: bool
        """
        return self._submitted

    @property
    def details(self):
        """
        the description of the work
        :return: str details, empty string if none exists
        """
        soup = make_soup(self.url)
        try:
            _details = soup.find_all('td', class_='tr_2')[1].textarea.contents[0]
        except (AttributeError, IndexError):
            _details = ""
        return _details

    @property
    def file(self):
        """
        the file attached to the work
        :return: instance of File, or None if none exists
        """
        soup = make_soup(self.url)
        try:
            fname = soup.find_all('td', class_='tr_2')[2].a.contents[0]
            furl = 'http://learn.tsinghua.edu.cn' + soup.find_all('td', class_='tr_2')[2].a['href']
            _file = File(url=furl, name=fname)
        except AttributeError:
            _file = None
        return _file

    @property
    def answer(self):
        """
        the answer file attached to the work
        :return: instance of File, or None if none exists
        """
        soup = make_soup(self.url)
        try:
            fname = soup.find_all('td', class_='tr_2')[4].a.contents[0]
            furl = 'http://learn.tsinghua.edu.cn' + soup.find_all('td', class_='tr_2')[4].a['href']
            _file = File(url=furl, name=fname)
        except AttributeError:
            _file = None
        return _file
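

# Usage sketch (illustrative comments only, not part of the API): this mirrors
# what crawler_learn.py does with the classes above.
#
#   login()
#   for course in Semester(current=True).courses:
#       for work in course.works:
#           print(work.title, work.end_time, work.submitted)
#           if work.file is not None:
#               work.file.save('works')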


class File:
    def __init__(self, url, name, size=0, note=None):
        self._name = name
        self._url = url
        self._note = note
        self._size = size

    def save(self, path='.'):
        """download the file into path, creating the directory if needed"""
        r = requests.get(self.url, stream=True)
        if not os.path.exists(path):
            os.makedirs(path)
        with open(path + '/' + self.name, 'wb') as handle:
            if not r.ok:
                raise ValueError('failed in saving file', self.name, self.url)
            for block in r.iter_content(1024):
                handle.write(block)

    @property
    def name(self):
        """file name
        Note: this is the name shown on the web page, not the name in the download link
        """
        return self._name

    @property
    def url(self):
        """download url"""
        return self._url

    @property
    def note(self):
        """the description of the file
        this exists under the course-files area but not in the work area
        # consider taking course.details as the note
        """
        return self._note

    @property
    def size(self):
        """file size in MB"""
        return self._size


class Message:
    def __init__(self, url, title, date, id):
        self._id = id
        self._url = url
        self._title = title
        self._date = date
        self._details = self.details
        logging.info(title)

    @property
    def id(self):
        return self._id

    @property
    def url(self):
        return self._url

    @property
    def title(self):
        return self._title

    @property
    def date(self):
        return self._date

    @property
    def details(self):
        soup = make_soup(self.url)
        _details = soup.find_all('td', class_='tr_l2')[1].text.replace('\xa0', ' ')
        _details = re.sub(r'(\xa0)+', ' ', _details)
        _details = re.sub(r'\n+', '\n', _details)
        return _details


class Info:
    class Teacher:
        def __init__(self, name, email, phone, intro):
            self.name = name
            self.email = email
            self.phone = phone
            self.intro = intro

    def __init__(self, url):
        self.soup = make_soup(url)
        tds = self.soup.find_all('td')
        self._classId = tds[4].text.replace(" ", "")  # course number
        self._classSeq = tds[6].text.replace(" ", "")  # course sequence number
        self._className = tds[8].text.replace(" ", "")  # course name
        self._credit = tds[10].text.replace(" ", "")  # credits
        self._learnHour = tds[12].text.replace(" ", "")  # class hours
        self._material = tds[27].text.replace(" ", "")  # assigned textbook
        self._reference = tds[29].text.replace(" ", "")  # reference books
        self._testMethod = tds[31].text.replace(" ", "")  # assessment method
        self._classIntro = tds[33].text.replace(" ", "")  # course introduction
        self._teacher = self.Teacher(
            name=tds[19].text.replace("\xa0", ""),
            email=tds[21].text.replace("\xa0", ""),
            phone=tds[23].text[1:].split(";"),
            intro=re.sub(r'[\r\t ]', '', tds[25].text),
        )


def test():
    pass


def main():
    test()


if __name__ == '__main__':
    main()
--------------------------------------------------------------------------------