├── PHP ├── FileDownloader.php ├── HtmlDownloader.php ├── HtmlParser.php ├── Spider.php ├── composer.json └── index.php ├── README.md └── python ├── conf.py ├── conf.pyc ├── entity ├── __init__.py ├── __init__.pyc ├── fileinfor.py └── fileinfor.pyc ├── filedeal ├── __init__.py ├── __init__.pyc ├── file_downloader.py └── file_downloader.pyc ├── index.py ├── spider ├── __init__.py ├── __init__.pyc ├── html_downloader.py ├── html_downloader.pyc ├── html_parser.py ├── html_parser.pyc ├── spiderman.py └── spiderman.pyc └── test ├── __init__.py ├── test_bs4.py ├── test_httplib.py ├── test_json.py ├── test_progress.py └── test_urllib.py /PHP/FileDownloader.php: -------------------------------------------------------------------------------- 1 | array( 15 | 'header' => 16 | "Host: www.imooc.com\r\n" . 17 | "Referer: http://m.120ask.com/health/show?page=2&id=84882&type=17\r\n" . 18 | "User-Agent: Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.93 Safari/537.36\r\n" 19 | ), 20 | ); 21 | $context = stream_context_create($header); 22 | $data = file_get_contents($url, 0, $context); 23 | return $data; 24 | } 25 | 26 | } -------------------------------------------------------------------------------- /PHP/HtmlParser.php: -------------------------------------------------------------------------------- 1 | filter('#main > div.course-infos > div.w.pr > div.hd.clearfix > h2')->text(); 24 | 25 | $course = $crawler->filterXPath('//*[@id="main"]/div[2]/div[1]/div[1]/div[3]/div/ul/li/a')->each( 26 | function (Crawler $node, $i) { 27 | $url = explode('/', $node->filter('a')->attr('href')); 28 | $id = $url[2]; 29 | $downloadData = json_decode(file_get_contents(str_replace('{}', $id, self::DOWNLOAD_URL)), true); 30 | return [ 31 | 'id' => $url['2'], 32 | 'title' => str_replace(['\r','\n','开始学习',' ',], ['','','',''], trim($node->text())), 33 | //'title' => str_replace('开始学习', '', trim($node->text())), 34 | 'url' => $downloadData['data']['result']['mpath'][2] 35 | ]; 36 | }); 37 | $result = [ 38 | 'subject' => $subject, 39 | 'data' => $course 40 | ]; 41 | return $result; 42 | } 43 | } 44 | -------------------------------------------------------------------------------- /PHP/Spider.php: -------------------------------------------------------------------------------- 1 | '普清', 'M' => '高清', 'H' => '超清'];//视频品质描述 19 | const PERSUM = 0.0;//用于描述总进度 20 | const PERLIST = [];//记录每个线程的进度 21 | 22 | private $htmlData; 23 | 24 | public function __construct($id) 25 | { 26 | $this->downloader = new HtmlDownloader(); 27 | $this->parser = new HtmlParser(); 28 | $this->id = $id; 29 | } 30 | 31 | public function run() 32 | { 33 | echo "#####################################################################\n"; 34 | echo "#慕课网视频抓取器\n"; 35 | echo "author:igo9go\n"; 36 | echo "github:https://github.com/igo9go/\n"; 37 | echo "#到慕课网官网打开想要下载的课程的章节列表页面,查看当前url链接\n"; 38 | echo "#例如http://www.imooc.com/learn/615,则课程编号为615\n"; 39 | echo "#####################################################################\n"; 40 | $url = self::COURSEURL . $this->id; 41 | 42 | 43 | echo "将要下载的课程连接为" . $url . "\n"; 44 | echo "开始解析视频,请稍后\n"; 45 | $this->crawl($url); 46 | echo "共有" . count($this->htmlData['data']) . "条视频\n"; 47 | echo "课程名称:" . $this->htmlData['subject'] . PHP_EOL; 48 | echo "开始下载,请等待"; 49 | 50 | $fildir = './' . $this->htmlData['subject']; 51 | if (!is_dir($fildir)) { 52 | mkdir($fildir); 53 | } 54 | $i = 1; 55 | foreach ($this->htmlData['data'] as $item) { 56 | $file = file_get_contents($item['url']); 57 | $fileName = $item['title'] . '.mp4'; 58 | file_put_contents($fildir . '/' . $fileName, $file); 59 | echo '第' . $i . '个视频已完成下载' . PHP_EOL; 60 | $i++; 61 | } 62 | return '下载完成'; 63 | } 64 | 65 | public function crawl($url) 66 | { 67 | $htmlContent = $this->downloader->download($url); 68 | $this->htmlData = $this->parser->parse($htmlContent); 69 | } 70 | } -------------------------------------------------------------------------------- /PHP/composer.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "immoc", 3 | "type": "library", 4 | "license": "mit", 5 | "authors": [ 6 | { 7 | "name": "igo9go", 8 | "email": "1@qq.com" 9 | } 10 | ], 11 | "require": { 12 | "symfony/dom-crawler": "2.8.*|3.0.*|3.1.*", 13 | "symfony/css-selector": "2.8.*|3.0.*|3.1.*" 14 | }, 15 | "minimum-stability": "dev" 16 | } -------------------------------------------------------------------------------- /PHP/index.php: -------------------------------------------------------------------------------- 1 | run(); -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # downloaa_imooc 2 | 下载慕课网视频 3 | 4 | 运行步骤: 5 | 6 | 1. git clone https://github.com/igo9go/downloaa_imooc 7 | 8 | 2. composer install 9 | 10 | 3. php index.php 744(744为课程ID) 11 | 12 | 13 |  14 | 15 |  -------------------------------------------------------------------------------- /python/conf.py: -------------------------------------------------------------------------------- 1 | #coding:utf-8 2 | #全局变量 3 | import threading 4 | 5 | DOWNLOAD_URL = 'http://www.imooc.com/course/ajaxmediainfo/?mid={}&mode=flash'#下载链接 6 | COURSEURL = "http://www.imooc.com/learn/"#课程链接 7 | #COURSEURL = "http://coding.imooc.com/learn/list/74.html" 8 | 9 | CHOOSE=['H','M','L']#视频品质 10 | 11 | STATE='L'#视频默认品质 12 | 13 | LOCK = threading.Lock()#线程锁 14 | 15 | INFOR = {'L':u'普清','M':u'高清','H':u'超清'}#视频品质描述 16 | 17 | PERSUM=0.0#用于描述总进度 18 | 19 | PERLIST=[]#记录每个线程的进度 -------------------------------------------------------------------------------- /python/conf.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/igo9go/downloaa_imooc/71e0ca2f1a6bf761158b877d434b49fed83bbe1a/python/conf.pyc -------------------------------------------------------------------------------- /python/entity/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'Xaxdus' 2 | -------------------------------------------------------------------------------- /python/entity/__init__.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/igo9go/downloaa_imooc/71e0ca2f1a6bf761158b877d434b49fed83bbe1a/python/entity/__init__.pyc -------------------------------------------------------------------------------- /python/entity/fileinfor.py: -------------------------------------------------------------------------------- 1 | #coding:utf-8 2 | ''' 3 | 这个类用于描述文件信息的模型,将文件的所有信息封装成一个类,便于管理 4 | ''' 5 | class FileInfor(object): 6 | def __init__(self): 7 | self.__subject=''#教程名称 8 | self.__filename=''#课程名称(也是下载的每个文件名称) 9 | self.__mid= ''#课程的ID号 10 | self.__url={}#下载链接(分高中低,H M L) 11 | @property 12 | def subject(self): 13 | return self.__subject 14 | 15 | @subject.setter 16 | def subject(self,value): 17 | self.__subject = value 18 | 19 | @property 20 | def filename(self): 21 | return self.__filename 22 | 23 | @filename.setter 24 | def filename(self,value): 25 | self.__filename = value 26 | 27 | @property 28 | def mid(self): 29 | return self.__mid 30 | 31 | @mid.setter 32 | def mid(self,value): 33 | self.__mid = value 34 | 35 | @property 36 | def url(self): 37 | return self.__url 38 | 39 | @url.setter 40 | def url(self,value): 41 | self.__url = value -------------------------------------------------------------------------------- /python/entity/fileinfor.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/igo9go/downloaa_imooc/71e0ca2f1a6bf761158b877d434b49fed83bbe1a/python/entity/fileinfor.pyc -------------------------------------------------------------------------------- /python/filedeal/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'Xaxdus' 2 | -------------------------------------------------------------------------------- /python/filedeal/__init__.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/igo9go/downloaa_imooc/71e0ca2f1a6bf761158b877d434b49fed83bbe1a/python/filedeal/__init__.pyc -------------------------------------------------------------------------------- /python/filedeal/file_downloader.py: -------------------------------------------------------------------------------- 1 | #coding:utf-8 2 | import os 3 | import threading 4 | import urllib 5 | import sys 6 | import conf 7 | 8 | 9 | class File_Downloader(threading.Thread): 10 | ''' 11 | 这个类主要是用来下载视频文件的,继承了线程类 12 | ''' 13 | def __init__(self,fileInfo,id): 14 | threading.Thread.__init__(self) 15 | #先创建顶层文件夹 16 | self.__fileInfor = fileInfo 17 | self.__id = id 18 | self.createdir() 19 | 20 | 21 | def run(self): 22 | fileurl=self.__fileInfor.url[conf.STATE] 23 | filepath = self.filedir+os.sep+self.__fileInfor.filename+'.mp4' 24 | urllib.urlretrieve(fileurl,filepath, self.Schedule)#下载文件 25 | 26 | 27 | #创建顶层文件夹 28 | def createdir(self): 29 | self.filedir = self.__fileInfor.subject+"("+conf.INFOR[conf.STATE]+")" 30 | if os.path.exists(self.filedir) == False: 31 | os.mkdir(self.filedir) 32 | #下载任务 33 | def Schedule(self,blocknum,blocksize,totalsize): 34 | ''''' 35 | blocknum:已经下载的数据块 36 | blocksize:数据块的大小 37 | totalsize:远程文件的大小 38 | ''' 39 | per = 100.0 * blocknum * blocksize / totalsize 40 | if per > 100 : 41 | per = 100 42 | conf.LOCK.acquire() 43 | conf.PERLIST[self.__id]= per#记录每个线程的下载百分比,用于计算整个的进度状况 44 | nowsum = 0;#当前的进度 45 | for item in conf.PERLIST: 46 | nowsum+=item 47 | str = u'当前下载进度:---------------->>>>>>>> %.2f%%' % (100*nowsum/conf.PERSUM) 48 | sys.stdout.write(str+"\r") 49 | sys.stdout.flush() 50 | conf.LOCK.release() 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | -------------------------------------------------------------------------------- /python/filedeal/file_downloader.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/igo9go/downloaa_imooc/71e0ca2f1a6bf761158b877d434b49fed83bbe1a/python/filedeal/file_downloader.pyc -------------------------------------------------------------------------------- /python/index.py: -------------------------------------------------------------------------------- 1 | #!D:\Python27\python 2 | # -*- coding: utf-8 -*- 3 | 4 | from spider.spiderman import SpiderMan 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | if __name__=="__main__": 14 | spider = SpiderMan() 15 | spider.cmdshow_gbk() 16 | #spider.crawl("http://www.imooc.com/learn/110") -------------------------------------------------------------------------------- /python/spider/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'Xaxdus' 2 | -------------------------------------------------------------------------------- /python/spider/__init__.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/igo9go/downloaa_imooc/71e0ca2f1a6bf761158b877d434b49fed83bbe1a/python/spider/__init__.pyc -------------------------------------------------------------------------------- /python/spider/html_downloader.py: -------------------------------------------------------------------------------- 1 | #coding:utf-8 2 | import cookielib 3 | import urllib2 4 | class Html_Downloader(object): 5 | ''' 6 | 这个类主要是下载html使用的是urllib2模块 7 | ''' 8 | def download(self, url): 9 | if url is None: 10 | return None 11 | request = urllib2.Request(url) 12 | #下面的两个header是为了模拟手机浏览器,因为慕课网app可以不用注册就可以访问视频,所以把咱们的程序模拟成手机浏览器,就可以直接下载了 13 | request.add_header('user-agent','Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.93 Safari/537.36') 14 | request.add_header('host','www.imooc.com') 15 | response= urllib2.urlopen(request) 16 | if response.getcode()!=200: 17 | return None 18 | return response.read() -------------------------------------------------------------------------------- /python/spider/html_downloader.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/igo9go/downloaa_imooc/71e0ca2f1a6bf761158b877d434b49fed83bbe1a/python/spider/html_downloader.pyc -------------------------------------------------------------------------------- /python/spider/html_parser.py: -------------------------------------------------------------------------------- 1 | #coding:utf-8 2 | import re 3 | import urlparse 4 | from bs4 import BeautifulSoup 5 | from conf import DOWNLOAD_URL 6 | from entity.fileinfor import FileInfor 7 | from spider.html_downloader import Html_Downloader 8 | 9 | 10 | class Html_Parser(object): 11 | ''' 12 | html解析器:从中提取出视频信息 13 | ''' 14 | def __init__(self): 15 | self.res_data=[]#用来存放视频信息 16 | 17 | 18 | def parser(self, html_cont): 19 | ''' 20 | 21 | :param html_cont: html内容 22 | :return: 23 | ''' 24 | if html_cont is None: 25 | return 26 | # 使用BeautifulSoup模块对html进行解析 27 | soup = BeautifulSoup(html_cont,'html.parser',from_encoding='utf-8')#str ='
The Dormouse's story
29 | # 30 | #Once upon a time there were three little sisters; and their names were 31 | # Elsie, 32 | # Lacie and 33 | # Tillie; 34 | # and they lived at the bottom of a well.
35 | # 36 | #...
37 | # """ 38 | # soup = BeautifulSoup(html_doc) 39 | # links = soup.find_all('a') 40 | # 41 | # for link in links: 42 | # print link.name,link['href'],link.get_text() 43 | # 44 | # p_node = soup.find('p',class_ = "title") 45 | # print p_node.name,p_node.get_text() 46 | 47 | # str = '