├── PHP
    ├── FileDownloader.php
    ├── HtmlDownloader.php
    ├── HtmlParser.php
    ├── Spider.php
    ├── composer.json
    └── index.php
├── README.md
└── python
    ├── conf.py
    ├── conf.pyc
    ├── entity
        ├── __init__.py
        ├── __init__.pyc
        ├── fileinfor.py
        └── fileinfor.pyc
    ├── filedeal
        ├── __init__.py
        ├── __init__.pyc
        ├── file_downloader.py
        └── file_downloader.pyc
    ├── index.py
    ├── spider
        ├── __init__.py
        ├── __init__.pyc
        ├── html_downloader.py
        ├── html_downloader.pyc
        ├── html_parser.py
        ├── html_parser.pyc
        ├── spiderman.py
        └── spiderman.pyc
    └── test
        ├── __init__.py
        ├── test_bs4.py
        ├── test_httplib.py
        ├── test_json.py
        ├── test_progress.py
        └── test_urllib.py


/PHP/FileDownloader.php:
--------------------------------------------------------------------------------
 1 | <?php
 2 | 
 3 | /**
 4 |  * Created by PhpStorm.
 5 |  * User: zhouqiang
 6 |  * Date: 2017/2/7
 7 |  * Time: 下午2:09
 8 |  */
/**
 * Empty class: it currently declares no constants, properties or methods.
 *
 * NOTE(review): presumably reserved for the video file download logic that
 * Spider::run() performs inline today — confirm before building on it.
 */
class FileDownloader
{

}


--------------------------------------------------------------------------------
/PHP/HtmlDownloader.php:
--------------------------------------------------------------------------------
 1 | <?php
 2 | 
 3 | /**
 4 |  * Created by PhpStorm.
 5 |  * User: zhouqiang
 6 |  * Date: 2017/2/7
 7 |  * Time: 下午2:10
 8 |  */
 9 | class HtmlDownloader
10 | {
11 |     public static function download($url = '')
12 |     {
13 |         $header = array(
14 |             'http' =>array(
15 |                 'header' =>
16 |                     "Host: www.imooc.com\r\n" .
17 |                     "Referer: http://m.120ask.com/health/show?page=2&id=84882&type=17\r\n" .
18 |                     "User-Agent: Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.93 Safari/537.36\r\n"
19 |             ),
20 |         );
21 |         $context = stream_context_create($header);
22 |         $data = file_get_contents($url, 0, $context);
23 |         return $data;
24 |     }
25 | 
26 | }


--------------------------------------------------------------------------------
/PHP/HtmlParser.php:
--------------------------------------------------------------------------------
 1 | <?php
 2 | 
 3 | /**
 4 |  * Created by PhpStorm.
 5 |  * User: zhouqiang
 6 |  * Date: 2017/2/7
 7 |  * Time: 下午2:10
 8 |  */
 9 | use Symfony\Component\DomCrawler\Crawler;
10 | 
11 | 
/**
 * Extracts the course title and the per-lesson video links from the HTML of
 * an imooc course page.
 */
class HtmlParser
{
    // Media-info endpoint; "{}" is replaced with the lesson id.
    const DOWNLOAD_URL = 'http://www.imooc.com/course/ajaxmediainfo/?mid={}&mode=flash';

    public $res = [];

    /**
     * Parse a course page.
     *
     * @param string $htmlContent Raw HTML of the course page.
     *
     * @return array|string '' when $htmlContent is empty; otherwise
     *                      ['subject' => string, 'data' => array] where every
     *                      entry of 'data' has the keys 'id', 'title' and 'url'.
     *                      'url' is null when the media info could not be read.
     */
    public function parse($htmlContent = '')
    {
        if (!$htmlContent) {
            return '';
        }
        $crawler = new Crawler($htmlContent);

        $subject = $crawler->filter('#main > div.course-infos > div.w.pr > div.hd.clearfix > h2')->text();

        $course = $crawler->filterXPath('//*[@id="main"]/div[2]/div[1]/div[1]/div[3]/div/ul/li/a')->each(
            function (Crawler $node, $i) {
                // The lesson id is the third "/"-separated segment of the href.
                $segments = explode('/', $node->filter('a')->attr('href'));
                $id = isset($segments[2]) ? $segments[2] : null;

                return [
                    'id' => $id,
                    'title' => self::cleanTitle($node->text()),
                    'url' => self::fetchVideoUrl($id),
                ];
            });

        return [
            'subject' => $subject,
            'data' => $course,
        ];
    }

    /**
     * Strip line breaks, spaces and the "开始学习" button caption from a link text.
     *
     * Double quotes are required for "\r" and "\n": in single quotes they are
     * literal backslash sequences and never match a real line break.
     *
     * @param string $text
     *
     * @return string
     */
    private static function cleanTitle($text)
    {
        return str_replace(["\r", "\n", '开始学习', ' '], '', trim($text));
    }

    /**
     * Ask the media-info endpoint for the third entry of "mpath".
     *
     * @param string|null $id Lesson id.
     *
     * @return string|null The URL, or null when the request or JSON decoding
     *                     fails or the entry is missing.
     */
    private static function fetchVideoUrl($id)
    {
        $response = file_get_contents(str_replace('{}', $id, self::DOWNLOAD_URL));
        if ($response === false) {
            return null;
        }

        $payload = json_decode($response, true);

        return isset($payload['data']['result']['mpath'][2])
            ? $payload['data']['result']['mpath'][2]
            : null;
    }
}
44 | 


--------------------------------------------------------------------------------
/PHP/Spider.php:
--------------------------------------------------------------------------------
 1 | <?php
 2 | 
 3 | /**
 4 |  * Created by PhpStorm.
 5 |  * User: zhouqiang
 6 |  * Date: 2017/2/7
 7 |  * Time: 下午2:11
 8 |  */
 9 | require 'HtmlDownloader.php';
10 | require 'HtmlParser.php';
11 | 
/**
 * Command-line workflow: downloads a course page, parses it and saves every
 * lesson video into a folder named after the course.
 */
class Spider
{
    const    COURSEURL = "http://www.imooc.com/learn/"; // course page base URL
    const    CHOOSE = ['H', 'M', 'L']; // video quality codes
    const    STATE = 'H'; // default video quality
    const    INFOR = ['L' => '普清', 'M' => '高清', 'H' => '超清']; // quality labels
    const    PERSUM = 0.0; // overall progress
    const   PERLIST = []; // per-thread progress

    private $htmlData;

    // Declared explicitly (and kept public, as they were when created
    // dynamically) because dynamic properties are deprecated since PHP 8.2.
    public $downloader;
    public $parser;
    public $id;

    /**
     * @param string $id Course id, appended to COURSEURL.
     */
    public function __construct($id)
    {
        $this->downloader = new HtmlDownloader();
        $this->parser = new HtmlParser();
        $this->id = $id;
    }

    /**
     * Print the banner, parse the course page and download every video.
     *
     * @return string '下载完成' once all lessons were attempted, '下载失败' when
     *                the page could not be parsed or the folder not created.
     */
    public function run()
    {
        echo "#####################################################################\n";
        echo "#慕课网视频抓取器\n";
        echo "author:igo9go\n";
        echo "github:https://github.com/igo9go/\n";
        echo "#到慕课网官网打开想要下载的课程的章节列表页面，查看当前url链接\n";
        echo "#例如http://www.imooc.com/learn/615，则课程编号为615\n";
        echo "#####################################################################\n";
        $url = self::COURSEURL . $this->id;

        echo "将要下载的课程连接为" . $url . "\n";
        echo "开始解析视频,请稍后\n";
        $this->crawl($url);

        // The parser returns '' when the page could not be downloaded.
        if (!is_array($this->htmlData) || !isset($this->htmlData['subject'], $this->htmlData['data'])) {
            echo "解析失败,请检查课程ID" . PHP_EOL;
            return '下载失败';
        }

        echo "共有" . count($this->htmlData['data']) . "条视频\n";
        echo "课程名称:" . $this->htmlData['subject'] . PHP_EOL;
        echo "开始下载,请等待" . PHP_EOL;

        $fildir = './' . self::safeName($this->htmlData['subject']);
        if (!is_dir($fildir) && !mkdir($fildir) && !is_dir($fildir)) {
            echo "无法创建目录:" . $fildir . PHP_EOL;
            return '下载失败';
        }

        $i = 1;
        foreach ($this->htmlData['data'] as $item) {
            $target = $fildir . '/' . self::safeName($item['title']) . '.mp4';
            // copy() streams the response to disk instead of holding the whole
            // video in memory, and its result tells us whether it worked.
            if (!empty($item['url']) && copy($item['url'], $target)) {
                echo '第' . $i . '个视频已完成下载' . PHP_EOL;
            } else {
                echo '第' . $i . '个视频下载失败' . PHP_EOL;
            }
            $i++;
        }
        return '下载完成';
    }

    /**
     * Download and parse the course page, storing the result in $htmlData.
     *
     * @param string $url Course page URL.
     */
    public function crawl($url)
    {
        $htmlContent = $this->downloader->download($url);
        $this->htmlData = $this->parser->parse($htmlContent);
    }

    /**
     * Replace characters that are path separators or reserved in file names
     * on common file systems with "_".
     *
     * @param string $name
     *
     * @return string
     */
    private static function safeName($name)
    {
        return str_replace(['/', '\\', ':', '*', '?', '"', '<', '>', '|'], '_', (string) $name);
    }
}


--------------------------------------------------------------------------------
/PHP/composer.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "name": "immoc",
 3 |   "type": "library",
 4 |   "license": "mit",
 5 |   "authors": [
 6 |     {
 7 |       "name": "igo9go",
 8 |       "email": "1@qq.com"
 9 |     }
10 |   ],
11 |   "require": {
12 |     "symfony/dom-crawler": "2.8.*|3.0.*|3.1.*",
13 |     "symfony/css-selector": "2.8.*|3.0.*|3.1.*"
14 |   },
15 |   "minimum-stability": "dev"
16 | }


--------------------------------------------------------------------------------
/PHP/index.php:
--------------------------------------------------------------------------------
 1 | <?php
 2 | 
 3 | require_once __DIR__ . '/vendor/autoload.php';
 4 | require 'Spider.php';
 5 | 
 6 | if (!isset($argv[1])) {
 7 |     echo '请输入需要下载的课程ID'.PHP_EOL;
 8 |     echo '如果下载的课程链接为http://www.imooc.com/learn/744'.PHP_EOL;
 9 |     echo '执行 php index.php 744';
10 |     exit;
11 | }
12 | $id = $argv[1];
13 | 
14 | $spider = new Spider($id);
15 | 
16 | $spider->run();


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # downloaa_imooc
 2 | 下载慕课网视频
 3 | 
 4 | 运行步骤:
 5 | 
 6 | 1. git clone https://github.com/igo9go/downloaa_imooc
 7 | 
 8 | 2. cd downloaa_imooc/PHP && composer install
 9 | 
10 | 3. php index.php 744（744 为课程 ID）
11 | 
12 | 
13 | ![](http://oc9orpe44.bkt.clouddn.com/17-2-7/12718178-file_1486460082478_d782.png)
14 | 
15 | ![](http://oc9orpe44.bkt.clouddn.com/17-2-7/57911326-file_1486460108434_9092.png)


--------------------------------------------------------------------------------
/python/conf.py:
--------------------------------------------------------------------------------
 1 | #coding:utf-8
 2 | #全局变量
 3 | import threading
 4 | 
 5 | DOWNLOAD_URL = 'http://www.imooc.com/course/ajaxmediainfo/?mid={}&mode=flash'#下载链接
 6 | COURSEURL = "http://www.imooc.com/learn/"#课程链接
 7 | #COURSEURL = "http://coding.imooc.com/learn/list/74.html"
 8 | 
 9 | CHOOSE=['H','M','L']#视频品质
10 | 
11 | STATE='L'#视频默认品质
12 | 
13 | LOCK = threading.Lock()#线程锁
14 | 
15 | INFOR = {'L':u'普清','M':u'高清','H':u'超清'}#视频品质描述
16 | 
17 | PERSUM=0.0#用于描述总进度
18 | 
19 | PERLIST=[]#记录每个线程的进度


--------------------------------------------------------------------------------
/python/conf.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/igo9go/downloaa_imooc/71e0ca2f1a6bf761158b877d434b49fed83bbe1a/python/conf.pyc


--------------------------------------------------------------------------------
/python/entity/__init__.py:
--------------------------------------------------------------------------------
1 | __author__ = 'Xaxdus'
2 | 


--------------------------------------------------------------------------------
/python/entity/__init__.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/igo9go/downloaa_imooc/71e0ca2f1a6bf761158b877d434b49fed83bbe1a/python/entity/__init__.pyc


--------------------------------------------------------------------------------
/python/entity/fileinfor.py:
--------------------------------------------------------------------------------
 1 | #coding:utf-8
 2 | '''
 3 | 这个类用于描述文件信息的模型，将文件的所有信息封装成一个类，便于管理
 4 | '''
 5 | class FileInfor(object):
 6 |     def __init__(self):
 7 |         self.__subject=''#教程名称
 8 |         self.__filename=''#课程名称(也是下载的每个文件名称)
 9 |         self.__mid= ''#课程的ID号
10 |         self.__url={}#下载链接(分高中低,H M L)
11 |     @property
12 |     def subject(self):
13 |         return self.__subject
14 | 
15 |     @subject.setter
16 |     def subject(self,value):
17 |         self.__subject = value
18 | 
19 |     @property
20 |     def filename(self):
21 |         return self.__filename
22 | 
23 |     @filename.setter
24 |     def filename(self,value):
25 |         self.__filename = value
26 | 
27 |     @property
28 |     def mid(self):
29 |         return self.__mid
30 | 
31 |     @mid.setter
32 |     def mid(self,value):
33 |         self.__mid = value
34 | 
35 |     @property
36 |     def url(self):
37 |         return self.__url
38 | 
39 |     @url.setter
40 |     def url(self,value):
41 |         self.__url = value


--------------------------------------------------------------------------------
/python/entity/fileinfor.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/igo9go/downloaa_imooc/71e0ca2f1a6bf761158b877d434b49fed83bbe1a/python/entity/fileinfor.pyc


--------------------------------------------------------------------------------
/python/filedeal/__init__.py:
--------------------------------------------------------------------------------
1 | __author__ = 'Xaxdus'
2 | 


--------------------------------------------------------------------------------
/python/filedeal/__init__.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/igo9go/downloaa_imooc/71e0ca2f1a6bf761158b877d434b49fed83bbe1a/python/filedeal/__init__.pyc


--------------------------------------------------------------------------------
/python/filedeal/file_downloader.py:
--------------------------------------------------------------------------------
 1 | #coding:utf-8
 2 | import os
 3 | import threading
 4 | import urllib
 5 | import sys
 6 | import conf
 7 | 
 8 | 
 9 | class File_Downloader(threading.Thread):
10 |     '''
11 |         这个类主要是用来下载视频文件的，继承了线程类
12 |     '''
13 |     def __init__(self,fileInfo,id):
14 |         threading.Thread.__init__(self)
15 |          #先创建顶层文件夹
16 |         self.__fileInfor = fileInfo
17 |         self.__id = id
18 |         self.createdir()
19 | 
20 | 
21 |     def run(self):
22 |         fileurl=self.__fileInfor.url[conf.STATE]
23 |         filepath = self.filedir+os.sep+self.__fileInfor.filename+'.mp4'
24 |         urllib.urlretrieve(fileurl,filepath, self.Schedule)#下载文件
25 | 
26 | 
27 |     #创建顶层文件夹
28 |     def createdir(self):
29 |         self.filedir = self.__fileInfor.subject+"("+conf.INFOR[conf.STATE]+")"
30 |         if os.path.exists(self.filedir) == False:
31 |             os.mkdir(self.filedir)
32 |     #下载任务
33 |     def Schedule(self,blocknum,blocksize,totalsize):
34 |         '''''
35 |         blocknum:已经下载的数据块
36 |         blocksize:数据块的大小
37 |         totalsize:远程文件的大小
38 |         '''
39 |         per = 100.0 * blocknum * blocksize / totalsize
40 |         if per > 100 :
41 |             per = 100
42 |         conf.LOCK.acquire()
43 |         conf.PERLIST[self.__id]= per#记录每个线程的下载百分比，用于计算整个的进度状况
44 |         nowsum = 0;#当前的进度
45 |         for item in conf.PERLIST:
46 |             nowsum+=item
47 |         str = u'当前下载进度:---------------->>>>>>>> %.2f%%' % (100*nowsum/conf.PERSUM)
48 |         sys.stdout.write(str+"\r")
49 |         sys.stdout.flush()
50 |         conf.LOCK.release()
51 | 
52 | 
53 | 
54 | 
55 | 
56 | 
57 | 
58 | 
59 | 
60 | 
61 | 


--------------------------------------------------------------------------------
/python/filedeal/file_downloader.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/igo9go/downloaa_imooc/71e0ca2f1a6bf761158b877d434b49fed83bbe1a/python/filedeal/file_downloader.pyc


--------------------------------------------------------------------------------
/python/index.py:
--------------------------------------------------------------------------------
 1 | #!D:\Python27\python
 2 | # -*- coding: utf-8 -*-
 3 | 
 4 | from spider.spiderman import SpiderMan
 5 | 
 6 | 
 7 | 
 8 | 
 9 | 
10 | 
11 | 
12 | 
def main():
    """Create the spider and start its interactive console workflow."""
    SpiderMan().cmdshow_gbk()
    # SpiderMan().crawl("http://www.imooc.com/learn/110")


if __name__ == "__main__":
    main()


--------------------------------------------------------------------------------
/python/spider/__init__.py:
--------------------------------------------------------------------------------
1 | __author__ = 'Xaxdus'
2 | 


--------------------------------------------------------------------------------
/python/spider/__init__.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/igo9go/downloaa_imooc/71e0ca2f1a6bf761158b877d434b49fed83bbe1a/python/spider/__init__.pyc


--------------------------------------------------------------------------------
/python/spider/html_downloader.py:
--------------------------------------------------------------------------------
 1 | #coding:utf-8
 2 | import cookielib
 3 | import urllib2
 4 | class Html_Downloader(object):
 5 |     '''
 6 |     这个类主要是下载html使用的是urllib2模块
 7 |     '''
 8 |     def download(self, url):
 9 |         if url is None:
10 |             return None
11 |         request = urllib2.Request(url)
12 |         #下面的两个header是为了模拟手机浏览器，因为慕课网app可以不用注册就可以访问视频，所以把咱们的程序模拟成手机浏览器，就可以直接下载了
13 |         request.add_header('user-agent','Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.93 Safari/537.36')
14 |         request.add_header('host','www.imooc.com')
15 |         response= urllib2.urlopen(request)
16 |         if response.getcode()!=200:
17 |             return None
18 |         return response.read()


--------------------------------------------------------------------------------
/python/spider/html_downloader.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/igo9go/downloaa_imooc/71e0ca2f1a6bf761158b877d434b49fed83bbe1a/python/spider/html_downloader.pyc


--------------------------------------------------------------------------------
/python/spider/html_parser.py:
--------------------------------------------------------------------------------
 1 | #coding:utf-8
 2 | import re
 3 | import urlparse
 4 | from bs4 import BeautifulSoup
 5 | from conf import DOWNLOAD_URL
 6 | from entity.fileinfor import FileInfor
 7 | from spider.html_downloader import Html_Downloader
 8 | 
 9 | 
class Html_Parser(object):
    '''
    HTML parser: extracts the video information from a course page.
    '''
    def __init__(self):
        self.res_data = []  # FileInfor objects collected so far

    def parser(self, html_cont):
        '''
        Build one FileInfor per lesson link and append it to res_data.

        :param html_cont: HTML content of the course page
        :return: self.res_data, or None when html_cont is None
        :raises IOError: when the media info of a lesson cannot be downloaded
        '''
        if html_cont is None:
            return
        # Imported here because the module does not import json at file level.
        import json

        soup = BeautifulSoup(html_cont, 'html.parser', from_encoding='utf-8')
        subject = soup.find('div', class_="hd").get_text()
        links = soup.find_all('a', class_='J-media-item')
        # Used to request the real video links of each lesson.
        html_down = Html_Downloader()

        for link in links:
            fileinfor = FileInfor()
            fileinfor.subject = subject.strip()
            # Strip every CR and LF (not only CRLF pairs) so the file name
            # never contains a line break.
            fileinfor.filename = (link.get_text().strip()
                                  .replace(':', '_')
                                  .replace("\r", "")
                                  .replace("\n", "")
                                  .replace(u'开始学习', "")
                                  .replace(' ', ''))
            fileinfor.mid = link['href'].split('/')[2]

            raw = html_down.download(DOWNLOAD_URL.replace('{}', fileinfor.mid))
            if raw is None:
                raise IOError('no media info returned for lesson %s' % fileinfor.mid)
            # json.loads instead of eval(): the payload comes from the network
            # and must never be executed; it also handles "\/" escapes and
            # true/false/null, which eval() cannot.
            media = json.loads(raw)
            mpath = media['data']['result']['mpath']
            for quality, index in (('L', 0), ('M', 1), ('H', 2)):
                fileinfor.url[quality] = mpath[index]
            self.res_data.append(fileinfor)
        return self.res_data
48 | 
49 | 
50 | 
51 | 
52 | 
53 | 


--------------------------------------------------------------------------------
/python/spider/html_parser.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/igo9go/downloaa_imooc/71e0ca2f1a6bf761158b877d434b49fed83bbe1a/python/spider/html_parser.pyc


--------------------------------------------------------------------------------
/python/spider/spiderman.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | import os
 4 | from conf import COURSEURL, CHOOSE
 5 | import conf
 6 | from filedeal import file_downloader
 7 | from spider import html_parser
 8 | from spider import html_downloader
 9 | 
10 | '''
11 | 
12 | 这个类是爬虫的主逻辑
13 | '''
14 | 
15 | class SpiderMan(object):
16 | 
17 |     def __init__(self):
18 |         self.downloader = html_downloader.Html_Downloader()#html下载器
19 |         self.parser = html_parser.Html_Parser()#html解析器
20 | 
21 | 
22 | 
23 |     def crawl(self,url):
24 |         '''
25 | 
26 |         :param url: 需要爬取的url
27 |         :return:
28 |         '''
29 |         #下载好的html
30 |         html_cont = self.downloader.download(url)
31 |         #爬取到的视频数据信息
32 |         self.res_datas = self.parser.parser(html_cont)
33 | 
34 | 
35 | 
36 |     def download(self,res_datas):
37 |         '''
38 | 
39 |         :param res_datas: 视频数据信息列表
40 |         :return:
41 |         '''
42 |         id = 0 #设置线程的id号，只是为了进度条显示的时候进行分类信息
43 |         for res_data in res_datas:
44 |             downloader = file_downloader.File_Downloader(res_data,id)#视频文件下载线程，给每个文件分配一个线程(有点偷懒了)
45 |             id += 1
46 |             conf.PERLIST.append(0)#百分比列表
47 |             downloader.start()
48 | 
49 |     def cmdshow_gbk(self):
50 |         print u'#####################################################################'
51 |         print u"#慕课网视频抓取器"
52 |         print u"author:七夜"
53 |         print u"博客：http://blog.csdn.net/qiye_/和http://www.cnblogs.com/qiyeboy/ 同步更新 "
54 |         print u"微信公众号:qiye_python"
55 |         print u"github:https://github.com/qiyeboy/"
56 |         print u"#到慕课网官网打开想要下载的课程的章节列表页面，查看当前url链接"
57 |         print u"#例如http://www.imooc.com/learn/615，则课程编号为615"
58 |         print u"#####################################################################"
59 |         try:
60 |             ID = raw_input(u'输入要下载的课程编号：'.encode('utf-8'))
61 |             url = COURSEURL+str(ID)
62 |             print u"将要下载的课程链接为:",url
63 |             print u'开始解析视频,请稍后:'
64 |             self.crawl(url)
65 |             conf.PERSUM = len(self.res_datas)*100.0#总的进度
66 |             print u'共有%d条视频'% len(self.res_datas)
67 |             print u"课程名称:%s" % self.res_datas[0].subject
68 |             for res_data in self.res_datas:
69 |                 print u"----->%s" % res_data.filename
70 | 
71 |             state = input(u'选择清晰度（1：超清UHD，2：高清HD，3：普清SD）：'.encode('utf-8'))
72 |             if state not in [1,2,3]:
73 |                 print u'输入有误'
74 |                 return
75 |             conf.STATE = CHOOSE[state-1]
76 |             self.download(self.res_datas)
77 | 
78 |         except Exception ,e:
79 |             print u'程序炸了',e
80 |             return
81 | 
82 |     def prn_obj(obj):
83 |         print '\n'.join(['%s:%s' % item for item in obj.__dict__.items()])
84 | 
85 | 
86 | 
87 | 
88 | 


--------------------------------------------------------------------------------
/python/spider/spiderman.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/igo9go/downloaa_imooc/71e0ca2f1a6bf761158b877d434b49fed83bbe1a/python/spider/spiderman.pyc


--------------------------------------------------------------------------------
/python/test/__init__.py:
--------------------------------------------------------------------------------
1 | __author__ = 'Xaxdus'
2 | 


--------------------------------------------------------------------------------
/python/test/test_bs4.py:
--------------------------------------------------------------------------------
 1 | #coding:utf-8
 2 | #pip install beautifulsoup
 3 | from bs4 import BeautifulSoup
 4 | import re
 5 | 
 6 | # soup = BeautifulSoup('123','html.parser',from_encoding='utf-8')
 7 | # #查找所有标签为a的节点
 8 | # soup.find_all('a')
 9 | # #查找所有标签为a,链接符合/view/123.html形式的节点
10 | # soup.find_all('a',href='/view/123.html')
11 | #
12 | # soup.find_all('a',href=re.compile(r'/view/\d+\.htm'))
13 | #
14 | # #查找所有标签为div,class为abc,文字为python的节点
15 | # soup.find_all('div',class_='abc',string='python')
16 | 
17 | # #获取查找到的节点的标签名称
18 | # node.name
19 | # #获取查找到的按节点的href属性
20 | # node['href']
21 | #
22 | # #获取查找到的a节点的链接文字
23 | # node.get_text()
24 | 
25 | # html_doc = """
26 | # <html><head><title>The Dormouse's story</title></head>
27 | # <body>
28 | # <p class="title"><b>The Dormouse's story</b></p>
29 | #
30 | # <p class="story">Once upon a time there were three little sisters; and their names were
31 | # <a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
32 | # <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
33 | # <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
34 | # and they lived at the bottom of a well.</p>
35 | #
36 | # <p class="story">...</p>
37 | # """
38 | # soup = BeautifulSoup(html_doc)
39 | # links = soup.find_all('a')
40 | #
41 | # for link in links:
42 | #     print link.name,link['href'],link.get_text()
43 | #
44 | # p_node = soup.find('p',class_ = "title")
45 | # print p_node.name,p_node.get_text()
46 | 
47 | # str = '<a target="_self" href="http://pic.yesky.com/146/101110646.shtml"><img alt="星女郎林允清纯活泼 当街开吃抬腿跳跃_街拍" ' \
48 | # 'src="http://image.tianjimedia.com/uploadImages/2016/071/44/6Q13W9S85S75_113.jpg"></a>'
49 | #
50 | # soup = BeautifulSoup(str,'html.parser',from_encoding='utf-8')
51 | # links = soup.find_all('img')
52 | # for link in links:
53 | #     print link['src']
54 | # str = '<a class="J-media-item studyvideo" href="/video/11304" target="_blank">1-1 项目介绍 (02:14)<i class="study-state done"></i></a>'
55 | # soup = BeautifulSoup(str,'html.parser',from_encoding='utf-8')
56 | # links = soup.find_all('a',class_='J-media-item studyvideo')
57 | #
58 | # for link in links:
59 | #     print link.get_text().strip(),int(link['href'].split('/')[2])
60 | 
# Minimal check: read the text inside the <div class="hd"> wrapper.
markup = '<div class="hd"><h2 class="l">Hibernate注解</h2></div>'
title_div = BeautifulSoup(markup, 'html.parser', from_encoding='utf-8').find('div', class_="hd")

print(title_div.get_text())
66 | 


--------------------------------------------------------------------------------
/python/test/test_httplib.py:
--------------------------------------------------------------------------------
 1 | #coding:utf-8
 2 | import urlparse
 3 | import httplib
 4 | parsedurl = urlparse.urlparse('https://ss0.bdstatic.com/94oJfD_bAAcT8t7mm9GUKT-xh_/timg?image&quality=100&size=b4000_4000&sec=1457758644&di=a35a020cb3e5da7c2c179da3f18b6ad5&src=http://img.hb.aicdn.com/d2024a8a998c8d3e4ba842e40223c23dfe1026c8bbf3-OudiPA_fw580')
 5 | 
 6 | print parsedurl[1]
 7 | print parsedurl[2]
 8 | httpConn = httplib.HTTPConnection(parsedurl[1])
 9 | httpConn.request('GET', parsedurl[2])
10 | response = httpConn.getresponse()
11 | if response.status == 200:
12 |     size = response.getheader('Content-Length')
13 |     size = int(size) / 1024
14 |     print 'Size: %s KB,Content-Type: %s, Last-Modified: %s'%(size,response.getheader('Content-Type'),response.getheader('Last-Modified'))
15 | else:
16 |     print response.status,response.reason
17 | httpConn.close()
18 | 


--------------------------------------------------------------------------------
/python/test/test_json.py:
--------------------------------------------------------------------------------
__author__ = 'Xaxdus'
import json

# Sample media-info response. A raw string keeps the "\/" and "\uXXXX"
# escapes intact so that the JSON decoder, not Python, interprets them.
payload = r'''{"result":0,"data":{"result":{"mid":11305,"mpath":["http:\/\/v2.mukewang.com\/b0e670ef-7695-4ded-b5d8-3f78d221413d\/L.mp4?auth_key=1457620869-0-0-688a98b2f90af6006b38ddf89b750ad1","http:\/\/v2.mukewang.com\/b0e670ef-7695-4ded-b5d8-3f78d221413d\/M.mp4?auth_key=1457620869-0-0-2c0f5e8ca614ce7fd2a4e2b1ef844074","http:\/\/v2.mukewang.com\/b0e670ef-7695-4ded-b5d8-3f78d221413d\/H.mp4?auth_key=1457620869-0-0-7287a7bc3fd090f504d592117600d4cf"],"cpid":"3019","name":"\u8bbe\u7f6e\u5e03\u5c40","time":"59","practise":[]}},"msg":"\u6210\u529f"}'''

# json.loads instead of eval(): JSON text is data and must not be executed.
data = json.loads(payload)

print(data['data']['result']['mpath'][1])


--------------------------------------------------------------------------------
/python/test/test_progress.py:
--------------------------------------------------------------------------------
 1 | import os,sys,string
 2 | import time
 3 | 
 4 | def view_bar(num=1, sum=100, bar_word=":"):
 5 | 	rate = float(num) / float(sum)
 6 | 	rate_num = int(rate * 100)
 7 | 	print '\r%d%% :' %(rate_num)
 8 | 
 9 | 	for i in range(0, num):
10 | 		os.write(1, bar_word)
11 | 	sys.stdout.flush()
12 | 
if __name__ == '__main__':
	# Demo run: 100 updates, pausing 0.1 s before each redraw.
	for step in range(100):
		time.sleep(0.1)
		view_bar(step, 99)
17 | 


--------------------------------------------------------------------------------
/python/test/test_urllib.py:
--------------------------------------------------------------------------------
 1 | #coding:utf-8
 2 | import urllib2,cookielib
 3 | import bs4
 4 | 
 5 | # url = "http://www.imooc.com/learn/615"
 6 | #
 7 | #
 8 | # print '第一种方法'
 9 | #
10 | # response1 = urllib2.urlopen(url)
11 | #
12 | # print response1.getcode()
13 | # print response1.read()
14 | #
15 | # print '第二种方法'
16 | #
17 | # DEFAULT_REQUEST_HEADERS = {
18 | #   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
19 | #   'Accept-Language': 'en',
20 | #    'Referer':'http://www.kuaidaili.com/',
21 | # 'Cookie':'Hm_lvt_7ed65b1cc4b810e9fd37959c9bb51b31=1466940405,1466944441; _ga=GA1.2.172144337.1466940406; _gat=1'
22 | #           '; Hm_lpvt_7ed65b1cc4b810e9fd37959c9bb51b31=1466944441',
23 | # 'Connection':'keep-alive'
24 | # }
25 | 
26 | # request = urllib2.Request("http://www.kuaidaili.com/free/inha/2")
27 | # request.add_header('user-agent','Mozilla-Firefox-Spider(Wenanry)')
28 | # request.add_header('Referer','http://www.kuaidaili.com/')
29 | # request.add_header('Cookie','Hm_lpvt_7ed65b1cc4b810e9fd37959c9bb51b31=1466953979; _ga=GA1.2.555532848.1466953979; _gat=1'
30 | #           '; Hm_lpvt_7ed65b1cc4b810e9fd37959c9bb51b31=1466953979',)
31 | #
32 | # try:
33 | #     response2= urllib2.urlopen(request)
34 | # except urllib2.HTTPError,e:
35 | #     # print response2
36 | #     # print response2.getcode()
37 | #     # print response2.read()
38 | #     print e.getcode()
39 | #     print e.read(),e.info()
40 | 
41 | 
42 | from selenium import webdriver
43 | 
# Start Chrome through Selenium and open the Baidu home page.
driver = webdriver.Chrome()
driver.get('http://www.baidu.com/')
46 | 
47 | # import requests
48 | #
49 | # loginUrl = 'http://www.kuaidaili.com/free/'
50 | # s = requests.Session()
51 | # # r = s.get(loginUrl,proxies=proxies,allow_redirects=True)
52 | # r = s.get(loginUrl,allow_redirects=True)
53 | #
54 | # response= r.text
55 | # r = s.get(loginUrl,allow_redirects=False)
56 | #
57 | # response= r.text
58 | # print response
59 | 
60 | #
61 | # print '第三种方法'
62 | # cj = cookielib.CookieJar()
63 | # opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
64 | #
65 | # urllib2.install_opener(opener)
66 | #
67 | # response3 = urllib2.urlopen(url)
68 | #
69 | # print response3.getcode()
70 | # print cj
71 | # print len(response3.read())
72 | 
73 | 
74 | # freader = urllib2.urlopen(self.__fileInfor.url[conf.STATE])
75 | # filepath = self.filedir+os.sep+self.__fileInfor.filename+'.mp4'
76 | # with open(filepath, "wb") as fwriter:
77 | #     fwriter.write(freader.read())
78 | #     fwriter.flush()
79 | # fwriter.close()
80 | # import urllib
81 | # #下载文件
82 | # def callbackfunc(blocknum, blocksize, totalsize):
83 | #     '''回调函数
84 | #     @blocknum: 已经下载的数据块
85 | #     @blocksize: 数据块的大小
86 | #     @totalsize: 远程文件的大小
87 | #     '''
88 | #     percent = 100.0 * blocknum * blocksize / totalsize
89 | #     if percent > 100:
90 | #         percent = 100
91 | #     print "%.2f%%"% percent
92 | # url = 'http://www.sina.com.cn'
93 | # local = 'd:\\sina.html'
94 | # urllib.urlretrieve(url, local, callbackfunc)
95 | 


--------------------------------------------------------------------------------