├── .gitignore
├── 5tpsMp3
│   ├── 5tpsMp3.py
│   └── 5tpsMp3_py2.py
├── Ken777
│   ├── Ken777.py
│   ├── Ken777制作书籍汇总.TXT
│   └── downloaded.txt
├── README.md
├── _test.py
├── doc
│   └── weiphone.md
├── flvcd.py
├── ifengVideo
│   └── ifengVideo.py
├── itpub.py
├── lib
│   ├── __init__.py
│   ├── common.py
│   ├── config.ini
│   └── config.py
├── opencourse.py
├── opencourse_old.py
├── verycd
│   ├── simplecd.py
│   └── verycd.py
├── weiphone.py
└── youku
    └── youku_join.py

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
*.py[co]
__pycache__/

_test.py
--------------------------------------------------------------------------------
/5tpsMp3/5tpsMp3.py:
--------------------------------------------------------------------------------

import threading
import signal
import json

from lib.common import *

progress_file = '5tps.json'  # file that stores the parsing progress
o5tps = {}  # the object being parsed
cur_pos = 0


class Parser():
    BASE_URL = 'http://www.5tps.com'

    def __init__(self):
        # regexes for the download link and file name on each sub page.
        # The download pattern was garbled in this copy; reconstructed as
        # "the <a href> that follows the text 点此下载 (click to download)".
        self.rDownUrl = re.compile('点此下载.*?<a href="(.*?)"')
        self.rFileName = re.compile(r'[^/]+\.mp3')  # applied to the URL captured above

    # returns out_dict {'start_url':'', 'title':'', 'urls':[], 'content':'', 'total_size':10}
    def parseStartUrl(self, start_url):
        """
        Parse the download pages out of the start url.
        e.g. we can get 'http://www.5tps.com/down/8297_52_1_1.html'
        from 'http://www.5tps.com/html/8297.html'
        """
        out_dict = {'start_url': start_url}
        html = getHtml(start_url, 'gbk')

        # title
        titleLine = r1(r'<title>([^<>]*)</title>', html)
        title = titleLine.split(" ")[1]
        out_dict['title'] = title

        # sub links
        rUrl = re.compile(r'href=[\"\'](/down/.*?html)')
        itemUrls = rUrl.findall(html)
        # /down/8297_52_1_1.html --> http://www.5tps.com/down/8297_52_1_1.html
        out_dict['urls'] = [self.BASE_URL + url for url in itemUrls]

        # introduction text
        # xpath: .//*[@id='full']/div/div/ul/p/span
        # (pattern reconstructed; the HTML tags inside the original literal were stripped)
        content = re.search(r'<ul[^>]*>\s*<p[^>]*>\s*<span[^>]*>.+?</span>',
                            html, re.DOTALL).group()
        # strip html markup
        out_dict['content'] = htmlToText(content)

        out_dict['total_size'] = len(out_dict['urls'])
        return out_dict

    # find the real download address, return (downUrl, fileName)
    def getDownUrl(self, url):
        """ Find the real download link on a download page.
        e.g. we can get 'http://180j-d.ysts8.com:8000/人物纪实/童年/001.mp3?
        1251746750178x1356330062x1251747362932-3492f04cf54428055a110a176297d95a' from
        'http://www.5tps.com/down/8297_52_1_1.html'
        """
        content = getHtml(url, 'gbk')
        downUrl = self.rDownUrl.search(content).group(1)
        # extract fileName from the url
        fileName = self.rFileName.search(downUrl).group()
        fileName = fileName.replace('%20', ' ')

        return (downUrl, fileName)


class CheckThread(threading.Thread):
    """ checks whether queued downloads have finished """
    def __init__(self, check_list):
        threading.Thread.__init__(self)
        self.check_list = check_list

    def run(self):
        while True:
            # iterate over a snapshot so entries can be removed safely
            for filePath in self.check_list[:]:
                if os.path.exists(filePath):
                    print('  finished: %s' % filePath)
                    self.check_list.remove(filePath)
            time.sleep(0.5)


def exitApp(signum, frame):
    # save parsing progress before a forced interrupt
    o5tps['start_pos'] = cur_pos
    o5tps['down_size'] = o5tps['total_size'] - o5tps['start_pos']
    with open(progress_file, 'w') as f:
        json.dump(o5tps, f, sort_keys=True, indent=4)
    print('progress saved, exiting')
    sys.exit()


def getProgress():
    # check whether a progress file exists
    try:
        global o5tps
        with open(progress_file) as f:
            o5tps = json.load(f)

        print('''resuming: {title}, {start_url}
{total_size:d} items in total, starting at #{start_pos:d}, {down_size:d} left'''
              .format(**o5tps))
        return True
    except (IOError, ValueError):
        return False


def main():
    global o5tps, cur_pos
    save_path = 'e:\\Downloads\\有声小说'
    parser = Parser()

    print('starting')

    # no progress file -> take start_url from the command line
    hasProgress = getProgress()
    if not hasProgress:
        start_url = sys.argv[1] if len(sys.argv) > 1 else None
        if not start_url:
            print('no url given')
            return
        o5tps = parser.parseStartUrl(start_url)
        o5tps['start_pos'] = 0
        print('{title} has {total_size:d} items'.format(**o5tps))

    # register the interrupt handler
    signal.signal(signal.SIGINT, exitApp)

    # pull out a few values for readability
    check_list = []
    urls = o5tps['urls']
    cur_pos = o5tps['start_pos']
    dir_path = os.path.join(save_path, o5tps['title'])

    # start the thread that watches for finished downloads
    t = CheckThread(check_list)
    t.daemon = True
    t.start()

    # write 说明.txt (the introduction)
    if not os.path.exists(dir_path):
        os.mkdir(dir_path)
    content_path = os.path.join(dir_path, '说明.txt')
    if not os.path.exists(content_path):
        with open(content_path, 'w', encoding='gbk') as f:
            f.write(o5tps['content'])

    # resolve download links on the fly and hand them to IDM
    while cur_pos < len(urls):
        if len(check_list) < 3:
            downUrl = urls[cur_pos]

            cur_num = cur_pos + 1
            print('processing #{}, {}'.format(cur_num, downUrl))
            mp3Url, fileName = parser.getDownUrl(downUrl)

            # only queue files that do not exist locally
            filePath = os.path.join(dir_path, fileName)
            if not os.path.exists(filePath):
                addToIDM(mp3Url, dir_path)

            # add to check_list
            check_list.append(filePath)
            cur_pos += 1
        else:
            time.sleep(0.5)

    # everything finished: remove the progress file
    if os.path.exists(progress_file):
        os.remove(progress_file)
    print('all downloads finished!')

if __name__ == '__main__':
    main()
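
For reference, the resume file 5tps.json written by exitApp() is just the parsed dict plus the resume position. A sketch of its shape (values illustrative, not from a real run):

# 5tps.json -- written by exitApp(), read back by getProgress()
{
    "start_url": "http://www.5tps.com/html/8297.html",
    "title": "童年",
    "urls": ["http://www.5tps.com/down/8297_52_1_1.html", "..."],
    "content": "...",
    "total_size": 2,
    "start_pos": 1,
    "down_size": 1
}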
--------------------------------------------------------------------------------
/5tpsMp3/5tpsMp3_py2.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ywzhaiqi/DownloadHelpers/9f8ceff8deb6124c1e64fbec9bd658dbc27ae8bf/5tpsMp3/5tpsMp3_py2.py
--------------------------------------------------------------------------------
/Ken777/Ken777.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ywzhaiqi/DownloadHelpers/9f8ceff8deb6124c1e64fbec9bd658dbc27ae8bf/Ken777/Ken777.py
--------------------------------------------------------------------------------
/Ken777/Ken777制作书籍汇总.TXT:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ywzhaiqi/DownloadHelpers/9f8ceff8deb6124c1e64fbec9bd658dbc27ae8bf/Ken777/Ken777制作书籍汇总.TXT
--------------------------------------------------------------------------------
/Ken777/downloaded.txt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ywzhaiqi/DownloadHelpers/9f8ceff8deb6124c1e64fbec9bd658dbc27ae8bf/Ken777/downloaded.txt
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Download helper scripts for various sites

Scripts I have collected through everyday use, mostly Python with some JavaScript. My skill is limited; discussion is welcome.
**AutoHotkey helper collection**

## Requirements

- python
- pyquery

## Categories

### verycd downloads

- [verycd.user.js](verycd/verycd@ywzhaiqi@gmailcom.user.js): shows ed2k links directly on verycd pages
- [verycd.py](verycd/verycd.py): unfinished; superseded by the GM script above.
- [simplecd.py](verycd/simplecd.py): works

### Thunder (Xunlei) offline downloads

- [thunderlixianexporter.user.js](http://s.binux.me/TLE/master/ThunderLixianExporter)
- [thunderassistant.user.js](http://userscripts.org/scripts/show/111748)
  - occasionally broken: keeps showing a loading state and clicks do nothing, apparently because of its jQuery library
- [offline Python script](https://github.com/iambus/xunlei-lixian)
  - I added an IDM export for downloaded files myself; untested

### itpub forum downloads

Parses threads and adds attachments to IDM. My power workflow: copy several forum links, then an AHK script takes the URLs straight from the clipboard and runs the script.

### weiphone forum batch downloads

Collects every download link in a thread and adds them to IDM; very useful for batch downloads.

### 5tps audiobook downloads

Resolves download links on the fly (limited to 2 pending at a time) and adds them to IDM.

### NetEase Open Courses: batch address fetching

Fetches the whole open-course catalogue into MongoDB; search it and queue downloads from there.

### ifeng video downloads

Covers 财经郎眼, 锵锵三人行 and 开卷八分钟.

TODO

- download from verycd

### Youku video downloads

- Gets download links from flvcd.com and adds them to IDM.
- IDM is slightly faster than the built-in downloader, but segment merging was never solved cleanly, so I stopped using it.
- you-get cannot switch video quality when fetching.
--------------------------------------------------------------------------------
/_test.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ywzhaiqi/DownloadHelpers/9f8ceff8deb6124c1e64fbec9bd658dbc27ae8bf/_test.py
--------------------------------------------------------------------------------
/doc/weiphone.md:
--------------------------------------------------------------------------------
## Final download link
http://bbs.weiphone.com/job.php?action=download&aid=1741094&check=1&nowtime=1360316928664&verify=5c41a95d

## Click handler
The download anchor (tag reconstructed, roughly):

    <a href="job.php?action=download&aid=1741094&check=1">资治通鉴全译本.epub</a>

    function ajaxurl(o, ep) {
        read.obj = o;
        ajax.send(o.href + ((typeof ep == 'undefined' || !ep) ? '' : ep), '', ajax.get);
        return false;
    }

## ajax.send

    var nowtime = new Date().getTime();
    if (nowtime - this.last < 1500) {
        clearTimeout(this.t);
        this.t = setTimeout(function(){ajax.send(url,data,callback)},1500+this.last-nowtime);
        return;
    }
    this.last = nowtime;
    url += (url.indexOf("?") >= 0) ? "&nowtime=" + nowtime : "?nowtime=" + nowtime;
    if (typeof verifyhash != 'undefined') {
        url += '&verify=' + verifyhash;
    }

## verifyhash

    var verifyhash = '5c41a95d';

found at line 18 of read-htm-tid-1726790.html
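
Putting the pieces together, a minimal Python sketch of how the final link is assembled; it mirrors the JS above rather than being code from this repo (href and verify hash are the ones shown on this page):

import time

def build_weiphone_url(href, verifyhash):
    """Mimic ajax.send: append a millisecond timestamp and the page's verify hash."""
    nowtime = int(time.time() * 1000)  # equivalent of JS Date().getTime()
    sep = '&' if '?' in href else '?'
    return '%s%snowtime=%d&verify=%s' % (href, sep, nowtime, verifyhash)

url = build_weiphone_url(
    'http://bbs.weiphone.com/job.php?action=download&aid=1741094&check=1',
    '5c41a95d')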
--------------------------------------------------------------------------------
/flvcd.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
# Python 3
import sys
import re
import os
import json
import urllib.request
from pyquery import PyQuery

from lib.common import addToIDM

SAVE_PATH = 'e:\\Downloads\\_tmp'
TO_JOIN_PATH = 'youku_to_join.json'


def parseFlvcd(url, format='high'):
    # not sure whether urllib.parse.quote(videourl) is needed here
    print('target: {}, quality: {}'.format(url, format))
    url = "http://www.flvcd.com/parse.php?kw=" + url + '&format=' + format
    d = PyQuery(url)

    filename = d('input[name="filename"]').attr('value')
    filename = re.sub('[\\\|\:\*\"\?\<\>]', "_", filename)

    urltxt = d('input[name="inf"]').attr('value')
    url = urltxt.strip()
    addToIDM(url, SAVE_PATH, filename)
    # for url in urltxt.split('\r\n'):
    #     url = url.strip()
    #     if url:
    #         addToIDM(url)


class Flvcd():
    encoding = 'gbk'

    def __init__(self):
        self.url = ""
        # links only. Both patterns below were garbled in this copy and are
        # reconstructed: rContent grabs the results block, rNameAndUrl then
        # captures the table cells, which parse() consumes as alternating
        # name / url pairs.
        self.rContent = re.compile(r'<form name="mform".*?>(.+?)</form>', re.S)
        self.rNameAndUrl = re.compile(r'<td[^>]*>\s*(.+?)\s*</td>', re.S)
        # extract the file extension from a download url
        # e.g. http://f.youku.com/player/getFlvPath/sid/00_00/st/mp4/fileid/030008040050D9D8F35EBA0359955B550E4FE0-A3D0-5DC5-C140-BFCFE663892F?K=2d47c2f50d44936424114172
        self.rFileExt = re.compile('st/(\w+)/fileid/')
        self.headers = {"Accept": "*/*", "Accept-Language": "zh-CN",
                        "User-Agent": "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1)",
                        "Connection": "Keep-Alive"}

    def fetchHtml(self, url):
        req = urllib.request.Request(url, headers=self.headers)
        f = urllib.request.urlopen(req)
        return f.read().decode(self.encoding)

    def parse(self, url, format='high'):
        """ return [(url, name)...] """
        # not sure whether urllib.parse.quote(videourl) is needed here
        self.url = "http://www.flvcd.com/parse.php?kw=" + url + '&format=' + format
        html = self.fetchHtml(self.url)

        # first isolate the big result block
        m = self.rContent.search(html)
        content = m.group(1)
        # then pick out all the names and urls
        result = self.rNameAndUrl.findall(content)

        size = len(result)
        print('the video has {:d} segments, save path: {}'.format(int(size / 2), SAVE_PATH))
        outList = []
        if size > 0:
            for i in range(0, size, 2):
                name = result[i]  # no extension, e.g. 「ZEALER 出品」 华为荣耀四核 测评-0001
                url = result[i + 1]
                # attach the extension
                fileExt = self.rFileExt.search(url).group(1)
                name += '.' + fileExt

                outList.append((url, name))
            return outList
        else:
            print("URL Not Found")


def addAllToIDM(allList):
    print('-' * 40)
    for url, name in allList:
        addToIDM(url, SAVE_PATH, name)
        print('added to IDM: {}'.format(name))

    print("everything queued; start the downloads in IDM")


def saveData(allList):
    # load the existing list first
    try:
        with open(TO_JOIN_PATH) as f:
            videos = json.load(f)
    except (IOError, ValueError):
        videos = []

    # only needed when there are >= 2 segments
    if len(allList) >= 2:
        # collect the current segment paths
        filePaths = []
        for url, name in allList:
            path = os.path.join(SAVE_PATH, name)
            filePaths.append(path)

        # add to videos
        videos.append(filePaths)

        # rewrite the record file
        with open(TO_JOIN_PATH, "w") as f:
            json.dump(videos, f, indent=4, ensure_ascii=False)
        print('-' * 40)
        print("to-join list saved to %s" % TO_JOIN_PATH)


def command_line_runner():
    argc = len(sys.argv)
    if argc == 2:
        format = 'high'
    elif argc == 3:
        format = sys.argv[2]
    else:
        # for Youku: super = 超清, high = 高清, normal = 标清
        print("Usage: %s videoUrl [videoQuality=normal|high|super]" % sys.argv[0])
        print("  e.g.")
        print("  %s http://v.youku.com/v_show/id_XMzMzMjE0MjE2.html super" % sys.argv[0])
        return

    videoUrl = sys.argv[1]
    parseFlvcd(videoUrl, format)

if __name__ == '__main__':
    # main()
    command_line_runner()
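
saveData() above and youku/youku_join.py share youku_to_join.json: a JSON list with one inner list of segment paths per video, in playback order. A sketch of its contents (paths illustrative):

[
    ["e:\\Downloads\\_tmp\\some video-0001.mp4",
     "e:\\Downloads\\_tmp\\some video-0002.mp4"]
]

joinVideo() later derives the merged file's name by stripping "-0001" from the first path.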
--------------------------------------------------------------------------------
/ifengVideo/ifengVideo.py:
--------------------------------------------------------------------------------

import re
import os
import fnmatch
import sqlite3

from lib.common import *

''' Fetch download addresses for 开卷八分钟 / 财经郎眼 / 锵锵三人行 videos
    from http://blog.sina.com.cn/lmiou '''

FILE_PATH = 'ifengVideo.lst'
DB_PATH = 'ifengVideo.db'


def getAllPages(start=1, end=38):
    """ collect the download addresses from every page """
    downUrls = []
    for i in range(start, end + 1):
        url = 'http://blog.sina.com.cn/s/article_sort_1490886071_10001_%s.html' % i
        print('fetching page %s...' % i)
        urls = getOnePage(url)
        size = len(urls)
        if size < 10:
            print("  only got %d download links" % size)
            print("  " + url)
        downUrls.extend(urls)

    print("got %d download links in total" % len(downUrls))
    with open(FILE_PATH, 'w+') as f:
        urls_str = '\n'.join(downUrls)
        f.write(urls_str)
    print("wrote %s" % FILE_PATH)


def getOnePage(url):
    """ collect the addresses on a single page """
    # f = urllib.request.urlopen(url)
    # content = f.read().decode('utf-8')
    content = getHtml(url, 'utf-8')

    urls = re.findall(r'ed2k://.*?/', content, re.S)
    # links may be wrapped across lines; drop markup and line breaks
    # (the tag in the original replace() call was lost; '<wbr>' is a guess)
    urls = [url.strip().replace('<wbr>', '').replace('\n', '') for url in urls]
    return urls


def getAllItems():
    """ split each download address into its parts
    e.g. [(downUrl, '开卷八分钟', '2012-12-19', '《弯曲的脊梁》(一).mkv')...]
    """
    rName = re.compile(r'''
        (\w+)                   # show name, e.g. 开卷八分钟, 文化大观园
        -[-\s]?                 # separator: '-' possibly followed by a space or '-'
        (\d{4}-\d{2}-\d{2})     # date, e.g. 2012-12-18
        -?                      # separator
        (.*)                    # file name, e.g. 王立军的“火化论”颇具文化修养.mkv
        ''', re.VERBOSE)
    with open(FILE_PATH) as f:
        items = []
        for line in f:
            name = r1(r'ed2k://\|file\|(.*?)\|', line)
            name = re.sub(r'-?新时空家园录制|-?【华夏视讯网首发】|-?3e帝国录制', '', name)

            m = rName.match(name)
            if m:
                url = line.strip()
                t = (url, m.group(1), m.group(2), m.group(3))
                items.append(t)
            else:
                print('not match: %s' % name)
        return items


def writeAllItems(items):
    """ write the items into the database """
    if len(items) == 0:
        print("items size is 0")
        return

    con = sqlite3.connect(DB_PATH)
    con.execute('DROP TABLE IF EXISTS items')
    # size is filled in later by updateAllSize()
    con.execute('CREATE TABLE IF NOT EXISTS items '
                '(url VARCHAR(255) UNIQUE, type TEXT, date TEXT, name TEXT, size INTEGER)')

    cur = con.cursor()
    for item in items:
        # item: (url, type, date, name)
        cur.execute('INSERT INTO items (url, type, date, name) VALUES(?,?,?,?)', item)

    con.commit()
    cur.close()
    con.close()


def getDBItems():
    con = sqlite3.connect(DB_PATH)
    cur = con.execute("SELECT date, url FROM items WHERE type='开卷八分钟'")
    dbItems = [(date, url) for date, url in cur]
    cur.close()
    return dbItems


def getLocalDates():
    file_dates = []
    for file in os.listdir('e:\\Downloads\\视频\\开卷八分钟'):
        if fnmatch.fnmatch(file, '*.mkv') or fnmatch.fnmatch(file, '*.rmvb'):
            file_date = r1(r'(\d{4}-\d{2}-\d{2})', file)
            file_dates.append(file_date)
    return file_dates


def getUnDownUrls():
    dbItems = getDBItems()
    all_file_dates = [date for date, url in dbItems]

    local_file_dates = getLocalDates()

    print("present locally but missing from the database:")
    # for date in local_file_dates:          # the plain way
    #     if date not in all_file_dates:
    #         print("  " + date)
    for date in filter(lambda x: x not in all_file_dates, local_file_dates):  # alternative style
        print("  " + date)

    # set(all_file_dates) ^ set(local_file_dates)

    un_down_urls = [url for date, url in dbItems if date not in local_file_dates]
    print("%s still not downloaded" % len(un_down_urls))

    return un_down_urls


def updateAllSize():
    conn = sqlite3.connect(DB_PATH)

    c = conn.execute('SELECT rowid, url FROM items')
    for rowid, url in c:
        # ed2k://|file|<name>|<size>|<hash>|/
        size = url.split('|')[3]
        conn.execute('UPDATE items SET size=? WHERE rowid=?', (size, rowid))

    conn.commit()
    c.close()


def getSize():
    total_size = 0
    # 下载列表\开卷八分钟_下载列表.lst        3.7GB
    # 下载列表\锵锵三人行_文化大观园.lst      28.5GB
    with open('下载列表\\锵锵三人行_文化大观园.lst') as f:
        for line in f:
            if line.startswith('ed2k'):
                size = line.split('|')[3]
                total_size += int(size)

    size_str = strSize(total_size)
    print(size_str)


def test():
    conn = sqlite3.connect(DB_PATH)
    c = conn.cursor()

    c.execute('SELECT url FROM items WHERE type != "开卷八分钟"')
    with open('锵锵三人行_文化大观园.lst', 'w') as f:
        for url, in c:
            f.write(url + '\n')

    c.close()

if __name__ == '__main__':
    # un_down_urls = getUnDownUrls()

    # with open('下载列表/开卷八分钟_下载列表.lst', 'w') as f:
    #     f.write('\n'.join(un_down_urls))
    getSize()
    pass
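
A note on the ed2k handling above: the script splits the URI on '|' by hand, but lib/common.py already parses the same ed2k://|file|<name>|<size>|<hash>|/ layout, so the size sum in getSize() could equally be written as (a sketch with the same semantics):

from lib.common import parse_ed2k_link

def total_ed2k_size(lines):
    # parse_ed2k_link returns (name, hash_hex, size)
    return sum(parse_ed2k_link(line)[2] for line in lines if line.startswith('ed2k'))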
--------------------------------------------------------------------------------
/itpub.py:
--------------------------------------------------------------------------------

import re
import sys
import argparse
from urllib.parse import urlparse, parse_qs
from pyquery import PyQuery as pq

from lib.common import addToIDM

DOWN_PATH = 'e:\\Downloads\\itpub'
# attachment extensions of interest (the original pattern was a character
# class by mistake; rewritten as a real alternation)
reFileExt = re.compile(r'\.(pdf|rar|doc|excel|chm|zip|7z|'
                       r'swf|torrent|txt|sql|docx|xls|xlsx|ppt|pptx)$', re.I)

BASE_URL = 'http://www.itpub.net/'
# db = pymongo.MongoClient().myDownload


def _addToDownload(url, fileName=None):
    """ url is the attachment's download link """
    url = url.replace('attachment.php?', 'forum.php?mod=attachment&').replace('&amp;', '&')

    addToIDM(url)
    # fileName = trimFileName(fileName)
    # addToIDM(url, DOWN_PATH, fileName)
    print('  added to IDM: %s' % fileName)


def _getThreadId(threadUrl):
    # http://www.itpub.net/forum.php?mod=viewthread&tid=1596873&extra=page%3D1%26filter%3Dtypeid%26typeid%3D385%26typeid%3D385&page=2
    # http://www.itpub.net/thread-1761255-1-1.html
    o = urlparse(threadUrl)
    if 'mod=viewthread&tid=' in o.query:
        querys = parse_qs(o.query)
        threadId = querys.get('tid')[0]
        # pageNum = querys.get('page')
        # pageNum = pageNum if pageNum else 1  # default to the first page
    elif '/thread-' in o.path:
        pathList = o.path.split('-')
        threadId = pathList[1]
        # pageNum = pathList[2]
    else:
        print('Error: getThreadId from url error, ' + threadUrl)
    return int(threadId)


def _parseForum(forumUrl, downed=False):
    """ parse the threads on one forum page """
    d = pq(forumUrl)
    threads = []
    items = d('img[title="附件"]').siblings('a.xst').items()
    for item in items:
        url = BASE_URL + item.attr('href')
        name = item.text()
        threadId = _getThreadId(url)

        thread = {'_id': threadId, 'name': name, 'url': url, 'downed': downed}
        threads.append(thread)

    print('got %d threads' % len(threads))
    return threads


# def setOneBookDowned(threadUrl, threadId=None):
#     if not threadId:
#         threadId = _getThreadId(threadUrl)
#     db.itpub.update({'_id': threadId}, {'$set': {'downed': True}}, upset=True)
#     print('marked as downloaded')


# def parseAllThreads(start=1, end=7):
#     """ the digest book list:
#     http://www.itpub.net/forum.php?mod=forumdisplay&fid=61&filter=typeid&typeid=385&page=1
#     """
#     print('parsing all digest pages')
#     for i in range(start, end + 1):
#         threadUrl = 'http://www.itpub.net/forum.php?mod=forumdisplay&fid=61&filter=typeid&typeid=385&page=%s' % i
#         print('fetch: page %s...' % i)
#         threads = _parseForum(threadUrl)
#         for t in threads:
#             db.itpub.update({'_id': t['_id']}, {'downed': True})


def _createNextPageUrl(url):
    # e.g. url = http://www.itpub.net/forum.php?mod=viewthread&tid=512296&page=2 (page two)
    if '/forum.php?mod=viewthread' in url:
        if '&page=' in url:
            bUrl, pageNum = url.split('&page=')
            pageNum = int(pageNum)
        else:
            bUrl = url
            pageNum = 1
        nextUrl = '%s&page=%d' % (bUrl, (pageNum + 1))
    # or http://www.itpub.net/thread-1608864-2-1.html
    elif '/thread-' in url:
        m = re.match(r'(.*)-(\d)-(\d\.html)', url)
        nextNum = int(m.group(2)) + 1
        nextUrl = '%s-%d-%s' % (m.group(1), nextNum, m.group(3))
    else:
        print('failed to build the next-page url: ' + url)
        return

    return nextUrl


cacheNames = []


def parseAndDownOneBook(url):
    print('  parsing: ' + url)
    d = pq(url)
    find = d('ignore_js_op a[href^="attachment.php?aid="]')

    if not find:
        return

    for e in find.items():
        downUrl = BASE_URL + e.attr('href')
        name = e.text()
        # guard against endless repetition
        if name in cacheNames:
            return
        _addToDownload(downUrl, name)
        cacheNames.append(name)

    if '.part' in name:
        # multi-part attachment: continue on the next page
        nextUrl = _createNextPageUrl(url)
        parseAndDownOneBook(nextUrl)


def downDigestBooks(limit=10):
    """ download digest books (needs the commented-out pymongo setup above) """
    books = db.itpub.find({'$or': [{'downed': {'$exists': False}}, {'downed': False}]}).limit(limit)
    for book in books:
        parseAndDownOneBook(book['url'])
        setOneBookDowned(book['url'])


def main_old():
    parser = argparse.ArgumentParser(description='itpub downloads')
    parser.add_argument('-d', dest='threadUrl', help='url of a forum thread')
    parser.add_argument('-txt', dest='txtPath', help='path of a text file with urls')
    parser.add_argument('-all', dest='downNum', help='keep downloading n digest threads')
    parser.add_argument('-df', dest='attachmentUrl', help='attachment download url')

    args = parser.parse_args(sys.argv[1:])
    if args.threadUrl:
        # download a single link
        if args.threadUrl.startswith('http://www.itpub.net'):
            parseAndDownOneBook(args.threadUrl)
        else:
            print('the url you entered is not valid')

    if args.txtPath:
        # download links in bulk
        with open(args.txtPath) as f:
            fileLines = f.readlines()

        for line in fileLines:
            url = line.strip()
            parseAndDownOneBook(url)
            print('')

    if args.attachmentUrl:
        urls = re.findall(r'http://.*', args.attachmentUrl)
        for url in urls:
            _addToDownload(url)

    if args.downNum:
        downDigestBooks(int(args.downNum))


def command_line_runner():
    inputStr = sys.argv[1]
    if inputStr.startswith('http://www.itpub.net'):
        if 'thread' in inputStr:
            dofunc = parseAndDownOneBook
        elif 'attachment.php?aid=' in inputStr:
            dofunc = _addToDownload
    elif '.txt' in inputStr:
        with open(inputStr) as f:
            inputStr = f.read()
        dofunc = parseAndDownOneBook
    else:
        input('unsupported url, press Enter to quit')
        sys.exit(1)

    urls = re.findall(r'http://.*', inputStr)
    for i, url in enumerate(urls, start=1):
        print('%s/%s %s' % (i, len(urls), url))
        dofunc(url)

if __name__ == '__main__':
    command_line_runner()
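
_createNextPageUrl handles the two thread-URL styles itpub uses; tracing the code on an example of each:

# forum.php style: a page parameter is appended or incremented
_createNextPageUrl('http://www.itpub.net/forum.php?mod=viewthread&tid=512296')
#   -> 'http://www.itpub.net/forum.php?mod=viewthread&tid=512296&page=2'

# thread-<tid>-<page>-<x>.html style: the middle number is the page
_createNextPageUrl('http://www.itpub.net/thread-1608864-1-1.html')
#   -> 'http://www.itpub.net/thread-1608864-2-1.html'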
--------------------------------------------------------------------------------
/lib/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ywzhaiqi/DownloadHelpers/9f8ceff8deb6124c1e64fbec9bd658dbc27ae8bf/lib/__init__.py
--------------------------------------------------------------------------------
/lib/common.py:
--------------------------------------------------------------------------------
# coding: utf-8
import re
import sys
import subprocess

try:
    from .config import IDM_PATH     # imported as part of the lib package
except (ImportError, ValueError):
    from config import IDM_PATH      # imported directly / under Python 2

PY3k = sys.version_info >= (3,)
SUFFIXES = ['Byte', 'KB', 'MB', 'GB', 'TB', 'PB', 'EB']

if PY3k:
    from urllib.parse import unquote
else:
    from urllib import unquote


def addToIDM(url, path=None, name=None):
    command = [IDM_PATH, '/d', url]
    if path:
        command.extend(['/p', path])
    if name:
        # IDM expects the local codepage on Python 2; Python 3 passes str through
        command.extend(['/f', name if PY3k else name.encode('gbk')])
    command.append('/a')
    retcode = subprocess.call(command)
    if name:
        print(u'added to IDM: ' + name)
    else:
        print(u'added to IDM: ' + url)
    return retcode


def str_size(size, unit='Byte'):
    """Convert a byte count into a human-readable size string."""
    if size < 1024:
        return '%.2f %s' % (size, unit)

    return str_size(size / 1024.0, SUFFIXES[SUFFIXES.index(unit) + 1])


def unquote_url(x):
    if type(x) != str:
        return x
    if PY3k:
        try:
            return unquote(x, 'utf-8')
        except UnicodeDecodeError:
            return unquote(x, 'gbk')  # can't decode in utf-8 and gbk
    else:
        x = unquote(x)
        try:
            return x.decode('utf-8')
        except UnicodeDecodeError:
            return x.decode('gbk')


def parse_ed2k_link(link):
    ed2k_re = r'ed2k://\|file\|([^|]*)\|(\d+)\|([a-fA-F0-9]{32})\|'
    m = re.match(ed2k_re, link) or re.match(ed2k_re, unquote(link))
    if not m:
        raise Exception('not an acceptable ed2k link: ' + link)
    name, file_size, hash_hex = m.groups()
    return unquote_url(name), hash_hex.lower(), int(file_size)


def parse_ed2k_id(link):
    return parse_ed2k_link(link)[1:]


def parse_ed2k_file(link):
    return parse_ed2k_link(link)[0]
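
Several scripts do `from lib.common import *` and expect getHtml, r1, htmlToText, trimFileName and strSize, but this revision of common.py does not define them (it has str_size rather than strSize). A minimal sketch of what the call sites appear to expect — the implementations below are assumptions, not code from the repo:

import re
import urllib.request

def getHtml(url, encoding='utf-8'):
    # fetch a page and decode it (5tpsMp3.py calls getHtml(url, 'gbk'))
    with urllib.request.urlopen(url) as f:
        return f.read().decode(encoding, errors='replace')

def r1(pattern, text):
    # first capture group of the first match, or None
    m = re.search(pattern, text)
    return m.group(1) if m else None

def htmlToText(html):
    # crude tag stripper, enough for the introduction blocks used here
    return re.sub(r'<[^>]+>', '', html)

def trimFileName(name):
    # drop characters Windows forbids in file names
    return re.sub(r'[\\/:*?"<>|]', '_', name)

strSize = str_size  # ifengVideo.py uses this name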
--------------------------------------------------------------------------------
/lib/config.ini:
--------------------------------------------------------------------------------
[Home]
idm_path = D:\Program Files\Internet Download Manager\IDMan.exe

[545-PC]
idm_path = F:\Program Files\Internet Download Manager\IDMan.exe

--------------------------------------------------------------------------------
/lib/config.py:
--------------------------------------------------------------------------------
# coding: gbk
import os
try:
    import configparser                     # Python 3
except ImportError:
    import ConfigParser as configparser     # Python 2

DEFAULT_CONFIG = 'config.ini'


class Config:
    def __init__(self):
        self.config = configparser.ConfigParser()
        self.profile = os.getenv('COMPUTERNAME')
        self.load_config()

    def load_config(self):
        if os.path.exists(DEFAULT_CONFIG):
            self.config.read(DEFAULT_CONFIG)

    def put(self, option, value):
        self.config.set(self.profile, option, value)

    def get(self, option):
        return self.config.get(self.profile, option)

    def write(self):
        # persist the current settings (the old version referenced the
        # undefined names 'profiles' and 'CONFIG_PATH')
        with open(DEFAULT_CONFIG, 'w') as configfile:
            self.config.write(configfile)

global_config = Config()

IDM_PATH = global_config.get('idm_path')
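
config.py keys the section on %COMPUTERNAME%, so supporting a new machine is just another section in config.ini. A usage sketch (section names as in the ini above; note that DEFAULT_CONFIG is resolved relative to the working directory):

from lib.config import global_config

# on the machine named 'Home' this returns
# D:\Program Files\Internet Download Manager\IDMan.exe
print(global_config.get('idm_path'))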
--------------------------------------------------------------------------------
/opencourse.py:
--------------------------------------------------------------------------------

import os
import argparse
from pyquery import PyQuery

from lib.common import addToIDM

BASE_PATH = 'e:\\Downloads\\_tmp'


def download(url):
    d = PyQuery(url)

    title = d('.m-cdes h2').text()
    # title = trimFileName(title)  # title of the whole course
    # info = d('.m-cdes p:first').text()  # e.g. 本课程共4集 翻译完 欢迎学习

    path = os.path.join(BASE_PATH, title)
    print('save path: ' + path)

    # there are 2 lists on the page
    for e in d('#list2 .u-ctitle').items():
        ctitle = e.text()  # title of a single lecture
        downUrl = e.siblings('.u-cdown .downbtn').attr('href')
        # untranslated lectures have no link (None)
        if downUrl:
            filename = ctitle + '.mp4'
            addToIDM(downUrl, path, filename)  # retcode=0


def get_parser():
    parser = argparse.ArgumentParser(description='help download 163 OpenCourse')
    parser.add_argument('url', metavar='URL', nargs='?', help='the course url')
    return parser


def command_line_runner():
    parser = get_parser()
    args = vars(parser.parse_args())
    if args.get('url'):
        download(args['url'])
    else:
        parser.print_help()


if __name__ == '__main__':
    command_line_runner()
--------------------------------------------------------------------------------
/opencourse_old.py:
--------------------------------------------------------------------------------
import os
import re
import sys
import argparse
import sqlite3
import urllib.parse
import json
import cmd
from fnmatch import fnmatch

import pymongo
from bs4 import BeautifulSoup
from pyquery import PyQuery

from lib.common import *

import io  # keep print from choking on undecodable characters
sys.stdout = io.TextIOWrapper(sys.stdout.buffer, errors='replace', line_buffering=True)

BASE_PATH = 'e:\\公开课'

def downCourse(url):
    d = PyQuery(url=url)

    title = d('.m-cdes h2').text()
    title = trimFileName(title)  # title of the whole course
    info = d('.m-cdes p:first').text()  # e.g. 本课程共4集 翻译完 欢迎学习

    path = os.path.join(BASE_PATH, title)
    print('save path: ' + path)

    for e in d('.u-ctitle').items():
        ctitle = e.text()  # title of a single lecture
        downUrl = e.siblings('.u-cdown .downbtn').attr('href')
        # untranslated lectures have no link (None)
        if downUrl:
            filename = ctitle + '.mp4'
            retcode = addToIDM(downUrl, path, filename)  # retcode=0
            print('  added to IDM: ' + filename)

class WebParser:
    def _parseCategory(self):
        """ parse the category list of international open courses at http://open.163.com/ocw/
        Returns
            [{"category": "文学", "name": "牛津大学《犯罪小说》", "info": "共3集 翻译完", 'url':'..'}]
        """
        # international university open courses

        startUrl = 'http://open.163.com/ocw/'
        courses = []
        html = getHtml(startUrl)
        soup = BeautifulSoup(html)
        for csoup in soup.find_all('div', 'm-conmt'):
            category = csoup.find('h3', 'f-fs1').get_text()  # e.g. 文学、艺术、哲学....

            for oItem in csoup.find_all('div', 'cnt'):
                name = oItem.h5.string  # e.g. 伟谷《亚伯拉罕·林肯两百周年纪念》
                url = oItem.h5.a['href']  # e.g. http://v.163.com/special/opencourse/abrahamlincoln.html
                info = oItem.h6.string  # e.g. 共3集 翻译完

                courses.append({
                    'category': category, 'name': name, 'url': url, 'info': info
                })

        return courses

    def _parseCourse(self, url):
        """ parse one open course

        Return
            introduction  the course description
            items         the individual lectures, as dicts
        """
        html = getHtml(url, 'gbk')
        if not html:
            print('get url fail: ' + url)
            return
        soup = BeautifulSoup(html)
        # introduction
        info_soup = soup.find('div', 'm-cdes')
        if info_soup:
            info = info_soup.get_text()
            introduction = info.replace(' \n\n\n 分享 \n\n\n\n\n\n\n /分享 \n收藏\n\n\n', '')
        else:
            introduction = ''
        # items
        items = []
        for tr_soup in soup.select('#list2 tr[class*="u-"]'):
            item = {}
            # name
            n_soup = tr_soup.find('td', 'u-ctitle')
            name = n_soup.get_text().strip()
            name = re.sub(r'\s+', ' ', name)
            name = re.sub(r':|/', '.', name)
            item['name'] = name
            # original link
            release_url = n_soup.find('a')['href']
            item['release_url'] = release_url
            # download link
            u_soup = tr_soup.find('a', 'downbtn')
            if u_soup:  # translated
                down_url = u_soup['href']
                translated = True
            else:  # not translated
                down_url = r1(r'(http.*mp4)', str(tr_soup))
                translated = False
            item['down_url'] = down_url
            item['translated'] = translated
            items.append(item)

        return introduction, items


class OpenCourse:
    """ course like
    {
        name: '', url: '', category: '文学', info: '共3集 翻译完', introduction: '',
        items: [{name:'', down_url:'', translated: True, is_downed: False, release_url: ''}]
    }
    """

    def __init__(self):
        conn = pymongo.MongoClient()
        self.db = conn.opencourse
        self.courses = self.db.courses  # the catalogue collection

    def getAndUpdateOne(self):
        """ fill in details for courses that have no items yet """
        for course in self.courses.find():
            if 'items' not in course:
                print(course['name'], course['url'])
                # _parseCourse lives on WebParser and wants the url
                parsed = WebParser()._parseCourse(course['url'])
                if parsed:
                    course['introduction'], course['items'] = parsed
                    self.courses.update({'_id': course['_id']}, course)

    def find(self, name, printItems=True):
        name = name.replace('\\', '')
        findCourses = list(self.courses.find({'name': re.compile(name)}))
        size, i = len(findCourses), 0
        print('found %d courses:' % size)
        for course in findCourses:
            items = course['items']
            # print category, name, item count
            print('%3d %s/%s, %d items:' % (i, course['category'], course['name'], len(items)))
            # print the items two per row
            if printItems:
                for j in range(0, len(items), 2):
                    # this one
                    curName = items[j]['name']
                    # curDowned = '[not downloaded]' if items[j]['is_downed'] is False else ''
                    # the next one
                    nxt = j + 1
                    if nxt == len(items):
                        nextName = ''
                    else:
                        nextName = items[nxt]['name']
                        # nextDowned = ' [not downloaded]' if items[nxt]['is_downed'] is False else ''
                    # output
                    print('    %s%s' % (curName.ljust(30), nextName))
                    # print('    %s %s %s' % (curName, curDowned.ljust(30), nextName + nextDowned))
            i += 1

        return findCourses

    def _checkLocalPath(self, course):
        print('checking local files: %s' % course['name'])
        dirPath = os.path.join(BASE_PATH, course['category'], course['name'])
        unDowned = 0
        for item in course['items']:
            # temporary: drop the old key, the name changed
            if 'is_downed' in item:
                del item['is_downed']
            if os.path.exists(os.path.join(dirPath, item['name'] + '.mp4')):
                item['downed'] = True
            else:
                item['downed'] = False
                unDowned += 1
        course['unDowned'] = unDowned
        return course

    def _updateLocalPath(self, course):
        self._checkLocalPath(course)

        self.courses.update({'_id': course['_id']}, {
            '$set': {'unDowned': course['unDowned'], 'items': course['items']},
            '$unset': {'all_downed': ''}
        })
        total = len(course['items'])
        downed = total - course['unDowned']
        # sUnDowned = ' [%s left, keep going]' % unDowned if unDowned > 0 else ' [all downloaded]'
        print('local paths updated: {}, progress: {:.0%} {}/{}'.format(
            course['name'], downed / total, downed, total))

    def listCategory(self, category):
        for c in self.courses.find({'category': category}):
            pass

    def listUndownload(self, limit=10):
        # self.courses.find({'items': {'$elemMatch': 'is_downed': False}}, project)
        courses = self.courses.find({'items.is_downed': False}, {'name': 1}).limit(limit)
        for c in courses:
            print(c['name'])

    def findAndChoose(self, name, func):
        """ search, let the user choose, then run an update or download function """
        courses = self.find(name, printItems=False)
        if courses:
            if len(courses) == 1:
                func(courses[0])
            else:
                choose = int(input('choose which one: '))
                if 0 <= choose < len(courses):
                    func(courses[choose])
                else:
                    print('not in the list')

    def downOne(self, name):
        """ queue one course in IDM
        name: folder name, fuzzy matching supported, e.g. 密歇根网络教育《平面设计》
        """
        findCourses = self.find(name)
        if findCourses:
            if len(findCourses) == 1:
                self._addToDown(findCourses[0])
            else:
                choose = int(input('choose which one: '))
                if 0 <= choose < len(findCourses):
                    self._addToDown(findCourses[choose])
                else:
                    print('not in the list')


    def test(self):
        # for f in os.listdir():
        #     if fnmatch(f, '*.mp4'):
        for c in self.courses.find():
            self._updateLocalPath(c)
        # for c in self.courses.find():
        #     folder = c['name']
        #     for item in c['items']:

        pass

    def downAll(self):
        """ should check whether courses are already downloaded """
        for course in self.courses.find():
            pass

    def _addToDown(self, course, start=0, num=100):
        """
        @args
            course  one course dict
            start   start position, default 0
            num     how many to queue, -1 for all
        """
        if num < 1:
            print('the number to queue must be at least 1!')
            return

        downPath = 'e:\\Downloads\\1\\公开课'
        dirpath = os.path.join(BASE_PATH, course['category'], course['name'])
        end = start + num
        for item in course['items'][start:end]:
            # skip untranslated lectures
            if item['translated'] is True:
                url = item['down_url']
                name = trimFileName(item['name']) + '.mp4'
                addToIDM(url, dirpath, name)
                print('added to IDM: ' + name)

class OpenCourseCommand(cmd.Cmd):
    intro = 'my open-course download manager'
    prompt = '(OpenCourse) '

    def do_exit(self, arg):
        sys.exit()

    def do_set(self, arg):
        self.A = arg
        print('A has been set')

    def do_print(self, arg):
        print(self.A)

def main():
    parser = argparse.ArgumentParser(argument_default=argparse.SUPPRESS,
                                     description='open-course search and download')
    parser.add_argument('-f', '--find', dest="find_name", default='',
                        help="fuzzy search; e.g. '-f 音乐' finds every course whose name contains 音乐")
    parser.add_argument('-d', '--down', dest='down_name', default='',
                        help="queue in IDM; e.g. '耶鲁大学《聆听音乐》' queues that whole course")
    parser.add_argument('-u', '--uplocalpath', dest='uplocal_name', default='',
                        help="sync local paths into the database")
    parser.add_argument('-l', '--list', dest='list_cmd', default='', help='test')
    parser.add_argument('test', nargs='?', help='test')

    args = parser.parse_args(sys.argv[1:])
    course = OpenCourse()

    if args.find_name:
        course.find(args.find_name)
    if args.down_name:
        course.downOne(args.down_name)
    if args.uplocal_name:
        pass
        # course.findAndChoose(args.uplocal_name, course.updateLocalPath)
    if 'test' in args:
        course.test()
        # opencourse.listUndownload()
        #
        # opencourse.updateLocalPath()

if __name__ == '__main__':
    # main()
    downCourse('http://v.163.com/special/opencourse/arabic.html')
--------------------------------------------------------------------------------
/verycd/simplecd.py:
--------------------------------------------------------------------------------

from urllib.request import urlopen
import re


def simplecd_links(url):
    m = re.match(r'(http://(?:www\.)?s[ia]mplecd\.\w+/)(id|entry)/', url)
    assert m, url
    site = m.group(1)
    html = urlopen(url).read().decode('utf-8')
    ids = re.findall(r'value="(\w+)"\s+name="selectemule"', html)
    form = '&'.join('rid=' + id for id in ids)
    q = 'mode=copy&' + form
    html = urlopen(site + 'download/?' + q).read().decode('utf-8')
    # pattern reconstructed: grab the result table that carries the links
    table = re.search(r'<table.*?</table>', html, flags=re.S).group()
    links = re.findall(r'ed2k://[^\s<>]+', table)
    return links


def extend_link(url):
    links = simplecd_links(url)
    from lib.common import parse_ed2k_file
    return [{'url': x, 'name': parse_ed2k_file(x)} for x in links]


def test():
    url = 'http://simplecd.me/entry/gAwlxX80/'
    items = extend_link(url)
    for item in items:
        print(item['name'], item['url'])


if __name__ == '__main__':
    test()
--------------------------------------------------------------------------------
/verycd/verycd.py:
--------------------------------------------------------------------------------
# -*- coding=utf-8 -*-
import re
import argparse
from pyquery import PyQuery


def parse_other_page(url):
    """ get the links from a mirror site """
    print('fetching links from gdajie.com')
    url = url.replace('www.verycd.com', 'www.verycd.gdajie.com')
    d = PyQuery(url)
    emuleFile = d('#emuleFile a[href^="ed2k://"]')
    if emuleFile:
        items = [(e.attr('href'), e.text()) for e in emuleFile.items()]
        return items
    # http://verycdfetch.duapp.com/topics/132012/
    print('no links anywhere')
    return []


def parse_verycd_topic(url):
    assert url.startswith('http://www.verycd.com/topics/'), url
    print('fetching links from verycd...')
    d = PyQuery(url)
    if d('#iptcomED2K div:contains("无法提供下载")'):
        print('  verycd has no links')
        # parse_other_page(url)
    else:
        for e in d('#iptcomED2K a[ed2k^="ed2k://"]').items():
            link = e.attr('ed2k')
            name = e.text()
            yield link, name
    # links = iptcomED2K.find('a[ed2k]').map(lambda i, e: PyQuery(e).attr('ed2k'))
    # return links

# def extend_link(url):
#     links = verycd_links(url)
#     from lixian_hash_ed2k import parse_ed2k_file
#     return [{'url':x, 'name':parse_ed2k_file(x)} for x in links]


def test():
    url_text = '''http://www.verycd.com/topics/2944233/
    http://www.verycd.com/topics/2943310/
    http://www.verycd.com/topics/2943294/
    '''

    urls = re.findall(r'(http://.*)', url_text)
    for url in urls:
        for link, name in parse_other_page(url):
            print(link)


def command_line_runner():
    parser = argparse.ArgumentParser('batch-parse verycd download links')
    parser.add_argument('url_text', metavar='URL_TEXT', nargs='?', help='a link, or text containing links')

    args = parser.parse_args()
    if args.url_text:
        urls = re.findall(r'(http://.*)', args.url_text)
        pass  # unfinished (see README)


if __name__ == '__main__':
    # command_line_runner()
    test()
--------------------------------------------------------------------------------
/weiphone.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ywzhaiqi/DownloadHelpers/9f8ceff8deb6124c1e64fbec9bd658dbc27ae8bf/weiphone.py
--------------------------------------------------------------------------------
/youku/youku_join.py:
--------------------------------------------------------------------------------

import sys
import os
import subprocess
import json

# TO_JOIN_PATH lives in flvcd.py at the repository root (this used to import
# it from an older module name, youku_flvcd, which no longer exists)
sys.path.insert(0, os.path.join(os.path.dirname(os.path.abspath(__file__)), '..'))
from flvcd import TO_JOIN_PATH


def joinVideo(videoPaths):
    """ merge one video's segments """
    if len(videoPaths) == 0:
        print("videoPaths size=0")
        return

    allName = videoPaths[0].replace("-0001", "")  # name of the merged file
    fileExt = os.path.splitext(videoPaths[0])[1]  # extension

    if fileExt == '.mp4':
        command = [r"D:\网络工具\硕鼠\mp4box.exe"]
        for videoPath in videoPaths:
            command.extend(["-cat", videoPath])
        command.extend(["-new", allName])
    elif fileExt == '.flv':
        command = [r"D:\网络工具\硕鼠\FlvBind.exe"]
        command.append(allName)
        command.extend(videoPaths)
    else:
        print('unsupported format')
        return False

    returnCode = subprocess.call(command)
    if returnCode == 0:
        # remove the segment files
        for path in videoPaths:
            os.remove(path)
            print("removed: " + path)
        return True


def main():
    # load the to-join list
    try:
        with open(TO_JOIN_PATH) as f:
            videos = json.load(f)
    except (IOError, ValueError):
        print("no to-join list")
        return

    if not videos:
        return

    # check, one video at a time, whether every segment has been downloaded
    for videoPaths in videos[:]:  # copy: entries are removed while iterating
        # skip the video if even one segment is still missing
        isFinish = True
        for path in videoPaths:
            if not os.path.exists(path):
                isFinish = False
                break

        if isFinish:
            success = joinVideo(videoPaths)
            if success:
                videos.remove(videoPaths)

    # rewrite the progress file if anything is left, otherwise delete it
    if videos:
        with open(TO_JOIN_PATH, 'w') as f:
            json.dump(videos, f, indent=4, ensure_ascii=False)
    else:
        if os.path.exists(TO_JOIN_PATH):
            os.remove(TO_JOIN_PATH)


if __name__ == '__main__':
    main()
--------------------------------------------------------------------------------