├── .editorconfig ├── .github ├── ISSUE_TEMPLATE.md └── workflows │ └── nodejs.yml ├── .gitignore ├── Dockerfile ├── LICENSE ├── README.md ├── docker-entrypoint.sh ├── docs ├── .editorconfig ├── .vuepress │ ├── components │ │ └── bilibili-player.vue │ └── config.js ├── README.md ├── advance │ ├── cli.md │ └── patch.md ├── courses │ ├── cnmooc.md │ ├── icourse163.md │ ├── icourses.md │ ├── livedu.md │ ├── open_163.md │ ├── study_163.md │ ├── study_mooc.md │ └── xuetangx.md ├── guide │ ├── basic.md │ ├── faq.md │ ├── getting-started.md │ ├── known-issues.md │ └── notice.md └── images │ ├── get_cookies.png │ └── icourse163_01.png ├── mooc.py ├── moocs ├── __init__.py ├── cnmooc.py ├── icourse163.py ├── icourses.py ├── icourses_share.py ├── livedu.py ├── open_163.py ├── study_163.py ├── study_mooc.py ├── utils.py ├── xuetangx.py └── xuetangx_next.py ├── package.json ├── requirements.txt ├── scripts └── deploy.sh └── utils ├── aria2.py └── crawler.py /.editorconfig: -------------------------------------------------------------------------------- 1 | # EditorConfig 2 | # https://editorconfig.org/ 3 | 4 | root = true 5 | 6 | [*] 7 | indent_style = space 8 | indent_size = 2 9 | end_of_line = lf 10 | charset = utf-8 11 | trim_trailing_whitespace = true 12 | insert_final_newline = true 13 | 14 | [*.py] 15 | indent_size = 4 16 | 17 | [*.md] 18 | indent_size = 3 19 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE.md: -------------------------------------------------------------------------------- 1 | ## (请在这里填写错误简述) 2 | 3 | 网站:中国大学MOOC(网易云课堂 MOOC、学堂在线) 4 | 5 | 课程地址:(请在这里填写课程地址) 6 | 7 | 问题描述:(请在这里填写问题描述) 8 | -------------------------------------------------------------------------------- /.github/workflows/nodejs.yml: -------------------------------------------------------------------------------- 1 | name: Node CI 2 | 3 | on: 4 | push: 5 | branches: 6 | - master 7 | 8 | jobs: 9 | build-and-deploy: 10 | runs-on: 
ubuntu-latest 11 | steps: 12 | - uses: actions/checkout@master 13 | - name: git-lfs 14 | run: | 15 | git lfs install 16 | git lfs pull 17 | - uses: actions/setup-node@master 18 | - name: deploy 19 | run: | 20 | npm install yarn 21 | yarn 22 | yarn deploy $ACCESS_TOKEN 23 | env: 24 | ACCESS_TOKEN: ${{ secrets.ACCESS_TOKEN }} 25 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | .hypothesis/ 50 | .pytest_cache/ 51 | 52 | # Translations 53 | *.mo 54 | *.pot 55 | 56 | # Django stuff: 57 | *.log 58 | local_settings.py 59 | db.sqlite3 60 | 61 | # Flask stuff: 62 | instance/ 63 | .webassets-cache 64 | 65 | # Scrapy stuff: 66 | .scrapy 67 | 68 | # Sphinx documentation 69 | docs/_build/ 70 | 71 | # PyBuilder 72 | target/ 73 | 74 | # Jupyter Notebook 75 | .ipynb_checkpoints 76 | 77 | # IPython 78 | profile_default/ 79 | ipython_config.py 80 | 81 | # pyenv 82 | .python-version 83 | 84 | # celery beat schedule file 85 | celerybeat-schedule 86 | 87 | # SageMath parsed files 88 | *.sage.py 89 | 90 | # Environments 91 | .env 92 | .venv 93 | env/ 94 | venv/ 95 | ENV/ 96 | env.bak/ 97 | venv.bak/ 98 | 99 | # Spyder project settings 100 | .spyderproject 101 | .spyproject 102 | 103 | # Rope project settings 104 | .ropeproject 105 | 106 | # mkdocs documentation 107 | /site 108 | 109 | # mypy 110 | .mypy_cache/ 111 | .dmypy.json 112 | dmypy.json 113 | 114 | # Pyre type checker 115 | .pyre/ 116 | 117 | ### Node ### 118 | # Logs 119 | logs 120 | *.log 121 | npm-debug.log* 122 | yarn-debug.log* 123 | yarn-error.log* 124 | lerna-debug.log* 125 | 126 | # Diagnostic reports (https://nodejs.org/api/report.html) 127 | report.[0-9]*.[0-9]*.[0-9]*.[0-9]*.json 128 | 129 | # Runtime data 130 | pids 131 | *.pid 132 | *.seed 133 | *.pid.lock 134 | 135 | # Directory for instrumented libs generated by jscoverage/JSCover 136 | lib-cov 137 | 138 | # Coverage directory used by tools like istanbul 139 | coverage 140 | *.lcov 141 | 142 | # nyc test coverage 143 | .nyc_output 144 | 145 | # Grunt intermediate storage (https://gruntjs.com/creating-plugins#storing-task-files) 146 | .grunt 147 | 148 | # 
Bower dependency directory (https://bower.io/) 149 | bower_components 150 | 151 | # node-waf configuration 152 | .lock-wscript 153 | 154 | # Compiled binary addons (https://nodejs.org/api/addons.html) 155 | build/Release 156 | 157 | # Dependency directories 158 | node_modules/ 159 | jspm_packages/ 160 | 161 | # TypeScript v1 declaration files 162 | typings/ 163 | 164 | # TypeScript cache 165 | *.tsbuildinfo 166 | 167 | # Optional npm cache directory 168 | .npm 169 | 170 | # Optional eslint cache 171 | .eslintcache 172 | 173 | # Optional REPL history 174 | .node_repl_history 175 | 176 | # Output of 'npm pack' 177 | *.tgz 178 | 179 | # Yarn Integrity file 180 | .yarn-integrity 181 | 182 | # dotenv environment variables file 183 | .env 184 | .env.test 185 | 186 | # parcel-bundler cache (https://parceljs.org/) 187 | .cache 188 | 189 | # next.js build output 190 | .next 191 | 192 | # nuxt.js build output 193 | .nuxt 194 | 195 | # vuepress build output 196 | .vuepress/dist 197 | 198 | # Serverless directories 199 | .serverless/ 200 | 201 | # FuseBox cache 202 | .fusebox/ 203 | 204 | # DynamoDB Local files 205 | .dynamodb/ 206 | 207 | # End of https://www.gitignore.io/api/node 208 | 209 | # Node.js 210 | yarn.lock 211 | package.json 212 | .huskyrc 213 | .editorconfig 214 | commitlint.config.js 215 | 216 | # draft 217 | draft/ 218 | 219 | # IDEs/editors 220 | .vscode/ 221 | .idea/ 222 | 223 | # Yarn 224 | yarn.lock 225 | 226 | # course crawler 227 | __pycache__/ 228 | *.pyc 229 | /* - */ 230 | /*.json 231 | 232 | # Others 233 | .ipynb_checkpoints 234 | .idea 235 | .DS_Store 236 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:alpine 2 | 3 | WORKDIR /app 4 | 5 | 6 | RUN apk add --update --no-cache --virtual build_images g++ gcc libxslt-dev git && \ 7 | git clone https://github.com/Foair/course-crawler.git /app && \ 8 | pip install 
requests BeautifulSoup4 lxml -i http://pypi.douban.com/simple/ --trusted-host pypi.douban.com && \ 9 | apk del build_images && \ 10 | rm -rf /app/README.md /app/LICENSE 11 | 12 | COPY ./docker-entrypoint.sh /app 13 | 14 | RUN chmod 777 ./docker-entrypoint.sh 15 | 16 | ENTRYPOINT ["./docker-entrypoint.sh"] 17 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2018 Foair 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ## Course Crawler 2 | 3 | ![python 3.6.7](https://img.shields.io/badge/python-3.6.7-green?style=flat-square&logo=python) 4 | 5 | 一个基于 Python 3 的 MOOC 课程下载工具,可以获取多个慕课网站的课件,方便离线观看 6 | 7 | ### 支持列表 8 | 9 | - [中国大学MOOC](https://www.icourse163.org/) 10 | - [网易云课堂](http://study.163.com/) 11 | - [普通课程](http://study.163.com/) 12 | - [MOOC 课程](http://mooc.study.163.com/) 13 | - [网易公开课](https://open.163.com/) 14 | - [好大学在线](https://www.cnmooc.org/) 15 | - [爱课程](http://www.icourses.cn/) 16 | - [视频公开课](http://www.icourses.cn/cuoc/) 17 | - [资源共享课](http://www.icourses.cn/mooc/) 18 | - [学堂在线](http://www.xuetangx.com/) 19 | - [北京高校优质课程研究会](http://www.livedu.com.cn/) 20 | 21 | 详细信息和用法请见 [https://www.sigure.xyz/course-crawler/](https://www.sigure.xyz/course-crawler/)。 22 | 23 | ### 声明 24 | 25 | 仅限个人学习和研究使用,切勿用于其他用途。强烈建议到 MOOC 网站进行学习,本程序只是提供一个备选方案。 26 | 27 | 本程序主体功能只是下载课件和附件,无任何手段获得付费课程,也没有以任何方式向任何人收取费用。 28 | 29 | 如果将程序用于商业用途或其他非法用途,一切后果由用户自负。 30 | 31 | 如果您发现有侵犯到您的合法权益,请与我联系删除相关代码,同时我对无意冒犯到您致以深深的歉意。 32 | 33 | ### 许可协议 34 | 35 | 请遵照 MIT 许可使用该程序。 36 | -------------------------------------------------------------------------------- /docker-entrypoint.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | python mooc.py "$@" -d "/video" 4 | -------------------------------------------------------------------------------- /docs/.editorconfig: -------------------------------------------------------------------------------- 1 | # EditorConfig 2 | # https://editorconfig.org/ 3 | 4 | root = true 5 | 6 | [*] 7 | indent_style = space 8 | indent_size = 2 9 | end_of_line = lf 10 | charset = utf-8 11 | trim_trailing_whitespace = true 12 | insert_final_newline = true 13 | 14 | [*.py] 15 | indent_size = 4 16 | 17 | [*.sh] 18 | indent_size = 4 19 | 20 | [*.md] 21 | 
indent_size = 3 22 | -------------------------------------------------------------------------------- /docs/.vuepress/components/bilibili-player.vue: -------------------------------------------------------------------------------- 1 | 14 | 15 | 34 | 35 | 50 | -------------------------------------------------------------------------------- /docs/.vuepress/config.js: -------------------------------------------------------------------------------- 1 | module.exports = { 2 | title: "Course Crawler", 3 | description: "基于 Python 3 的 MOOC 课程下载工具", 4 | base: "/course-crawler/", 5 | 6 | // 插件 7 | plugins: [ 8 | // 页面滚动时自动激活侧边栏链接 9 | "@vuepress/active-header-links" 10 | ], 11 | 12 | // 主题配置 13 | themeConfig: { 14 | nav: [ 15 | { text: "指南", link: "/" }, 16 | { text: "分类", link: "/courses/icourse163" }, 17 | { text: "进阶", link: "/advance/cli" } 18 | ], 19 | sidebarDepth: 1, 20 | sidebar: { 21 | "/advance/": ["cli", "patch"], 22 | "/courses/": [ 23 | "icourse163", 24 | "study_163", 25 | "study_mooc", 26 | "open_163", 27 | "icourses", 28 | "xuetangx", 29 | "cnmooc", 30 | "livedu" 31 | ], 32 | "/": [ 33 | "", 34 | "guide/getting-started", 35 | "guide/basic", 36 | "guide/faq", 37 | "guide/known-issues", 38 | "guide/notice" 39 | ] 40 | }, 41 | 42 | // algolia: { 43 | // apiKey: "20560f10044e76d7f16908746c3adeb1", 44 | // indexName: "siguremo_course-crawler" 45 | // }, 46 | 47 | lastUpdated: "Last Updated", // string | boolean 48 | 49 | // 假定是 GitHub. 
同时也可以是一个完整的 GitLab URL 50 | repo: "SigureMo/course-crawler", 51 | // 自定义仓库链接文字。默认从 `themeConfig.repo` 中自动推断为 52 | // "GitHub"/"GitLab"/"Bitbucket" 其中之一,或是 "Source"。 53 | repoLabel: "GitHub", 54 | 55 | // 以下为可选的编辑链接选项 56 | 57 | // 假如你的文档仓库和项目本身不在一个仓库: 58 | docsRepo: "SigureMo/course-crawler", 59 | // 假如文档不是放在仓库的根目录下: 60 | docsDir: "docs/", 61 | // 假如文档放在一个特定的分支下: 62 | // docsBranch: "docs", 63 | // 默认是 false, 设置为 true 来启用 64 | editLinks: true, 65 | // 默认为 "Edit this page" 66 | editLinkText: "在GitHub上编辑此页!", 67 | // Service Worker 的配置 68 | serviceWorker: { 69 | updatePopup: true 70 | } 71 | } 72 | }; 73 | -------------------------------------------------------------------------------- /docs/README.md: -------------------------------------------------------------------------------- 1 | # 介绍 2 | 3 | ![python 3.6.7](https://img.shields.io/badge/python-3.6.7-green?style=flat-square&logo=python) 4 | 5 | 一个基于 Python 3 的 MOOC 课程内容获取工具,方便离线观看。 6 | 7 | [下载最新程序](https://github.com/SigureMo/course-crawler/archive/master.zip) 或 [前往 GitHub](https://github.com/SigureMo/course-crawler) 8 | 9 | ## 支持列表 10 | 11 | - [中国大学 MOOC](https://www.icourse163.org/) 是国内优质的中文 MOOC 学习平台,由爱课程网携手·网易云课堂·打造。**大多数的名校都有一定数量课程**,如北京大学、浙江大学、哈尔滨工业大学等 12 | - [网易云课堂](http://study.163.com/) 13 | - [普通课程](http://study.163.com/) 涵盖方面较广,更注重于**职场、生活技能**,很多需要付费 14 | - [MOOC 课程](http://mooc.study.163.com/) 有一部分中国大学 MOOC 的内容,此外还有一些微专业内容,但是很多需要付费,推荐 [顶尖中文大学计算机专业课程体系](https://study.163.com/curricula/cs.htm) 与 [深度学习工程师微专业](https://mooc.study.163.com/smartSpec/detail/1001319001.htm) 15 | - [网易公开课](https://open.163.com/) 是网易推出的“全球名校视频公开课项目”,收录了哈佛大学等**世界级名校**的公开课课程以及可汗学院,TED 等教育性组织的精彩视频,内容较经典,但是也相对比较陈旧 16 | - [好大学在线](https://www.cnmooc.org/) 是上海交通大学拥有的中国顶尖慕课平台。主要是 **上海交通大学** 等大学或机构的课程 17 | - [爱课程](http://www.icourses.cn/) 的资源比较多,但总体相对陈旧 18 | - [视频公开课](http://www.icourses.cn/cuoc/) 19 | - [资源共享课](http://www.icourses.cn/mooc/) 20 | - [学堂在线](http://www.xuetangx.com/) 是清华大学发起的精品中文慕课平台。主要是 **清华大学** 的课程 21 | - 
[北京高校优质课程研究会](http://www.livedu.com.cn/) 是北京市教委组织的**北京各高校**课程平台 22 | -------------------------------------------------------------------------------- /docs/advance/cli.md: -------------------------------------------------------------------------------- 1 | # 命令行参数 2 | 3 | 4 | 5 | ## 显示帮助信息 6 | 7 | > `-h` `--help` 用于显示帮助信息。 8 | 9 | 输入 `python mooc.py -h` 或 `python mooc.py --help`。 10 | 11 | ## 指定下载目录 12 | 13 | > `-d ` `--dir=` 用于指定下载目录为 ``。 14 | 15 | 课程文件夹将创建在 `` 中。默认创建在当前目录,即 `-d ""`。 16 | 17 | 示例 18 | 19 | ```bash 20 | python mooc.py -d "G:\MOOCs" https://www.icourse163.org/course/TONGJI-53004 21 | ``` 22 | 23 | ::: tip 24 | `` 不能以 \ 结尾;当 `` 存在空格的时候,必须使用 `"` 将路径包裹起来。 25 | ::: 26 | 27 | ## 重新录入 Cookies 28 | 29 | > `-c` `--restore-cookies` 用于在程序运行时录入新的 Cookies,以覆盖旧的 Cookies 30 | 31 | 由于 Cookies 经常存在过期的情况,手动去删除会很麻烦,这时只需要运行时加上这样一个参数就可以将旧的 Cookies 覆盖掉 32 | 33 | ## 指定视频清晰度 34 | 35 | > `-r ` `--quality ` 用于指定视频清晰度为 `` 36 | 37 | `` 可选列表为 `shd` `hd` `sd` ,分别对应超高清、高清、标清,默认为超高清 38 | 39 | 示例 40 | 41 | ```bash 42 | python mooc.py -r hd https://www.icourse163.org/course/TONGJI-53004 43 | ``` 44 | 45 | ::: tip 46 | 在支持清晰度调节的课程中,如果指定的清晰度不存在,则先自动降低清晰度,若仍无匹配的清晰度,则后升高清晰度,比如指定为 hd ,则会以 hd sd shd 序列对清晰度进行匹配 47 | ::: 48 | 49 | ## 强制覆盖已下载文件 50 | 51 | > `-w`, `--overwrite` 用于启用强制覆盖已经下载过的文件 52 | 53 | 示例 54 | 55 | ```bash 56 | python mooc.py https://www.icourse163.org/course/TONGJI-53004 -w 57 | ``` 58 | 59 | ## aria2 的调用 60 | 61 | 为了方便后续视频的下载,增加了直接调用 `aria2` 进行下载的支持 62 | 63 | ::: tip aria2 相关下载: 64 | 65 | - [aria2](https://github.com/aria2/aria2/releases) 66 | - [aria2 webui](https://github.com/ziahamza/webui-aria2/archive/master.zip) 67 | - [AriaNg(一个比较好看的 webui)](https://github.com/mayswind/AriaNg/releases) 68 | 69 | ::: 70 | 71 | > `--aria2` 用于启用 `aria2` 直接下载视频 72 | 73 | 当配置好 aria2 路径后,在课件解析完成时程序不退出,直接调用 `aria2` 下载视频 74 | 75 | ::: tip 76 | 77 | 请事先确保 `aria2c` 已经是可执行程序,即已经添加到环境变量 78 | 79 | ::: 80 | 81 | 示例 82 | 83 | ```bash 84 | python mooc.py --aria2 
https://www.icourse163.org/course/TONGJI-53004 85 | ``` 86 | 87 | ## 播放列表设置 88 | 89 | 由于不同播放器对播放列表格式的要求并不相同,通过修改参数可以获得更通用的播放列表 90 | 91 | ::: tip 一些推荐的播放器 92 | 93 | - Windows 94 | - PotPlayer 95 | - Linux 96 | - SMPlayer 97 | - MacOS 98 | - IINA 99 | 100 | ::: 101 | 102 | ### 播放列表类型 103 | 104 | > `--playlist-type=` 用于指定播放列表类型 105 | 106 | 可选列表 `dpl` `m3u` `no` ,默认为 `dpl` ,若指定 `no` 则不生成播放列表 107 | 108 | ::: tip 109 | 110 | 默认生成的 `Playlist.dpl` 仅仅对 PotPlayer 有效,如果无法使用 PotPlayer (比如 Linux 下),请生成更通用的 `m3u` 格式 111 | 112 | ::: 113 | 114 | 示例 115 | 116 | ```bash 117 | python mooc.py --playlist-type=m3u https://www.icourse163.org/course/TONGJI-53004 118 | ``` 119 | 120 | ### 播放列表路径类型 121 | 122 | > `--abs-path` 用于指定播放列表内的路径为绝对路径 123 | 124 | ::: tip 125 | 126 | 有些播放器并不支持相对路径的播放列表,如果你的播放器无法打开该文件,请尝试生成绝对路径的播放列表 127 | 128 | ::: 129 | 130 | 示例 131 | 132 | ```bash 133 | python mooc.py --playlist-type=m3u --abs-path https://www.icourse163.org/course/TONGJI-53004 134 | ``` 135 | 136 | ::: warning 137 | 138 | 绝对路径的播放列表会在课程文件夹移动后失效,如果开启该选项,请不要在课程下载后进行移动 139 | 140 | ::: 141 | 142 | ## 不下载 ... 
143 | 144 | ### 不下载文档 145 | 146 | > `--no-doc` 用于阻止下载 PDF、Word、PowerPoint 等文档。 147 | 148 | 默认会下载所有文档。 149 | 150 | 当指定了这个选项之后,不会下载任何文档(包括 PPT 和书籍等)。 151 | 152 | 示例 153 | 154 | ```bash 155 | python mooc.py https://www.icourse163.org/course/TONGJI-53004 --no-doc 156 | ``` 157 | 158 | ### 不下载字幕 159 | 160 | > `--no-sub` 用于阻止下载字幕。 161 | 162 | ### 不下载富文本 163 | 164 | > `--no-text` 用于阻止下载富文本。 165 | 166 | ### 不下载附件 167 | 168 | > `--no-file` 用于阻止下载附件。 169 | 170 | ### 不下载播放列表 171 | 172 | > `--playlist-type=no` 用于阻止下载播放列表。详情见 [播放列表类型](#播放列表类型) 173 | 174 | ## 修正视频/文档名 175 | 176 | > `--inter` 用于修改文件名。 177 | 178 | 会调出文件编辑器,编辑好视频的名字之后保存。默认没有启用。 179 | 180 | ::: tip 181 | 请严格按照原来文本长度进行设置,否则可能会发生没有标题的情况。 182 | ::: 183 | -------------------------------------------------------------------------------- /docs/advance/patch.md: -------------------------------------------------------------------------------- 1 | # 修改默认值 2 | 3 | 4 | 5 | ## 修改默认获取目录 6 | 7 | 如果不想每次都指定获取目录的话,可以修改 `mooc.py`,找到如下行: 8 | 9 | ```python 10 | parser.add_argument('-d', default=r'G:\MOOCs', help='下载目录') 11 | ``` 12 | 13 | 将 `G:\MOOCs` 替换为想要的文件夹即可。 14 | 15 | ## 默认启用某个选项 16 | 17 | 修改 `mooc.py`,将选项所在 `store_false` 或 `store_true` 切换一下就行了。 18 | 19 | 示例 20 | 21 | 如果我想默认不下载 PDF,那么将 `--no-pdf` 所在的那一行的 `store_false` 改了就行了,改成这样 22 | 23 | ```python 24 | parser.add_argument('--no-pdf', action='store_true', help='不下载 PDF 文档') 25 | ``` 26 | 27 | 这样默认就不会下载 PDF,而如果在命令中使用了 `--no-pdf` 就会下载 PDF 了。 28 | -------------------------------------------------------------------------------- /docs/courses/cnmooc.md: -------------------------------------------------------------------------------- 1 | # 好大学在线 2 | 3 | ## 简介 4 | 5 | [好大学在线](https://www.cnmooc.org/) 是上海交通大学拥有的中国顶尖慕课平台。主要是 **上海交通大学** 等大学或机构的课程。 6 | 7 | ## 地址格式 8 | 9 | 课程的地址必须类似以下这种格式 10 | 11 | ``` 12 | https://www.cnmooc.org/portal/course/4386/9729.mooc 13 | ``` 14 | 15 | ## 碎碎念 16 | 17 | 要想获得课程必须保证以下两个条件均满足: 18 | 19 | - 已经在客户端或 Web 端手动加入课程; 20 | - 当前课程已经在开课时间内。 21 | 22 | 同·中国大学 
MOOC·一样,可以通过切换「开课班级」参加以前的课程。 23 | 24 | 如果当前课程还未开课,可以切换到以前的班次,并加入,这样就可以获得视频等资源。 25 | -------------------------------------------------------------------------------- /docs/courses/icourse163.md: -------------------------------------------------------------------------------- 1 | # 中国大学 MOOC 2 | 3 | ## 简介 4 | 5 | [中国大学 MOOC](https://www.icourse163.org/) 是国内优质的中文 MOOC 学习平台,由爱课程网携手·网易云课堂·打造。**大多数的名校都有一定数量课程**,如北京大学、浙江大学、哈尔滨工业大学等 6 | 7 | ## 地址格式 8 | 9 | 课程的地址必须类似以下两种格式 10 | 11 | ``` 12 | https://www.icourse163.org/course/TONGJI-53004 13 | https://www.icourse163.org/course/TONGJI-53004?tid=1001770008 14 | ``` 15 | 16 | ::: tip 17 | 18 | - 上面的 `course` 替换为 `learn` 也是支持的 19 | - `SPOC` 课程也是支持的,比如 `https://www.icourse163.org/spoc/course/WHUT-1002745006?tid=1002931006` 20 | ::: 21 | 22 | ## 开课次数 23 | 24 | 课程的地址包含了两部分信息,以 `https://www.icourse163.org/course/TONGJI-53004?tid=1001770008` 为例,`53004` 是课程号,唯一标志了同济大学开设的高等数学(一)这门课程,而 `1001770008` 代表了某学期的该课程的课程号,如果地址中不出现 `?tid=xxx` 字段,则默认为最新一次开课,所以我们可以通过控制最后的 `tid` 以达到下载不同学期的课件,而不同学期的地址我们可以在课程主页获取 25 | 26 | ![icourse163_01.png](../images/icourse163_01.png) 27 | 28 | 切换开课学期后便可在浏览器地址栏看到对应的学期课程地址 29 | 30 | ## 身份验证 31 | 32 | 中 M 的视频接口很不稳定,在这一年内进行了多次的变更,当前有两种内置的方案 33 | 34 | - 一种是在程序要求输入 Cookies 的时候直接回车注入空的 Cookies 以调用旧接口,但不保证该接口以后会不会删掉 35 | - 另一种输入完整的 Cookies ,这样会调用新的接口,但是最近(19 年 10 月),该接口只会返回新视频的 m3u8 播放列表,如果遇到该问题,请使用旧接口进行下载,问题详细描述见 [issue37](https://github.com/Foair/course-crawler/issues/37),如果该方案也无法解决,请临时使用 [mooc-dl](https://github.com/SigureMo/mooc-dl) 或者自行寻求其他解决方案 36 | 37 | ## 碎碎念 38 | 39 | 「老师已关闭该学期,无法查看」暂时无所畏惧。 40 | 41 | 找不到开课页面的话,可以先进入课程的公告页面,然后点击课程名。 42 | 43 | 如果你下载的是最新学期的课程,请**确定最新学期已经开课**,未开课的学期是无法下载的,不过你可以尝试下载前几个学期的课程。 44 | -------------------------------------------------------------------------------- /docs/courses/icourses.md: -------------------------------------------------------------------------------- 1 | # 爱课程 2 | 3 | ## 简介 4 | 5 | [爱课程](http://www.icourses.cn/) 的资源比较多,但总体相对陈旧 6 | 7 | ## 地址格式 8 | 9 | - 「资源共享课」 10 | 
11 | ``` 12 | http://www.icourses.cn/sCourse/course_6076.html 13 | http://www.icourses.cn/web/sword/portal/shareDetails?cId=6076#/course/chapter 14 | ``` 15 | 16 | - 「视频公开课」 17 | 18 | ``` 19 | http://www.icourses.cn/web/sword/portal/videoDetail?courseId=1013d845-1344-1000-b974-22f745f72788#/?resId=10195dd1-1344-1000-bbd7-22f745f72788 20 | ``` 21 | 22 | ::: tip 23 | 只要是以如下地址开始都可以,不用在意是在那一个视频。 24 | 25 | ``` 26 | http://www.icourses.cn/web/sword/portal/videoDetail 27 | ``` 28 | 29 | ::: 30 | -------------------------------------------------------------------------------- /docs/courses/livedu.md: -------------------------------------------------------------------------------- 1 | # 北京高校优质课程研究会 2 | 3 | ## 简介 4 | 5 | [北京高校优质课程研究会](http://www.livedu.com.cn/) 是北京市教委组织的**北京各高校**课程平台 6 | 7 | ## 地址格式 8 | 9 | 课程的地址必须类似以下这种格式 10 | 11 | ``` 12 | http://www.livedu.com.cn/ispace4.0/moocxjkc/toKcView.do?kcid=253 13 | ``` 14 | 15 | ## 碎碎念 16 | 17 | 下载前请确定你已经完成选课,否则也是无法解析的 18 | 19 | 另外,由于是从 HTML 中解析数据,速度极慢 20 | -------------------------------------------------------------------------------- /docs/courses/open_163.md: -------------------------------------------------------------------------------- 1 | # 网易公开课 2 | 3 | ## 简介 4 | 5 | [网易公开课](https://open.163.com/) 是网易推出的“全球名校视频公开课项目”,收录了哈佛大学等**世界级名校**的公开课课程以及可汗学院,TED 等教育性组织的精彩视频,内容较经典,但是也相对比较陈旧 6 | 7 | ## 地址格式 8 | 9 | 课程的地址必须类似以下两种格式 10 | 11 | ``` 12 | http://open.163.com/special/opencourse/cs50.html 13 | http://open.163.com/movie/2010/3/U/R/M6U6LS8CV_M6U6MHDUR.html 14 | ``` 15 | 16 | ## 碎碎念 17 | 18 | 网易公开课也是不需要 Cookies 的 19 | -------------------------------------------------------------------------------- /docs/courses/study_163.md: -------------------------------------------------------------------------------- 1 | # 网易云课堂 2 | 3 | ## 简介 4 | 5 | [网易云课堂](http://study.163.com/) 涵盖方面较广,更注重于**职场、生活技能**,很多需要付费 6 | 7 | ## 地址格式 8 | 9 | 课程的地址必须类似以下三种格式 10 | 11 | ``` 12 | 
https://study.163.com/course/courseLearn.htm?courseId=1004570029#/learn/video?lessonId=1052094278&courseId=1004570029 13 | https://study.163.com/course/courseMain.htm?courseId=1004570029 14 | https://study.163.com/course/introduction/1004570029.htm 15 | ``` 16 | 17 | ## 碎碎念 18 | 19 | 网易云课堂免费课程当前并不需要身份认证 20 | 21 | 当然,没有身份认证的话也是**不可能支持下载付费视频的**,暂时也不打算做相关支持 22 | 23 | 本文档仅针对网易云课堂普通课程,普通课程与 MOOC 课程相差很大, MOOC 课程更类似于中国大学 MOOC ,如需查看其文档,请移步 [网易云课堂 MOOC](study_mooc.md) 24 | -------------------------------------------------------------------------------- /docs/courses/study_mooc.md: -------------------------------------------------------------------------------- 1 | # 网易云课堂 MOOC 2 | 3 | ## 简介 4 | 5 | [网易云课堂 MOOC 课程](http://mooc.study.163.com/) 有一部分中国大学 MOOC 的内容,此外还有一些微专业内容,但是很多需要付费,推荐 [顶尖中文大学计算机专业课程体系](https://study.163.com/curricula/cs.htm) 与 [深度学习工程师微专业](https://mooc.study.163.com/smartSpec/detail/1001319001.htm) 6 | 7 | ## 地址格式 8 | 9 | 课程的地址必须类似以下两种格式 10 | 11 | ``` 12 | http://mooc.study.163.com/course/2001281002#/info 13 | http://mooc.study.163.com/course/2001281002 14 | ``` 15 | 16 | ::: tip 17 | 18 | - 上面的 `course` 替换为 `learn` 也是支持的 19 | ::: 20 | 21 | ## 碎碎念 22 | 23 | 与[中国大学 MOOC](./icourse163.md) 大体上相同,但它对身份的验证比较苛刻,你**本身无法访问到的内容程序也是无法帮你获取的,也就是说它并不能帮你获取你未参加的已关闭学期的内容** 24 | 25 | Cookies 极易失效,可在运行时添加参数 `-c` 注入新的 Cookies 26 | -------------------------------------------------------------------------------- /docs/courses/xuetangx.md: -------------------------------------------------------------------------------- 1 | # 学堂在线 2 | 3 | ## 简介 4 | 5 | [学堂在线](http://www.xuetangx.com/) 是清华大学发起的精品中文慕课平台。主要是 **清华大学** 的课程 6 | 7 | ## 地址格式 8 | 9 | 课程的地址必须类似以下这种格式 10 | 11 | ``` 12 | https://next.xuetangx.com/course/HNU08071000999/1076493 13 | ``` 14 | 15 | ## 碎碎念 16 | 17 | 学堂在线于 19 年 10 月左右进行了大更新,域名改为了 `next.xuetangx.com` ,如果你还能找到类似下面这种 `www.xuetangx.com` 下的旧版本课程的话,现在也是支持下载的 18 | 19 | ``` 20 | http://www.xuetangx.com/courses/course-v1:TsinghuaX+00740043_2x_2015_T2+sp/about 21 | ``` 22 | 
-------------------------------------------------------------------------------- /docs/guide/basic.md: -------------------------------------------------------------------------------- 1 | # 深入了解 2 | 3 | ## 课程目录结构 4 | 5 | ``` 6 | 7 | |-- Outline.txt 8 | |-- Playlist.dpl 9 | |-- Files/ 10 | |-- PDFs/ 11 | |-- Texts/ 12 | `-- Videos/ 13 | |-- Rename.bat 14 | `-- Videos.txt 15 | ``` 16 | 17 | ### 课程大纲 18 | 19 | `Outline.txt` 是课程的大纲,它的内容类似 20 | 21 | ``` 22 | 6.1 空间直角坐标系及向量 {1} 23 | 6.1.1 空间直角坐标系的基本概念 {1.1} 24 | 6.1.1 空间直角坐标系的基本概念(视频) {1.1.1}# 25 | 6.1.1 空间直角坐标系的基本概念(PPT) {1.1.1}+ 26 | 6.1.1 空间直角坐标系的基本概念(PPT) 空间直角坐标系的基本概念.rar {1.1.1}! 27 | ... 28 | ``` 29 | 30 | 每个级别依次增加 2 个空格的缩进,`{}` 之间的是程序生成的编号,用来唯一标识一个资源(比如视频、富文本等等)。 31 | 32 | `{1.1.1}` 说明该视频文件以 `1.1.1` 开头,可以在 `Videos/` 中找到。如此可以方便地找到视频。 33 | 34 | 有些后面可能有奇怪的符号,比如 `{1.1.1}+`的后面有个 `+`。下面是符号的说明: 35 | 36 | - #: 视频,可以下载到 `Videos/` 37 | - \*:课件,一般是 PDF 文件,位于 `PDFs/` 38 | - +:富文本,一般是 HTML 文件,位于 `Texts/` 39 | - !:附件,位于 `Files/` 40 | - &:字幕,位于 `Videos/` 41 | 42 | ### 视频地址 43 | 44 | `Videos.txt` 是视频的链接,它的内容类似 45 | 46 | ``` 47 | http://v.stu.126.net/mooc-video/nos/mp4/2017/02/21/1005820377_aa6e1b0d92314cdfaf6dcad3351b3533_shd.mp4?ak=99ed7479ee303d1b1361b0ee5a4abcee11069a7277fd2bfbd983de77f6586b3ab4d3781458cdbd61bf0041fae59dee85cb91769ba5850a28845217d0bc9bfb580015e48ffc49c659b128bfe612dda086d65894b8ef217f1626539e3c9eb40879c29b730d22bdcadb1b4f67996129275fa4c38c6336120510aea1ae1790819de86e0fa3e09eeabea1b068b3d9b9b6597acf0c219eb000a69c12ce9d568813365b3e099fcdb77c69ca7cd6141d92c122af 48 | 
http://v.stu.126.net/mooc-video/nos/mp4/2017/02/21/1005822368_a91783c5f05a49e29960d24f1dc06f15_shd.mp4?ak=99ed7479ee303d1b1361b0ee5a4abcee11069a7277fd2bfbd983de77f6586b3a33090c48273cc5e338f1d269a2b016013857294759d07b499e26c45d788128b30015e48ffc49c659b128bfe612dda086d65894b8ef217f1626539e3c9eb40879c29b730d22bdcadb1b4f67996129275fa4c38c6336120510aea1ae1790819de86e0fa3e09eeabea1b068b3d9b9b6597acf0c219eb000a69c12ce9d568813365b3e099fcdb77c69ca7cd6141d92c122af 49 | ... 50 | ``` 51 | 52 | 复制到下载工具下载,比如 [aria2](https://github.com/aria2/aria2/releases)、迅雷 等,也可以直接在浏览器中打开。 53 | 54 | ### 视频文件名 55 | 56 | `Rename.bat` (或 `Rename.sh`)用于将视频重命名,它的内容类似 57 | 58 | ``` 59 | CHCP 65001 60 | 61 | REN "1005820377_aa6e1b0d92314cdfaf6dcad3351b3533_shd.mp4" "1.1.1 空间直角坐标系的基本概念(视频).mp4" 62 | REN "1005822368_a91783c5f05a49e29960d24f1dc06f15_shd.mp4" "1.2.1 向量的坐标表示(视频).mp4" 63 | REN "1005817378_500b5301360f49c18c6f8d3406959cf5_shd.mp4" "1.3.1 向量的模、方向余弦、投影(视频).mp4" 64 | REN "1005821395_ff485bb1e65145ec90bf04a259eb6b0e_shd.mp4" "2.1.1 向量的数量积(视频).mp4" 65 | REN "1005821396_9180e5908bc847548a8db625af9b1ad7_shd.mp4" "2.2.1 向量的数量积(续)(视频).mp4" 66 | REN "1005817386_18d7ede415ec4cb5befa71a9d790ce0f_shd.mp4" "2.3.1 向量的向量积(视频).mp4" 67 | REN "1005822373_8bf3846066e045cda306bd7d27e38786_shd.mp4" "2.4.1 向量的向量积(续)(视频).mp4" 68 | REN "1005899086_7780acc4ac074ed89b6301e41349a2c1_shd.mp4" "3.1.1 平面方程(视频).mp4" 69 | ... 
70 | ``` 71 | 72 | 下载下来的视频文件名是一团糟的,比如 73 | 74 | ``` 75 | 1005820377_aa6e1b0d92314cdfaf6dcad3351b3533_shd.mp4 76 | ``` 77 | 78 | 运行该文件,视频的文件名就清晰整齐了,也会按照章节次序排列。 79 | 80 | ::: tip 81 | 82 | - `Windows` 下,当视频和这个文件在同一个文件夹时直接**双击**该文件即可运行 83 | - `*nix`需要终端运行 `sh Rename.sh` 84 | 85 | ::: 86 | 87 | ### 播放列表 88 | 89 | 打开 `Playlist.dpl` 即可播放 `Videos/` 中的视频。 90 | 91 | 由于文件系统的限制,特殊字符比如 `"` `/` `\` 都不允许出现在文件名中,所以文件名中的特殊字符是被删除的。假如原视频的标题是「有 3/4 的概率会下雨」,就会变成 `有 34 的概率会下雨`,就很奇怪吧。而播放的列表就可以解决这个问题,在播放列表中会显示 `有 3/4 的概率会下雨`。 92 | 93 | ## 说明 94 | 95 | 学堂在线暂时只有 `Books`,没有 `PDFs`,因为如果提供 `PPT` 的话,在讲义那一栏就有链接可以下载。 96 | -------------------------------------------------------------------------------- /docs/guide/faq.md: -------------------------------------------------------------------------------- 1 | # FAQ 2 | 3 | ::: danger Q1: 4 | 5 | 我的登录信息输错了(失效了),怎么重新填写? 6 | 7 | ::: 8 | 9 | ::: tip A1: 10 | 11 | 重新启动程序,启动时添加参数 `-c` 并输入新的 Cookies 12 | 13 | ::: 14 | 15 | --- 16 | 17 | ::: danger Q2: 18 | 19 | 我遇到了一个课程无法成功获取,最快捷的反馈方式是? 20 | 21 | ::: 22 | 23 | ::: tip A2: 24 | 25 | 依次进行如下检查: 26 | 27 | - Cookie 是否失效,如果失效请使用参数 `-c` 并重新输入 28 | - 当前账号是否加入了该课程,并对该课程**有访问权限**(比如该学期是否是开启状态,课程是否是付费才能观看) 29 | - [Github issues](https://github.com/Foair/course-crawler/issues) 中是否有相似问题与解决方案 30 | 31 | 如果仍然无法解决,请在 Github 提出 [issue](https://github.com/Foair/course-crawler/issues/new) ,或者[邮件联系我](mailto:sigure_mo@163.com),我会尽快处理 32 | 33 | ::: 34 | 35 | --- 36 | 37 | ::: danger Q3: 38 | 39 | 我想看原版文档 40 | 41 | ::: 42 | 43 | ::: tip A3: 44 | 45 | 请前往 [Foair 的文档](https://mooc.xoy.io/) 查看 46 | 47 | ::: 48 | 49 | --- 50 | 51 | ::: danger Q4: 52 | 53 | 如何参与文档的修改? 
54 | 55 | ::: 56 | 57 | ::: tip A4: 58 | 59 | 点击文档左下角的“在 GitHub 上编辑此页” 即可~ 60 | 61 | ::: 62 | -------------------------------------------------------------------------------- /docs/guide/getting-started.md: -------------------------------------------------------------------------------- 1 | # 快速开始 2 | 3 | 4 | 5 | ## 准备工作 6 | 7 | 在下载之前,你需要保证你已经安装 `python3.5` 及其以上版本,并且安装完成依赖 8 | 9 | 需要的依赖如下 10 | 11 | - `requests` 12 | - `BeautifulSoup4` 13 | - `lxml` 14 | - `pycryptodome` 15 | 16 | ```bash 17 | pip install requests BeautifulSoup4 lxml pycryptodome 18 | ``` 19 | 20 | ## 下载程序源码 21 | 22 | 前往项目主页下载程序,或者直接点击[这里](https://github.com/SigureMo/course-crawler/archive/master.zip),之后解压 23 | 24 | 当然,已经安装 `git` 的同学可以直接 `clone` 25 | 26 | ```bash 27 | git clone https://github.com/SigureMo/course-crawler.git 28 | ``` 29 | 30 | ## 运行程序 31 | 32 | 在刚刚下载的项目根目录下打开命令行(“终端”、“命令提示符”、“PowerShell”都行,`Win10` 在项目根目录按住 `shift` 右键就有相应的选项,后面统称命令行) 33 | 34 | 在命令行中输入 `python mooc.py ` ,即可将课程课件下载到当前文件夹 35 | 36 | 比如,中国大学 MOOC 课程 `《高等数学(一)》 - 同济大学` 37 | 38 | ```bash 39 | python mooc.py https://www.icourse163.org/course/TONGJI-53004 40 | ``` 41 | 42 | ::: tip 43 | 这里的 `` 为课程主页的地址,网址的具体要求及课程下载的额外要求详见[分类](../courses/icourse163.md) 44 | ::: 45 | 46 | ## 身份验证 47 | 48 | 很多课程并不是直接就能下载的,需要验证下你的身份,这大多都可以通过输入 Cookies 解决 49 | 50 | 当你下载的课程需要输入 Cookies 时,用浏览器打开课程主页,然后按下 `F12` 打开开发者工具 51 | 52 | 切换到 `Network` 选项卡,刷新页面,在左侧选择第一个抓到的包,在右侧 `Headers` 中找到 `cookie` (也可能是 `Cookie`),复制粘贴到程序命令行中 53 | 54 | ![get_cookies.png](../images/get_cookies.png) 55 | 56 | ::: tip 57 | 58 | 如果你和我一样懒的话,可以直接三击 cookies 快速将整个 cookies 及前面的 `cookie:` 一起选中,直接复制粘贴到程序中,也是可以的,反正我是懒得从左上滑到右下啦,所以特意做了这个小“优化”~ 59 | 60 | ::: 61 | 62 | ## 等待 ... 
63 | 64 | 等待程序运行,程序首先会从课程主页获取课件列表及解析所需相关信息,之后逐个课件进行解析下载 65 | 66 | ## 下载视频 67 | 68 | 特别地,由于视频资源相对来说花费时间较多,所以视频资源并不是在解析时直接进行下载,而是解析出 `url` 至 `/Videos/Videos.txt` ,之后需要你自行使用下载工具进行下载(比如 `aria2` ,或者迅雷等) 69 | 70 | 下载后将视频移动到 `/Videos/` 内,之后双击 `Rename.bat` 即可修正视频名 71 | 72 | ::: tip 73 | 74 | - 这里的 `` 指课程根目录 75 | - Linux 下的使用以及 `Rename` 文件详情请见[视频文件名](basic.html#视频文件名) 76 | 77 | ::: 78 | 79 | ## 视频的播放 80 | 81 | 使用 PotPlayer 打开 `Playlist.dpl` 即可播放视频 82 | 83 | ::: tip 84 | 85 | 如果你并不想使用 PotPlayer ,请修改[播放列表设置](../advance/cli.html#播放列表设置) 86 | 87 | ::: 88 | -------------------------------------------------------------------------------- /docs/guide/known-issues.md: -------------------------------------------------------------------------------- 1 | # 已知问题 2 | 3 | ::: warning Q1: 4 | 5 | 可能会出现被远程主机强制关闭一个连接。 6 | 7 | ::: 8 | 9 | ::: tip A1: 10 | 11 | 解决方法:等待一段时间然后重新尝试。 12 | 13 | ::: 14 | 15 | --- 16 | 17 | ::: warning Q2: 18 | 19 | 网易云课堂(MOOC) 的 Cookie 很容易失效。 20 | 21 | ::: 22 | 23 | ::: tip A2: 24 | 25 | 解决方法:更加频繁地修改 Cookie。 26 | 27 | ::: 28 | 29 | --- 30 | 31 | ::: warning Q3: 32 | 33 | Windows 下不能自动删除 `process.out`。 34 | 35 | ::: 36 | 37 | ::: tip A3: 38 | 39 | 解决方法:手动删除 :joy:。 40 | 41 | ::: 42 | -------------------------------------------------------------------------------- /docs/guide/notice.md: -------------------------------------------------------------------------------- 1 | # 告示板 2 | 3 | ## Course Crawler 4 | 5 | 仅限个人学习和研究使用,切勿用于其他用途。强烈建议到 MOOC 网站进行学习,本程序只是提供一个备选方案。 6 | 7 | 本程序主体功能只是下载课件和附件,无任何手段获得付费课程,也没有以任何方式向任何人收取费用。 8 | 9 | 如果将程序用于商业用途或其他非法用途,一切后果由用户自负。 10 | 11 | 如果您发现有侵犯到您的合法权益,请与我联系删除相关程序,同时我对无意冒犯到您致以深深的歉意。 12 | 13 | 许可协议:MIT 14 | 15 | ## 本文档 16 | 17 | 许可协议:CC0 18 | 19 | ## 与原作联系 20 | 21 | SigureMo/course-crawler 基于 Foair 的 Course Crawler ,修复部分 bug ,并且增加部分新功能,本文档亦然 22 | 23 | 本程序的所有“完善”工作均离不开 Foair 原有的框架,在我刚刚接触到这个项目的时候,Foair 给了我莫大的鼓励与支持,这对我之后的 Coding 风格产生了极大的影响 24 | 25 | ## 推广 26 | 27 | - [bilili-dl](https://github.com/SigureMo/bilili-dl) B 站视频下载器,支持普通视频以及番剧的下载,B 
def main():
    """解析命令行参数并调用相关模块进行下载

    根据 URL 匹配对应的站点模块,收集配置后调用其 ``start`` 接口;
    无法识别的地址打印提示并以状态码 1 退出。
    """

    parser = argparse.ArgumentParser(description='Course Crawler')
    parser.add_argument('url', help='课程地址')
    parser.add_argument('-c', '--restore-cookies', action='store_true',
                        help='执行任务的时候重新输入 cookies')
    parser.add_argument('-d', '--dir', default=r'', help='下载目录')
    # 限定清晰度取值并统一小写:以前任意字符串都会被接受,
    # 直到 resolutions.index() 抛出难以理解的 ValueError 才失败。
    parser.add_argument('-r', '--quality', default='shd', type=str.lower,
                        choices=['shd', 'hd', 'sd'], help='视频清晰度')
    parser.add_argument('-w', '--overwrite',
                        action='store_true', help='强制覆盖重新下载')
    parser.add_argument('--inter', action='store_true', help='交互式修改文件名')
    parser.add_argument('--no-doc', action='store_false',
                        help='不下载 PDF、Word 等文档')
    parser.add_argument('--no-sub', action='store_false', help='不下载字幕')
    parser.add_argument('--no-file', action='store_false', help='不下载附件')
    parser.add_argument('--no-text', action='store_false', help='不下载富文本')
    parser.add_argument("--playlist-type", default="dpl",
                        choices=["dpl", "m3u", "no"], help="播放列表类型,支持 dpl 和 m3u,输入 no 不生成播放列表")
    parser.add_argument("--abs-path", action='store_true',
                        help="播放列表路径使用绝对路径,默认为相对路径")
    parser.add_argument('--aria2', action='store_true', help='自动调用aria2下载视频')

    args = parser.parse_args()
    resolutions = ['shd', 'hd', 'sd']
    playlist_path_type = 'AP' if args.abs_path else 'RP'

    # --no-* 选项使用 store_false,因此 args.no_doc 等默认为 True(即“下载”)
    config = {'doc': args.no_doc, 'sub': args.no_sub, 'file': args.no_file, 'text': args.no_text,
              'rename': args.inter, 'dir': args.dir, 'resolution': resolutions.index(args.quality),
              'overwrite': args.overwrite, 'playlist_type': args.playlist_type, 'playlist_path_type': playlist_path_type,
              'aria2': args.aria2}

    # 按 URL 分派到对应站点模块(延迟导入,避免加载无关依赖)
    if re.match(r'https?://www.icourse163.org/(spoc/)?(course|learn)/', args.url):
        from moocs import icourse163 as mooc
    elif re.match(r'https?://www.xuetangx.com/courses/.+/about', args.url):
        from moocs import xuetangx as mooc
    elif re.match(r'https?://next.xuetangx.com/course/.+', args.url):
        from moocs import xuetangx_next as mooc
    elif re.match(r'https?://mooc.study.163.com/(course|learn)/', args.url):
        from moocs import study_mooc as mooc
    elif re.match(r'https?://study.163.com/course/', args.url):
        from moocs import study_163 as mooc
    elif re.match(r'https?://open.163.com/(special|movie)/', args.url):
        from moocs import open_163 as mooc
    elif re.match(r'https?://www.cnmooc.org/portal/course/', args.url):
        from moocs import cnmooc as mooc
    elif re.match(r'https?://www.icourses.cn/web/sword/portal/videoDetail', args.url):
        from moocs import icourses as mooc
    elif re.match(r'https?://www.icourses.cn/sCourse/course_\d+.html', args.url) or \
            re.match(r'https?://www.icourses.cn/web/sword/portal/shareDetails\?cId=', args.url):
        from moocs import icourses_share as mooc
    elif re.match(r'https?://www.livedu.com.cn/ispace4.0/moocxjkc/toKcView.do\?kcid=', args.url):
        from moocs import livedu as mooc
    else:
        print('课程地址有误!')
        sys.exit(1)

    if mooc.need_cookies:
        cookies = store_cookies(mooc.name, restore=args.restore_cookies)
    else:
        cookies = None

    mooc.start(args.url, config, cookies)

    # 视频下载:解析阶段只收集 URL,此处按需交给 aria2
    if config['aria2']:
        workdir = mooc.exports["workdir"]
        workdir.change('Videos')
        videos = mooc.exports["videos"]
        aria2_download(videos, workdir.path, overwrite=config["overwrite"])
def get_summary(url):
    """获得课程信息"""

    page = CANDY.get(url).text
    soup = BeautifulSoup(page, 'lxml')
    title = soup.find(class_='view-title substr').get_text(strip=True)
    university = soup.find(class_='person-attach substr').get_text(strip=True)

    dir_name = course_dir(title, university)
    print(dir_name)
    return dir_name


def get_resource(course_nav):
    """获得视频资源"""

    counter = Counter()
    outline = Outline()
    video_list = []
    document_list = []

    page = CANDY.get(course_nav).text
    soup = BeautifulSoup(page, 'lxml')
    nav = soup.find(id='unitNavigation')
    for chapter in nav.find_all(class_='view-chapter'):
        chapter_name = chapter.find(
            class_='chapter-text substr').get_text(strip=True)
        counter.add(0)
        outline.write(chapter_name, counter, 0)

        for lecture in chapter.find_all(class_='view-lecture'):
            actions = lecture.find(class_='lecture-title')
            lecture_name = actions.get_text(strip=True)
            counter.add(1)
            outline.write(lecture_name, counter, 1)

            # 按图标类型区分同一讲内的视频与文档链接
            group = actions.div.find_all('a')
            videos = [action for action in group
                      if 'icon-play' in action.i['class'][0]]
            docs = [action for action in group
                    if 'icon-doc' in action.i['class'][0]]

            for video in videos:
                counter.add(2)
                outline.write(video['title'], counter, 2, sign='#')
                # 一讲多个视频时给文件名追加序号后缀
                extra_num = '' if len(videos) == 1 else '-%s' % str(counter)[-1:]
                video_list.append(
                    Video(counter, lecture_name + extra_num, video['itemid']))
            counter.reset()
            for doc in docs:
                counter.add(2)
                outline.write(doc['title'], counter, 2, sign='*')
                document_list.append(
                    Document(counter, lecture_name, doc['itemid']))
    return video_list, document_list


def parse_resource(video):
    """解析视频地址"""

    play_page = CANDY.post('https://www.cnmooc.org/study/play.mooc',
                           data={'itemId': video.meta, 'itemType': '10', 'testPaperId': ''}).text
    node_id = BeautifulSoup(play_page, 'lxml').find(id='nodeId')['value']

    detail = CANDY.post('https://www.cnmooc.org/item/detail.mooc',
                        data={'nodeId': node_id, 'itemId': video.meta}).json()
    if WORK_DIR.need_download(video.file_name + ".mp4", CONFIG["overwrite"]):
        url = detail['node']['flvUrl']
        FILES['videos'].write_string(url)
        FILES['renamer'].write(url.split('/')[-1], video.file_name)
        VIDEOS.append((url, video.file_name + ".mp4"))

    if not CONFIG['sub']:
        return
    exts = detail['node']['nodeExts']
    for ext in exts:
        # 多语言字幕带语言代码后缀,单一字幕直接用视频名
        suffix = '' if len(exts) == 1 else '_' + ext['languageCode']
        file_name = '%s%s.srt' % (video.file_name, suffix)
        if WORK_DIR.need_download(file_name, CONFIG["overwrite"]):
            CANDY.download_bin('https://static.cnmooc.org' +
                               ext['node']['rsUrl'], WORK_DIR.file(file_name))


def get_doc(doc_list):
    """获得文档"""

    WORK_DIR.change('Docs')
    for doc in doc_list:
        play_page = CANDY.post(
            'https://www.cnmooc.org/study/play.mooc',
            data={'itemId': doc.meta, 'itemType': '20', 'testPaperId': ''}).text
        match = re.search(r'isSlideShow\("(.+)?"\);', play_page)
        if match is None:
            # 页面上没有文档资源,跳过
            continue
        url = match.group(1)
        ext = url.split('.')[-1]
        target = doc.file_name + '.' + ext
        if WORK_DIR.need_download(target, CONFIG["overwrite"]):
            CANDY.download_bin('https://static.cnmooc.org' + url,
                               WORK_DIR.file(target))


def start(url, config, cookies=None):
    """调用接口函数"""

    global WORK_DIR
    CONFIG.update(config)
    CANDY.set_cookies(cookies)

    course_info = get_summary(url)
    WORK_DIR = WorkingDir(CONFIG['dir'], course_info)
    WORK_DIR.change('Videos')

    FILES['renamer'] = Renamer(WORK_DIR.file('Rename.{ext}'))
    FILES['videos'] = ClassicFile(WORK_DIR.file('Videos.txt'))

    course_nav = 'https://www.cnmooc.org/portal/session/unitNavigation/' + url.split('/')[-1]
    resource = get_resource(course_nav)

    rename = WORK_DIR.file('Names.txt') if CONFIG['rename'] else False

    playlist = get_playlist(CONFIG["playlist_type"], CONFIG["playlist_path_type"])
    if playlist:
        parse_res_list(resource[0], rename, playlist.write, parse_resource)
    else:
        parse_res_list(resource[0], rename, parse_resource)

    if CONFIG['doc']:
        get_doc(resource[1])

    exports.update({
        "workdir": WORK_DIR,
        "spider": CANDY,
        "videos": VIDEOS,
    })
def get_summary(url):
    """从课程主页面获取信息

    返回 (term_id, 目录名),并把 term_id 缓存进 CONFIG。
    """

    url = url.replace('learn/', 'course/')
    res = CANDY.get(url).text

    term_id = re.search(r'termId : "(\d+)"', res).group(1)
    names = re.findall(r'name:"(.+)"', res)

    dir_name = course_dir(*names[:2])

    print(dir_name)
    CONFIG['term_id'] = term_id
    return term_id, dir_name


def parse_resource(resource):
    """解析资源地址和下载资源

    通过 DWR 接口取得资源详情,按资源类型(Video/Document/Rich)
    分别登记视频地址、下载 PDF 或保存富文本。
    """

    post_data = {'callCount': '1', 'scriptSessionId': '${scriptSessionId}190',
                 'httpSessionId': '5531d06316b34b9486a6891710115ebc', 'c0-scriptName': 'CourseBean',
                 'c0-methodName': 'getLessonUnitLearnVo', 'c0-id': '0', 'c0-param0': 'number:' + resource.meta[0],
                 'c0-param1': 'number:' + resource.meta[1], 'c0-param2': 'number:0',
                 'c0-param3': 'number:' + resource.meta[2], 'batchId': str(int(time.time()) * 1000)}
    res = CANDY.post('https://www.icourse163.org/dwr/call/plaincall/CourseBean.getLessonUnitLearnVo.dwr',
                     data=post_data).text

    file_name = resource.file_name
    if resource.type == 'Video':
        if CONFIG["hasToken"]:
            # 新接口:先用 csrf token 换取视频签名,再向 vod 接口请求各清晰度地址
            video_token = CANDY.post('https://www.icourse163.org/web/j/resourceRpcBean.getResourceToken.rpc?csrfKey=' + CONFIG['token'], data={
                'bizId': resource.meta[2],
                'bizType': 1,
                'contentType': 1,
            }).json()['result']['videoSignDto']['signature']
            data = CANDY.post('https://vod.study.163.com/eds/api/v1/vod/video', data={
                'videoId': resource.meta[0],
                'signature': video_token,
                'clientType': '1'
            }).json()

            # 从用户要求的清晰度开始逐级降级尝试
            resolutions = [3, 2, 1]
            url, ext = '', ''
            for sp in resolutions[CONFIG['resolution']:]:
                # TODO: 增加视频格式选择
                for video in data['result']['videos']:
                    if video['quality'] == sp and video['format'] == 'mp4':
                        url = video['videoUrl']
                        ext = '.mp4'
                        break
                else:
                    continue
                break
            assert ext, "近期中国大学 MOOC 接口变动,请临时使用 https://github.com/SigureMo/mooc-dl"

            if WORK_DIR.need_download(file_name + ext, CONFIG["overwrite"]):
                FILES['renamer'].write(
                    re.search(r'(\w+\.mp4)', url).group(1), file_name, ext)
                FILES['video'].write_string(url)
                VIDEOS.append((url, file_name + ext))
            resource.ext = ext
        else:
            # 旧接口:直接从 DWR 响应里用正则匹配各清晰度地址
            resolutions = ['Shd', 'Hd', 'Sd']
            url, ext = '', ''
            for sp in resolutions[CONFIG['resolution']:]:
                # TODO: 增加视频格式选择
                # 修复:命名组 <ext>/<url> 原文本中被剥离成非法语法,按
                # 下方 group('url', 'ext') 的用法恢复
                video_info = re.search(
                    r'(?P<ext>mp4)%sUrl="(?P<url>.*?\.(?P=ext).*?)"' % sp, res)
                if video_info:
                    url, ext = video_info.group('url', 'ext')
                    ext = '.' + ext
                    break
            assert ext, "近期中国大学 MOOC 接口变动,请临时使用 https://github.com/SigureMo/mooc-dl"

            url = url.replace('v.stu.126.net', 'jdvodrvfb210d.vod.126.net')
            # 部分 CDN 域名失效时换用备用域名
            if CANDY.head(url, allow_redirects=True, timeout=20).status_code != 200:
                url = url.replace('mooc-video', 'jdvodrvfb210d')
            if WORK_DIR.need_download(file_name + ext, CONFIG["overwrite"]):
                FILES['renamer'].write(re.search(r'(\w+\.((m3u8)|(mp4)|(flv)))', url).group(1), file_name, ext)
                FILES['video'].write_string(url)
                VIDEOS.append((url, file_name + ext))
            resource.ext = ext

        if not CONFIG['sub']:
            return
        subtitles = re.findall(r'name="(.+)";.*url="(.*?)"', res)
        for subtitle in subtitles:
            if len(subtitles) == 1:
                sub_name = file_name + '.srt'
            else:
                # 多语言字幕:文件名追加语言标识
                subtitle_lang = subtitle[0].encode(
                    'utf_8').decode('unicode_escape')
                sub_name = file_name + '_' + subtitle_lang + '.srt'
            if not WORK_DIR.need_download(sub_name, CONFIG["overwrite"]):
                continue
            CANDY.download_bin(subtitle[1], WORK_DIR.file(sub_name))

    elif resource.type == 'Document':
        if not WORK_DIR.need_download(file_name + '.pdf', CONFIG["overwrite"]):
            return
        pdf_url = re.search(r'textOrigUrl:"(.*?)"', res).group(1)
        CANDY.download_bin(pdf_url, WORK_DIR.file(file_name + '.pdf'))

    elif resource.type == 'Rich':
        if not WORK_DIR.need_download(file_name + '.html', CONFIG["overwrite"]):
            return
        text = re.search(r'htmlContent:"(.*)",id',
                         res.encode('utf_8').decode('unicode_escape'), re.S).group(1)
        with open(WORK_DIR.file(file_name + '.html'), 'w', encoding='utf_8') as file:
            file.write(text)


def get_resource(term_id):
    """获取各种资源

    解析课程大纲(章 → 课 → 视频/PDF/富文本/附件),
    逐类调用 parse_resource 下载。
    """

    outline = Outline()
    counter = Counter()

    video_list = []
    pdf_list = []
    rich_text_list = []

    post_data = {'callCount': '1', 'scriptSessionId': '${scriptSessionId}190', 'c0-scriptName': 'CourseBean',
                 'c0-methodName': 'getMocTermDto', 'c0-id': '0', 'c0-param0': 'number:' + term_id,
                 'c0-param1': 'number:0', 'c0-param2': 'boolean:true', 'batchId': str(int(time.time()) * 1000)}
    res = CANDY.post('https://www.icourse163.org/dwr/call/plaincall/CourseBean.getMocTermDto.dwr',
                     data=post_data).text.encode('utf_8').decode('unicode_escape')

    chapters = re.findall(r'homeworks=\w+;.+id=(\d+).+name="([\s\S]+?)";', res)
    for chapter in chapters:
        counter.add(0)
        outline.write(chapter[1], counter, 0)

        lessons = re.findall(
            r'chapterId=' + chapter[0] + r'.+contentId=null.+contentType=1.+id=(\d+).+name="([\s\S]+?)"', res)
        for lesson in lessons:
            counter.add(1)
            outline.write(lesson[1], counter, 1)

            videos = re.findall(r'contentId=(\d+).+contentType=(1).+id=(\d+).+lessonId=' +
                                lesson[0] + r'.+name="([\s\S]+?)"', res)
            for video in videos:
                counter.add(2)
                outline.write(video[3], counter, 2, sign='#')
                video_list.append(Video(counter, video[3], video))
            counter.reset()

            pdfs = re.findall(r'contentId=(\d+).+contentType=(3).+id=(\d+).+lessonId=' +
                              lesson[0] + r'.+name="([\s\S]+?)"', res)
            for pdf in pdfs:
                counter.add(2)
                outline.write(pdf[3], counter, 2, sign='*')
                if CONFIG['doc']:
                    pdf_list.append(Document(counter, pdf[3], pdf))
            counter.reset()

            # 修复:name 的捕获组原为 ([\s\S]]+?)(多了一个 ]),与本函数其他
            # 正则不一致,导致富文本几乎永远匹配不到
            rich_text = re.findall(r'contentId=(\d+).+contentType=(4).+id=(\d+).+jsonContent=(.+?);.+lessonId=' +
                                   lesson[0] + r'.+name="([\s\S]+?)"', res)
            for text in rich_text:
                counter.add(2)
                outline.write(text[4], counter, 2, sign='+')
                if CONFIG['text']:
                    rich_text_list.append(RichText(counter, text[4], text))
                if CONFIG['file']:
                    # jsonContent 非空时说明富文本附带了附件
                    if text[3] != 'null' and text[3] != '""':
                        params = {'nosKey': re.search('nosKey":"(.+?)"', text[3]).group(1),
                                  'fileName': re.search('"fileName":"(.+?)"', text[3]).group(1)}
                        file_name = Resource.file_to_save(params['fileName'])
                        outline.write(file_name, counter, 2, sign='!')

                        WORK_DIR.change('Files')
                        file_name = '%s %s' % (counter, file_name)
                        if WORK_DIR.need_download(file_name, CONFIG["overwrite"]):
                            CANDY.download_bin('https://www.icourse163.org/course/attachment.htm',
                                               WORK_DIR.file(file_name), params=params)
            counter.reset()

    if video_list:
        rename = WORK_DIR.file('Names.txt') if CONFIG['rename'] else False
        WORK_DIR.change('Videos')
        playlist = get_playlist(CONFIG["playlist_type"], CONFIG["playlist_path_type"])
        if playlist is not None:
            parse_res_list(video_list, rename, parse_resource, playlist.write)
        else:
            parse_res_list(video_list, rename, parse_resource)
    if pdf_list:
        WORK_DIR.change('PDFs')
        parse_res_list(pdf_list, None, parse_resource)
    if rich_text_list:
        WORK_DIR.change('Texts')
        parse_res_list(rich_text_list, None, parse_resource)


def start(url, config, cookies):
    """调用接口函数"""

    global WORK_DIR
    CANDY.set_cookies(cookies)
    CONFIG.update(config)

    # NTESSTUDYSI 即 csrf token,存在时走新的取流接口
    if cookies.get('NTESSTUDYSI'):
        CONFIG['hasToken'] = True
        CONFIG['token'] = cookies.get('NTESSTUDYSI')
    else:
        CONFIG['hasToken'] = False

    term_id, dir_name = get_summary(url)
    WORK_DIR = WorkingDir(CONFIG['dir'], dir_name)
    WORK_DIR.change('Videos')
    FILES['renamer'] = Renamer(WORK_DIR.file('Rename.{ext}'))
    FILES['video'] = ClassicFile(WORK_DIR.file('Videos.txt'))

    get_resource(term_id)

    exports.update({
        "workdir": WORK_DIR,
        "spider": CANDY,
        "videos": VIDEOS
    })
def get_content(url):
    """获得课程信息"""

    page = CANDY.get(url).text
    soup = BeautifulSoup(page, 'lxml')
    # 课件列表以 JS 数组形式内嵌在倒数第二个 script 标签里
    script = soup.find_all('script')[-2].string
    source_match = re.search(r'_sourceArrStr = (.*);', script)
    school = soup.find(class_='teacher-infor-from').string
    title = soup.find(class_='coursetitle pull-left').a.string
    dir_name = course_dir(title, school)
    res_info = json.loads(source_match.group(1))
    print(dir_name)
    return dir_name, res_info


def parse_res(js):
    """获得视频名称和地址"""
    outline = Outline()
    # 序号位数与总数对齐,便于文件名排序
    width = len(str(len(js)))
    video_list = []
    for index, lesson in enumerate(js, 1):
        number = str(index).zfill(width)
        title = lesson['title']
        outline.write_string('%s {%s}#' % (title, number))
        video_list.append(Video(number, title, lesson['fullLinkUrl']))

    return video_list


def parse_video(video):
    """将视频信息添加到相关列表中"""

    target = video.file_name + ".mp4"
    if not WORK_DIR.need_download(target, CONFIG["overwrite"]):
        return
    FILES['videos'].write_string(video.meta)
    FILES['renamer'].write(video.meta.split('/')[-1], video.file_name)
    VIDEOS.append((video.meta, target))


def start(url, config, cookies=None):
    """调用接口函数"""

    global WORK_DIR
    CONFIG.update(config)

    dir_name, res_info = get_content(url)
    WORK_DIR = WorkingDir(CONFIG['dir'], dir_name)

    WORK_DIR.change('Videos')
    FILES['renamer'] = Renamer(WORK_DIR.file('Rename.{ext}'))
    FILES['videos'] = ClassicFile(WORK_DIR.file('Videos.txt'))
    playlist = get_playlist(CONFIG["playlist_type"], CONFIG["playlist_path_type"])
    if playlist:
        FILES['playlist'] = playlist

    video_list = parse_res(res_info)

    rename = WORK_DIR.file('Names.txt') if CONFIG['rename'] else False

    if playlist:
        parse_res_list(video_list, rename,
                       FILES['playlist'].write, parse_video)
    else:
        parse_res_list(video_list, rename, parse_video)

    exports.update({
        "workdir": WORK_DIR,
        "spider": CANDY,
        "videos": VIDEOS,
    })
def get_summary(url):
    """从课程主页面获取信息

    支持 shareDetails?cId= 与 sCourse/course_N.html 两种地址,
    返回 (course_id, 目录名)。
    """
    share_match = re.match(
        r'https?://www.icourses.cn/web/sword/portal/shareDetails\?cId=(\d+)', url)
    if share_match:
        course_id = share_match.group(1)
        url = 'http://www.icourses.cn/sCourse/course_{}.html'.format(course_id)
    else:
        course_id = re.match(
            r'https?://www.icourses.cn/sCourse/course_(\d+).html', url).group(1)
    res = CANDY.get(url)
    res.encoding = 'utf8'
    soup = BeautifulSoup(res.text, 'lxml')
    name = soup.find('div', class_='course-introduction-infor').find('div',
                                                                     class_='course-title').p.string

    dir_name = course_dir(name, '爱课程资源共享课')

    print(dir_name)

    return course_id, dir_name


def parse_resource(resource):
    """解析资源地址和下载资源"""

    file_name = resource.file_name
    if resource.type == 'Video':
        # fullResUrl 为标清,fullResUrl2(如有)为高清
        video_urls = {}
        video_urls['sd'] = resource.meta['fullResUrl']
        if resource.meta.get('fullResUrl2'):
            video_urls['hd'] = resource.meta['fullResUrl2']

        # 从用户要求的清晰度开始逐级降级,'sd' 必然存在故必有结果
        resolutions = ['shd', 'hd', 'sd']
        for sp in resolutions[CONFIG['resolution']:]:
            if video_urls.get(sp):
                url = video_urls[sp]
                break

        if WORK_DIR.need_download(file_name + ".mp4", CONFIG["overwrite"]):
            FILES['renamer'].write(
                re.search(r'(\w+\.mp4)', url).group(1), file_name)
            FILES['video'].write_string(url)
            VIDEOS.append((url, file_name + ".mp4"))

        if not CONFIG['sub']:
            return
        # 暂未发现字幕

    elif resource.type == 'Document':
        pdf_url = resource.meta['fullResUrl']
        if WORK_DIR.need_download(file_name + ".pdf", CONFIG["overwrite"]):
            CANDY.download_bin(pdf_url, WORK_DIR.file(file_name + '.pdf'))


def get_resource(course_id):
    """获取各种资源

    遍历章节列表,下载章前导读(重点难点/教学设计/评价考核/教材内容),
    并收集各节的视频与 PDF 资源。
    """

    outline = Outline()
    counter = Counter()

    video_list = []
    pdf_list = []

    res = CANDY.get(
        'http://www.icourses.cn/web/sword/portal/shareChapter?cid={}'.format(course_id))
    soup = BeautifulSoup(res.text, 'lxml')
    chapters = soup.find('ul', id='chapters').children
    for chapter in chapters:
        if chapter.name is None:
            continue
        counter.add(0)
        chapter_id = chapter.attrs['data-id']
        chapter_name = chapter.find(
            'a', class_='chapter-title-text').string.replace('\n\t\t\t\t\t\t\t', ' ')
        outline.write(chapter_name, counter, 0)

        # 章前导读:任一项缺失即整体跳过(尽力而为,不中断下载)
        try:
            important = chapter.find(
                'a', attrs={'title': '重点难点'}).attrs['data-url']
            instructional_design = chapter.find(
                'a', attrs={'title': '教学设计'}).attrs['data-url']
            exam_id = chapter.find(
                'a', attrs={'title': '评价考核'}).attrs['data-id']
            exam_contents = CANDY.post(
                'http://www.icourses.cn/web//sword/common/getTextBody', data={'id': exam_id}).text
            textbook_id = chapter.find(
                'a', attrs={'title': '教材内容'}).attrs['data-id']
            textbook_contents = CANDY.post(
                'http://www.icourses.cn/web//sword/common/getTextBody', data={'id': textbook_id}).text
            WORK_DIR.change('Introduction')
            outline.write('重点难点', counter, 2, sign='*')
            # 修复:原来写作 WORK_DIR.file('%s 重点难点.html') % counter,
            # 把 % 作用在返回的路径上;应先格式化文件名再求路径
            CANDY.download_bin(important, WORK_DIR.file(
                '%s 重点难点.html' % counter))
            outline.write('教学设计', counter, 2, sign='*')
            CANDY.download_bin(instructional_design,
                               WORK_DIR.file('%s 教学设计.html' % counter))
            outline.write('评价考核', counter, 2, sign='+')
            with open(WORK_DIR.file('%s 评价考核.html' % counter), 'w', encoding='utf_8') as file:
                file.write(exam_contents)
            outline.write('教材内容', counter, 2, sign='+')
            with open(WORK_DIR.file('%s 教材内容.html' % counter), 'w', encoding='utf_8') as file:
                file.write(textbook_contents)
        except Exception:
            # 原为裸 except;缩小到 Exception,避免吞掉 KeyboardInterrupt
            pass

        lessons = chapter.find('ul', class_='chapter-body-l').contents
        for lesson in lessons:
            if len(lessons) == 1:
                # 只有一节时沿用章的编号与名称
                counter.add(1)
                lesson_id = chapter_id
                lesson_name = chapter_name
            else:
                if lesson.name is None:
                    continue
                counter.add(1)
                lesson_info = lesson.find(
                    'a', class_='chapter-body-content-text')
                lesson_id = lesson_info.attrs['data-secid']
                lesson_name = lesson_info.text.replace('\n', '')
            rej = CANDY.post(
                'http://www.icourses.cn/web//sword/portal/getRess', data={'sectionId': lesson_id}).json()

            outline.write(lesson_name, counter, 1)

            for resource in rej['model']['listRes']:
                if resource['mediaType'] == 'mp4':
                    counter.add(2)
                    outline.write(resource['title'], counter, 2, sign='#')
                    video_list.append(
                        Video(counter, resource['title'], resource))
            counter.reset()

            for resource in rej['model']['listRes']:
                if resource['mediaType'] in ['pdf', 'ppt']:
                    counter.add(2)
                    outline.write(resource['title'], counter, 2, sign='*')
                    if CONFIG['doc']:
                        pdf_list.append(
                            Document(counter, resource['title'], resource))
            counter.reset()

    if video_list:
        rename = WORK_DIR.file('Names.txt') if CONFIG['rename'] else False
        WORK_DIR.change('Videos')
        playlist = get_playlist(CONFIG["playlist_type"], CONFIG["playlist_path_type"])
        if playlist:
            parse_res_list(video_list, rename, playlist.write, parse_resource)
        else:
            parse_res_list(video_list, rename, parse_resource)
    if pdf_list:
        WORK_DIR.change('PDFs')
        parse_res_list(pdf_list, None, parse_resource)


def start(url, config, cookies=None):
    """调用接口函数"""

    # 初始化设置
    global WORK_DIR
    CONFIG.update(config)

    # 课程信息
    course_info = get_summary(url)

    # 创建课程目录
    WORK_DIR = WorkingDir(CONFIG['dir'], course_info[1])

    WORK_DIR.change('Videos')
    FILES['renamer'] = Renamer(WORK_DIR.file('Rename.{ext}'))
    FILES['video'] = ClassicFile(WORK_DIR.file('Videos.txt'))

    # 获得资源
    get_resource(course_info[0])

    exports.update({
        "workdir": WORK_DIR,
        "spider": CANDY,
        "videos": VIDEOS
    })
def get_summary(url):
    """从课程主页面获取信息

    返回 (course_id, 目录名),并把学习页 soup 与章节名列表缓存进 CONFIG
    供 get_resource 复用。
    """

    # 修复:命名组 <course_id> 在一次文本处理中被剥离成非法正则,
    # 按下方 group('course_id') 的用法恢复
    course_id = re.search(r'kcid=(?P<course_id>\d+)', url).group('course_id')
    data = {
        'kcid': course_id,
        'kcdm': course_id,
    }
    res = CANDY.post(CONFIG['study_page'], data=data)
    study_soup = BeautifulSoup(res.text, 'html.parser')
    name = study_soup.find(
        'dl', class_='content-a-title').find('dt').find('span').string

    home_text = CANDY.get(url).text
    home_soup = BeautifulSoup(home_text, 'html.parser')
    # 章节名在主页上是倒序排列的,用 insert(0, ...) 反转
    chapter_names = []
    if home_soup.find('div', class_='vice-main-kcap'):
        for chapter_lable in home_soup.find('div', class_='vice-main-kcap')\
                                      .find('ul')\
                                      .children:
            try:
                chapter_names.insert(
                    0, chapter_lable.find('div').find('span').string)
            except Exception:
                # 原为裸 except;缩小到 Exception,跳过无标题的占位节点
                pass
    else:
        for chapter_lable in home_soup.find('div', id='accordion')\
                                      .find_all('h3'):
            chapter_names.insert(0, chapter_lable.text)

    dir_name = course_dir(name, '北京高校优质课程研究会')

    print(dir_name)

    CONFIG['course_id'] = course_id
    CONFIG['study_soup'] = study_soup
    CONFIG['chapter_names'] = chapter_names
    return course_id, dir_name


def parse_resource(resource):
    """解析资源地址和下载资源"""

    file_name = resource.file_name
    if resource.type == 'Video':
        ext = '.mp4'
        if WORK_DIR.need_download(file_name + ext, CONFIG["overwrite"]):
            resource.ext = ext
            FILES['renamer'].write(
                re.search(r'(\w+\.mp4)', resource.meta).group(1), file_name, ext)
            FILES['video'].write_string(resource.meta)
            VIDEOS.append((resource.meta, file_name + ext))

    elif resource.type == 'Document':
        if not WORK_DIR.need_download(file_name + ".pdf", CONFIG["overwrite"]):
            return
        CANDY.download_bin(resource.meta, WORK_DIR.file(file_name + '.pdf'))

    elif resource.type == 'Rich':
        if not WORK_DIR.need_download(file_name + ".html", CONFIG["overwrite"]):
            return
        with open(WORK_DIR.file(file_name + '.html'), 'w', encoding='utf_8') as file:
            file.write(resource.meta)


def get_resource(course_id):
    """获取各种资源

    遍历学习页大纲,逐课抓取视频地址、PDF 链接与测验富文本。
    """

    outline = Outline()
    counter = Counter()

    video_list = []
    pdf_list = []
    test_list = []

    study_soup = CONFIG['study_soup']
    chapter_names = CONFIG['chapter_names']
    study_div = study_soup.find('div', class_='ation-a-main')
    left_div = study_div.find('div', class_='xx-main-left')
    info_div = left_div.find('div', class_='xx-left-main')
    chapters = info_div.find_all('dl')
    for chapter in chapters:
        counter.add(0)
        # 章节名从主页缓存中倒序弹出,与大纲顺序对应
        chapter_name = chapter_names.pop()
        outline.write(chapter_name, counter, 0)

        lessons = chapter.find_all('dd')
        for lesson in lessons:
            counter.add(1)
            lesson_info = lesson.find('a')
            # 修复:命名组 <lesson_id> 被剥离成非法正则,按 group('lesson_id') 恢复
            lesson_id = re.search(r"xsxx\('(?P<lesson_id>.+)'\)",
                                  lesson_info.attrs.get('onclick')).group('lesson_id')

            data = {
                'kcdm': course_id,
                'zjdm': lesson_id,
            }
            res = CANDY.post(CONFIG['study_page'], data=data)
            soup = BeautifulSoup(res.text, 'html.parser')
            study_div = soup.find('div', class_='ation-a-main')
            right_div = study_div.find('div', class_='xx-main-right')
            study_box = right_div.find('div', class_='xx-main-box')
            lesson_name = study_box.find('h4').contents[1]
            outline.write(lesson_name, counter, 1)
            resource_div = study_box.find('div', class_='study-L-text')

            # 视频地址藏在隐藏 input#sp 里
            video_div = resource_div.find('div', id='videoBj_1')
            if video_div:
                video_url = video_div.find('input', id='sp').attrs.get('value')
                video_name = 'Video:{}'.format(lesson_name)
                outline.write(video_name, counter, 2, sign='#')
                video_list.append(Video(counter, video_name, video_url))

            # PDF 地址在 iframe 的 src 参数 cclj 里
            pdf_iframe = resource_div.find(
                'iframe', attrs={'name': 'pdfContainer'})
            if pdf_iframe:
                pdf_div = pdf_iframe.parent
                pdf_name = pdf_div.find('span').string.replace('.pdf', '')
                # 修复:命名组 <pdf_url> 被剥离成非法正则,按 group('pdf_url') 恢复
                pdf_url = re.search(
                    r'cclj=(?P<pdf_url>http.+\.pdf)', pdf_iframe.attrs.get('src')).group('pdf_url')
                outline.write(pdf_name, counter, 2, sign='*')
                if CONFIG['doc']:
                    pdf_list.append(Document(counter, pdf_name, pdf_url))

            # 测验以富文本 HTML 原样保存
            test_div = study_box.find('div', class_='zy-a-list')
            if test_div:
                test_name = 'Test:{}'.format(lesson_name)
                outline.write(test_name, counter, 2, sign='+')
                if CONFIG['text']:
                    test_list.append(
                        RichText(counter, test_name, str(test_div)))

    if video_list:
        rename = WORK_DIR.file('Names.txt') if CONFIG['rename'] else False
        WORK_DIR.change('Videos')
        playlist = get_playlist(CONFIG["playlist_type"], CONFIG["playlist_path_type"])
        if playlist:
            parse_res_list(video_list, rename, playlist.write, parse_resource)
        else:
            parse_res_list(video_list, rename, parse_resource)
    if pdf_list:
        WORK_DIR.change('PDFs')
        parse_res_list(pdf_list, None, parse_resource)
    if test_list:
        WORK_DIR.change('Texts')
        parse_res_list(test_list, None, parse_resource)


def start(url, config, cookies=None):
    """调用接口函数"""

    # 初始化设置
    global WORK_DIR
    CANDY.set_cookies(cookies)
    CONFIG.update(config)
    CONFIG['study_page'] = 'http://www.livedu.com.cn/ispace4.0/moocxsxx/queryAllZjByKcdm.do'

    # 课程信息
    course_info = get_summary(url)

    # 创建课程目录
    WORK_DIR = WorkingDir(CONFIG['dir'], course_info[1])

    WORK_DIR.change('Videos')
    FILES['renamer'] = Renamer(WORK_DIR.file('Rename.{ext}'))
    FILES['video'] = ClassicFile(WORK_DIR.file('Videos.txt'))

    # 获得资源
    get_resource(course_info[0])

    exports.update({
        "workdir": WORK_DIR,
        "spider": CANDY,
        "videos": VIDEOS
    })
dir_name


def parse_resource(resource):
    """Resolve the real media URL of one video, queue its download, fetch subtitles.

    The lesson URL carries a video id; its last two characters select a shard
    of live.ws.126.net, from which an XML document with AES-encrypted
    candidate URLs (per resolution and format) is fetched.
    """

    def open_decrypt(hex_string, t):
        """Decrypt an encrypted hex string into the real URL using site key #t."""
        CRYKey = {1: b"4fxGZqoGmesXqg2o", 2: b"3fxVNqoPmesAqg2o"}
        aes = AES.new(CRYKey[t], AES.MODE_ECB)
        return str(aes.decrypt(bytes.fromhex(hex_string)), encoding='gbk', errors="ignore").replace('\x08', '').replace('\x06', '')

    def update_hex_urls(node, hex_urls):
        """Collect encrypted URL strings from *node* into hex_urls[resolution][format]."""
        for child in node.children:
            sp = child.name
            if not hex_urls.get(sp):
                hex_urls[sp] = {}
            for hex_url_tag in child.children:
                hex_urls[sp][hex_url_tag.name] = hex_url_tag.string

    link = resource.meta
    file_name = resource.file_name
    video_info = link.replace('.html', '').split('/')[-1]
    xml_url = 'http://live.ws.126.net/movie/' + \
        video_info[-2] + '/' + video_info[-1] + '/2_' + video_info + '.xml'
    res = CANDY.get(xml_url)
    res.encoding = 'gbk'

    # Parse the metadata XML.
    soup = BeautifulSoup(res.text, 'lxml')
    name = soup.find('title').string
    encrypt = int(soup.find('encrypt').string)
    hex_urls = {}
    update_hex_urls(soup.find('flvurl'), hex_urls)
    update_hex_urls(soup.find('flvurlorigin'), hex_urls)
    update_hex_urls(soup.find('playurl'), hex_urls)
    update_hex_urls(soup.find('playurl_origin'), hex_urls)
    subs = {}
    for sub in soup.find('subs'):
        subs[sub.find('name').string] = sub.find('url').string

    formats = ['mp4', 'flv']
    resolutions = ['shd', 'hd', 'sd']
    # Preferred resolution first, lower ones next, skipped higher ones last.
    resolutions = resolutions[CONFIG['resolution']:] + \
        list(reversed(resolutions[:CONFIG['resolution']]))
    modes = ((sp, ext) for sp in resolutions for ext in formats)
    for sp, ext in modes:
        if hex_urls.get(sp):
            if hex_urls[sp].get(ext):
                hex_url = hex_urls[sp][ext]
                video_url = open_decrypt(hex_url, encrypt)
                # Correct the extension: some 'mp4' entries decrypt to flv URLs.
                ext = video_url.split('.')[-1]
                if ext in formats:
                    ext = '.' + ext
                    resource.ext = ext
                    break

    # NOTE(review): if no candidate matched above, `ext` and `video_url` are
    # unbound and the next lines raise NameError — presumably the XML always
    # contains at least one playable entry; confirm.
    if WORK_DIR.need_download(file_name+ext, CONFIG["overwrite"]):
        FILES['renamer'].write(re.search(r'(\w+\%s)' %
                                         ext, video_url).group(1), file_name, ext)
        FILES['video'].write_string(video_url)
        VIDEOS.append((video_url, file_name+ext))

    if not CONFIG['sub']:
        return
    for subtitle_lang, subtitle_url in subs.items():
        if len(subs) == 1:
            sub_name = file_name + '.srt'
        else:
            sub_name = file_name + '_' + subtitle_lang + '.srt'
        if WORK_DIR.need_download(sub_name, CONFIG["overwrite"]):
            CANDY.download_bin(subtitle_url, WORK_DIR.file(sub_name))


def get_resource(links):
    """Build the video list from the pre-parsed links and dispatch downloads."""

    outline = Outline()
    counter = Counter(1)

    video_list = []

    for link, name in links:
        counter.add(0)
        outline.write(name, counter, 0, sign='#')
        video_list.append(Video(counter, name, link))

    if video_list:
        rename = WORK_DIR.file('Names.txt') if CONFIG['rename'] else False
        WORK_DIR.change('Videos')
        playlist = get_playlist(CONFIG["playlist_type"], CONFIG["playlist_path_type"])
        if playlist:
            # parse_resource runs BEFORE playlist.write here (unlike the other
            # mooc modules) because it corrects resource.ext, which the
            # playlist entry path depends on.
            parse_res_list(video_list, rename, parse_resource, playlist.write)
        else:
            parse_res_list(video_list, rename, parse_resource)


def start(url, config, cookies=None):
    """Public entry point: configure, create directories and download."""

    # Initialise settings (no cookies needed for this site).
    global WORK_DIR
    CONFIG.update(config)

    # Course information
    course_info = get_summary(url)

    # Create the course directory
    WORK_DIR = WorkingDir(CONFIG['dir'], course_info[1])

    WORK_DIR.change('Videos')
    FILES['renamer'] = Renamer(WORK_DIR.file('Rename.{ext}'))
    FILES['video'] = ClassicFile(WORK_DIR.file('Videos.txt'))

    # Fetch the resources
    get_resource(course_info[0])

    exports.update({
        "workdir": WORK_DIR,
        "spider": CANDY,
        "videos": VIDEOS
    })
--------------------------------------------------------------------------------
/moocs/study_163.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""NetEase Cloud Classroom (网易云课堂, study.163.com)"""

import time
from urllib import parse

import requests

from moocs.utils import *
from utils.crawler import Crawler

name = "study_163"
need_cookies = False
CANDY = Crawler()
CONFIG = {}
FILES = {}
VIDEOS = []
exports = {}
__all__ = ["name", "need_cookies", "start", "exports"]


def get_summary(url):
    """Extract the course id from the URL/page and build the directory name."""

    res = requests.get(url).text

    if re.search(r'courseId=(\d+)', url):
        course_id = re.search(r'courseId=(\d+)', url).group(1)
    else:
        course_id = re.search(r'introduction/(\d+)\.htm', url).group(1)
    name = re.search(r'(.+) - 网易云课堂', res).group(1)

    dir_name = course_dir(name, '网易云课堂')

    print(dir_name)

    CONFIG['course_id'] = course_id
    return course_id, dir_name


def parse_resource(resource):
    """Resolve a resource's download URL via the site's DWR API and download it."""

    file_name = resource.file_name
    if resource.type == 'Video':
        post_data = {'callCount': '1', 'scriptSessionId': '${scriptSessionId}190',
                     'httpSessionId': 'b1a6d411df364e51833ac11570fc3f07', 'c0-scriptName': 'LessonLearnBean',
                     'c0-methodName': 'getVideoLearnInfo', 'c0-id': '0', 'c0-param0': 'string:' + resource.meta[1],
                     'c0-param1': 'string:' + CONFIG['course_id'],
                     'batchId': str(int(time.time() * 1000))}
        res = CANDY.post('https://study.163.com/dwr/call/plaincall/LessonLearnBean.getVideoLearnInfo.dwr',
                         data=post_data).text.encode('utf_8').decode('unicode_escape')
        video_info = re.search(
            r'signature="(\w+)";.+videoId=(\d+);[\s\S]+name:"(.+?)",', res).group(1, 2, 3)
        data = CANDY.post('https://vod.study.163.com/eds/api/v1/vod/video', data={
            'videoId': video_info[1],
            'signature': video_info[0],
            'clientType': '1'
        }).json()

        # Try resolutions from the preferred one downwards; 3 = highest.
        resolutions = [3, 2, 1]
        for sp in resolutions[CONFIG['resolution']:]:
            # TODO: add video-format selection
            for video in data['result']['videos']:
                if video['quality'] == sp and video['format'] == 'mp4':
                    url = video['videoUrl']
                    ext = '.mp4'
                    break
            else:
                continue
            break
        if WORK_DIR.need_download(file_name + ext, CONFIG["overwrite"]):
            FILES['renamer'].write(
                re.search(r'(\w+\.mp4)', url).group(1), file_name, ext)
            FILES['video'].write_string(url)
            VIDEOS.append((url, file_name+ext))
        resource.ext = ext

        if not CONFIG['sub']:
            return
        # No subtitles found so far; the API probably exposes them at
        # data['result']['srtCaptions'].

    elif resource.type == 'Document':
        if not WORK_DIR.need_download(file_name+".pdf", CONFIG["overwrite"]):
            return
        post_data = {
            'callCount': '1',
            'scriptSessionId': '${scriptSessionId}190',
            'httpSessionId': 'c4927103a1c042ee95faed758d0db8f8',
            'c0-scriptName': 'LessonLearnBean',
            'c0-methodName': 'getTextLearnInfo',
            'c0-id': '0',
            'c0-param0': 'string:' + resource.meta[1],
            'c0-param1': 'string:' + CONFIG['course_id'],
            'batchId': str(int(time.time() * 1000)),
        }
        res = CANDY.post('https://study.163.com/dwr/call/plaincall/LessonLearnBean.getTextLearnInfo.dwr',
                         data=post_data).text.encode('utf_8').decode('unicode_escape')
        pdf_url = re.search(r'pdfUrl:"(http://.+?)",', res).group(1)
        CANDY.download_bin(pdf_url, WORK_DIR.file(file_name + '.pdf'))
    else:
        # Attachment / generic resource: meta = (id, name, suffix, url).
        if not WORK_DIR.need_download(file_name+resource.meta[2], CONFIG["overwrite"]):
            return
        CANDY.download_bin(resource.meta[3], WORK_DIR.file(
            file_name + resource.meta[2]))


def get_resource(course_id):
    """Collect every resource (videos, PDFs, reference files) of the course."""

    outline = Outline()
    counter = Counter()

    video_list = []
    pdf_list = []
    file_list = []

    post_data = {
        'callCount': '1',
        'scriptSessionId': '${scriptSessionId}190',
        'httpSessionId': '89a04ce41c7d42759b0a62efe392e153',
        'c0-scriptName': 'PlanNewBean',
        'c0-methodName': 'getPlanCourseDetail',
        'c0-id': '0',
        'c0-param0': 'string:' + course_id,
        'c0-param1': 'number:0',
        'c0-param2': 'null:null',
        'batchId': str(int(time.time() * 1000)),
    }
    res = CANDY.post('https://study.163.com/dwr/call/plaincall/PlanNewBean.getPlanCourseDetail.dwr',
                     data=post_data).text.encode('utf_8').decode('unicode_escape')

    chapters = re.findall(r'courseId=\d+;.+id=(\d+);.+name="(.+)";', res)
    for chapter in chapters:
        counter.add(0)
        outline.write(chapter[1], counter, 0)

        # lesson = (hasReferences, id, lessonName, type)
        lessons = re.findall(
            r'chapterId=%s;.+?hasReferences=(\w+);.+?id=(\d+).+?lessonName="(.*?)";.+?type=(\d+);' % chapter[0], res, re.DOTALL)
        for lesson in lessons:
            counter.add(1)
            outline.write(lesson[2], counter, 1)

            # Video
            if lesson[3] == '2' or lesson[3] == '50':
                counter.add(2)
                outline.write(lesson[2], counter, 2, sign='#')
                video_list.append(Video(counter, lesson[2], lesson))
                counter.reset()

            # Pdf
            elif lesson[3] == '3':
                counter.add(2)
                outline.write(lesson[2], counter, 2, sign='*')
                pdf_list.append(Document(counter, lesson[2], lesson))

            # References
            files = []
            # NOTE(review): eval() on text scraped from the network ('true' ->
            # eval('True')) is a code-injection hazard; this should simply be
            # `lesson[0].lower() == 'true'`.
            if eval(lesson[0][0].upper() + lesson[0][1:]):
                post_data = {'callCount': '1', 'scriptSessionId': '${scriptSessionId}190',
                             'httpSessionId': 'b1a6d411df364e51833ac11570fc3f07', 'c0-scriptName': 'LessonReferenceBean',
                             'c0-methodName': 'getLessonReferenceVoByLessonId', 'c0-id': '0', 'c0-param0': 'number:' + lesson[1],
                             'batchId': str(int(time.time() * 1000))}
                ref_info = CANDY.post('https://study.163.com/dwr/call/plaincall/LessonReferenceBean.getLessonReferenceVoByLessonId.dwr',
                                      data=post_data).text.encode('utf_8').decode('unicode_escape')
                refs = re.findall(
                    r'id=(\d+);.+name="(.+)";.+suffix="(\.\w+)";.+url="(.+?)";', ref_info)

                for ref in refs:
                    ref = (ref[0], parse.unquote(ref[1]), ref[2], ref[3])
                    files.append(ref)

            for file in files:
                counter.add(2)
                outline.write(file[1], counter, 2, sign='!')
                if CONFIG['file']:
                    file_list.append(Resource(counter, file[1], file))
                counter.reset()

    if video_list:
        rename = WORK_DIR.file('Names.txt') if CONFIG['rename'] else False
        WORK_DIR.change('Videos')
        playlist = get_playlist(CONFIG["playlist_type"], CONFIG["playlist_path_type"])
        if playlist:
            parse_res_list(video_list, rename, playlist.write, parse_resource)
        else:
            parse_res_list(video_list, rename, parse_resource)
    if pdf_list:
        WORK_DIR.change('PDFs')
        parse_res_list(pdf_list, None, parse_resource)
    if file_list:
        WORK_DIR.change('Files')
        parse_res_list(file_list, None, parse_resource)


def start(url, config, cookies=None):
    """Public entry point: configure, create directories and download."""

    # Initialise settings
    global WORK_DIR
    CONFIG.update(config)

    # Course information
    course_info = get_summary(url)

    # Create the course directory
    WORK_DIR = WorkingDir(CONFIG['dir'], course_info[1])

    WORK_DIR.change('Videos')
    FILES['renamer'] = Renamer(WORK_DIR.file('Rename.{ext}'))
    FILES['video'] = ClassicFile(WORK_DIR.file('Videos.txt'))

    # Fetch the resources
    get_resource(course_info[0])

    exports.update({
        "workdir": WORK_DIR,
        "spider": CANDY,
        "videos": VIDEOS
    })
--------------------------------------------------------------------------------
/moocs/study_mooc.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""网易云课堂 MOOC"""

import time

from moocs.utils import *
from utils.crawler
import Crawler 8 | 9 | name = "study_mooc" 10 | need_cookies = True 11 | CANDY = Crawler() 12 | CONFIG = {} 13 | FILES = {} 14 | VIDEOS = [] 15 | exports = {} 16 | __all__ = ["name", "need_cookies", "start", "exports"] 17 | 18 | 19 | def get_summary(url): 20 | """从课程主页面获取信息""" 21 | 22 | url = url.replace('learn/', 'course/') 23 | res = CANDY.get(url).text 24 | 25 | term_id = re.search(r'termId : "(\d+)"', res).group(1) 26 | names = re.findall(r'name:"(.+)"', res) 27 | 28 | dir_name = course_dir(names[0], names[1]) 29 | 30 | print(dir_name) 31 | 32 | CONFIG['term_id'] = term_id 33 | return term_id, dir_name 34 | 35 | 36 | def get_announce(term_id): 37 | """ 获取课程的公告 """ 38 | 39 | post_data = {'callCount': '1', 'scriptSessionId': '${scriptSessionId}190', 40 | 'httpSessionId': 'dba4977be78d42a78a6e2c2dd2b9bb42', 'c0-scriptName': 'CourseBean', 41 | 'c0-methodName': 'getAllAnnouncementByTerm', 'c0-id': '0', 'c0-param0': 'number:' + term_id, 42 | 'c0-param1': 'number:1', 'batchId': str(int(time.time() * 1000))} 43 | res = CANDY.post('https://mooc.study.163.com/dwr/call/plaincall/CourseBean.getAllAnnouncementByTerm.dwr', 44 | data=post_data).text 45 | announcements = re.findall( 46 | r'content="(.*?[^\\])".*title="(.*?[^\\])"', res) 47 | 48 | with open('Announcements.html', 'w', encoding='utf-8') as announce_file: 49 | for announcement in announcements: 50 | # 公告内容 51 | announce_content = announcement[0].encode( 52 | 'utf-8').decode('unicode_escape') 53 | 54 | # 公告标题 55 | announce_title = announcement[1].encode( 56 | 'utf-8').decode('unicode_escape') 57 | announce_file.write('

' + announce_title + 58 | '

\n' + announce_content + '\n') 59 | 60 | 61 | def parse_resource(resource): 62 | """解析资源地址和下载资源""" 63 | 64 | post_data = {'callCount': '1', 'scriptSessionId': '${scriptSessionId}190', 65 | 'httpSessionId': 'b8efd4c73fd1434896507b83de631f0f', 'c0-scriptName': 'CourseBean', 66 | 'c0-methodName': 'getLessonUnitLearnVo', 'c0-id': '0', 'c0-param0': 'number:' + CONFIG['term_id'], 67 | 'c0-param1': 'number:' + resource.meta[0], 'c0-param2': 'number:' + resource.meta[1], 68 | 'c0-param3': 'number:0', 'c0-param4': 'number:' + resource.meta[2], 69 | 'batchId': str(int(time.time() * 1000))} 70 | res = CANDY.post('https://mooc.study.163.com/dwr/call/plaincall/CourseBean.getLessonUnitLearnVo.dwr', 71 | data=post_data).text 72 | 73 | file_name = resource.file_name 74 | if resource.type == 'Video': 75 | signature = re.search(r'signature="(.+?)"', res).group(1) 76 | data = CANDY.post('https://vod.study.163.com/eds/api/v1/vod/video', data={ 77 | 'videoId': resource.meta[0], 78 | 'signature': signature, 79 | 'clientType': '1' 80 | }).json() 81 | 82 | resolutions = [3, 2, 1] 83 | for sp in resolutions[CONFIG['resolution']:]: 84 | # TODO: 增加视频格式选择 85 | for video in data['result']['videos']: 86 | if video['quality'] == sp and video['format'] == 'mp4': 87 | url = video['videoUrl'] 88 | ext = '.mp4' 89 | break 90 | else: 91 | continue 92 | break 93 | if WORK_DIR.need_download(file_name + ext, CONFIG["overwrite"]): 94 | FILES['renamer'].write( 95 | re.search(r'(\w+\.mp4)', url).group(1), file_name, ext) 96 | FILES['video'].write_string(url) 97 | VIDEOS.append((url, file_name+ext)) 98 | resource.ext = ext 99 | 100 | if not CONFIG['sub']: 101 | return 102 | subtitles = re.findall(r'name="(.+)";.*url="(.*?)"', res) 103 | WORK_DIR.change('Videos') 104 | for subtitle in subtitles: 105 | if len(subtitles) == 1: 106 | sub_name = file_name + '.srt' 107 | else: 108 | subtitle_lang = subtitle[0].encode( 109 | 'utf_8').decode('unicode_escape') 110 | sub_name = file_name + '_' + subtitle_lang + 
'.srt' 111 | if WORK_DIR.need_download(sub_name, CONFIG["overwrite"]): 112 | CANDY.download_bin(subtitle[1], WORK_DIR.file(sub_name)) 113 | 114 | elif resource.type == 'Document': 115 | if not WORK_DIR.need_download(file_name + '.pdf', CONFIG["overwrite"]): 116 | return 117 | pdf_url = re.search(r'textOrigUrl:"(.*?)"', res).group(1) 118 | CANDY.download_bin(pdf_url, WORK_DIR.file(file_name + '.pdf')) 119 | 120 | elif resource.type == 'Rich': 121 | if not WORK_DIR.need_download(file_name + '.html', CONFIG["overwrite"]): 122 | return 123 | text = re.search(r'htmlContent:"(.*)",id', 124 | res.encode('utf_8').decode('unicode_escape'), re.S).group(1) 125 | with open(WORK_DIR.file(file_name + '.html'), 'w', encoding='utf_8') as file: 126 | file.write(text) 127 | 128 | 129 | def get_resource(term_id): 130 | """获取各种资源""" 131 | 132 | outline = Outline() 133 | counter = Counter() 134 | 135 | video_list = [] 136 | pdf_list = [] 137 | rich_text_list = [] 138 | 139 | post_data = {'callCount': '1', 'scriptSessionId': '${scriptSessionId}190', 140 | 'httpSessionId': 'b8efd4c73fd1434896507b83de631f0f', 'c0-scriptName': 'CourseBean', 141 | 'c0-methodName': 'getLastLearnedMocTermDto', 'c0-id': '0', 'c0-param0': 'number:' + term_id, 142 | 'batchId': str(int(time.time() * 1000))} 143 | res = CANDY.post('https://mooc.study.163.com/dwr/call/plaincall/CourseBean.getLastLearnedMocTermDto.dwr', 144 | data=post_data).text.encode('utf_8').decode('unicode_escape') 145 | 146 | chapters = re.findall(r'homeworks=\w+;.+id=(\d+).+name="(.+)";', res) 147 | for chapter in chapters: 148 | counter.add(0) 149 | outline.write(chapter[1], counter, 0) 150 | 151 | lessons = re.findall( 152 | r'chapterId=' + chapter[0] + r'.+contentType=1.+id=(\d+).+name="(.+)".+test', res) 153 | for lesson in lessons: 154 | counter.add(1) 155 | outline.write(lesson[1], counter, 1) 156 | 157 | videos = re.findall(r'contentId=(\d+).+contentType=(1).+id=(\d+).+lessonId=' + 158 | lesson[0] + r'.+name="(.+)"', res) 159 | for 
video in videos: 160 | counter.add(2) 161 | outline.write(video[3], counter, 2, sign='#') 162 | video_list.append(Video(counter, video[3], video)) 163 | counter.reset() 164 | 165 | pdfs = re.findall(r'contentId=(\d+).+contentType=(3).+id=(\d+).+lessonId=' + 166 | lesson[0] + r'.+name="(.+)"', res) 167 | for pdf in pdfs: 168 | counter.add(2) 169 | outline.write(pdf[3], counter, 2, sign='*') 170 | if CONFIG['doc']: 171 | pdf_list.append(Document(counter, pdf[3], pdf)) 172 | counter.reset() 173 | 174 | rich_text = re.findall(r'contentId=(\d+).+contentType=(4).+id=(\d+).+jsonContent=(.+);.+lessonId=' + 175 | lesson[0] + r'.+name="(.+)"', res) 176 | for text in rich_text: 177 | counter.add(2) 178 | outline.write(text[4], counter, 2, sign='+') 179 | if CONFIG['text']: 180 | rich_text_list.append(RichText(counter, text[4], text)) 181 | if CONFIG['file']: 182 | if text[3] != 'null' and text[3] != '""': 183 | params = {'nosKey': re.search('nosKey":"(.+?)"', text[3]).group(1), 184 | 'fileName': re.search('"fileName":"(.+?)"', text[3]).group(1)} 185 | file_name = Resource.file_to_save(params['fileName']) 186 | outline.write(file_name, counter, 2, sign='!') 187 | 188 | WORK_DIR.change('Files') 189 | file_name = '%s %s' % (counter, file_name) 190 | if WORK_DIR.need_download(file_name, CONFIG["overwrite"]): 191 | CANDY.download_bin('https://www.icourse163.org/course/attachment.htm', 192 | WORK_DIR.file(file_name), params=params, cookies={'STUDY_SESS': None}) 193 | counter.reset() 194 | 195 | if video_list: 196 | rename = WORK_DIR.file('Names.txt') if CONFIG['rename'] else False 197 | WORK_DIR.change('Videos') 198 | playlist = get_playlist(CONFIG["playlist_type"], CONFIG["playlist_path_type"]) 199 | if playlist: 200 | parse_res_list(video_list, rename, playlist.write, parse_resource) 201 | else: 202 | parse_res_list(video_list, rename, parse_resource) 203 | if pdf_list: 204 | WORK_DIR.change('PDFs') 205 | parse_res_list(pdf_list, None, parse_resource) 206 | if rich_text_list: 207 
| WORK_DIR.change('Texts') 208 | parse_res_list(rich_text_list, None, parse_resource) 209 | 210 | 211 | def start(url, config, cookies=None): 212 | """调用接口函数""" 213 | 214 | # 初始化设置 215 | global WORK_DIR 216 | CANDY.set_cookies(cookies) 217 | CONFIG.update(config) 218 | 219 | # 课程信息 220 | course_info = get_summary(url) 221 | 222 | # 创建课程目录 223 | WORK_DIR = WorkingDir(CONFIG['dir'], course_info[1]) 224 | 225 | print(course_info[0]) 226 | # 课程公告 227 | get_announce(course_info[0]) 228 | 229 | WORK_DIR.change('Videos') 230 | FILES['renamer'] = Renamer(WORK_DIR.file('Rename.{ext}')) 231 | FILES['video'] = ClassicFile(WORK_DIR.file('Videos.txt')) 232 | 233 | # 获得资源 234 | get_resource(course_info[0]) 235 | 236 | exports.update({ 237 | "workdir": WORK_DIR, 238 | "spider": CANDY, 239 | "videos": VIDEOS 240 | }) 241 | -------------------------------------------------------------------------------- /moocs/utils.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """核心程序组件""" 3 | 4 | import json 5 | import os 6 | import platform 7 | import re 8 | import subprocess 9 | import sys 10 | import time 11 | 12 | from utils.aria2 import Aria2, Aria2File 13 | 14 | SYS = platform.system() 15 | 16 | 17 | class Resource(object): 18 | """所有资源类的基类 19 | 20 | 用来定义一个资源,但不同类型的资源可能要对部分功能进行重写。 21 | 22 | 属性 23 | 类 24 | regex_sort:匹配序号的正则表达式; 25 | regex_file:匹配 Windows 下文件名的非法字符; 26 | regex_spaces:匹配连续多个空白字符。 27 | type:资源的类型,默认是 'Resource'; 28 | 29 | id:资源的唯一标识,用于在程序中标识一个资源,如 '2.3.2'; 30 | name:资源的名称(可含有特殊字符),和最终的文件名有关; 31 | meta:资源的元信息,比如资源在每个网站的 ID 和文件名等等; 32 | feature:其他特征(基本用不到)。 33 | """ 34 | 35 | regex_sort = re.compile(r'^[第一二三四五六七八九十\d]+[\s\d._\-章课节讲]*[.\s、\-]\s*\d*') 36 | regex_file = re.compile(r'[\\/:*?"<>|]') 37 | regex_spaces = re.compile(r'\s+') 38 | type = 'Resource' 39 | 40 | def __init__(self, identify, name, meta, feature=None): 41 | """将 name 的序号消除,并依次为属性赋值""" 42 | self.id = str(identify) 43 | self.name = 
Resource.regex_spaces.sub( 44 | ' ', Resource.regex_sort.sub('', name)).strip() 45 | self.meta = meta 46 | self.feature = feature 47 | 48 | def __str__(self): 49 | """返回资源的名称""" 50 | 51 | return self.name 52 | 53 | @property 54 | def file_name(self): 55 | """动态生成文件名(包含前缀的 ID,不含扩展名),比如 '2.3.2 file_name'""" 56 | 57 | return self.id + ' ' + Resource.regex_file.sub('', self.name) 58 | 59 | def operation(self, *funcs): 60 | """传入一个或多个函数,使用函数对资源对象进行调用""" 61 | 62 | for func in funcs: 63 | func(self) 64 | 65 | @staticmethod 66 | def file_to_save(name): 67 | """通过一个名字生成文件名""" 68 | 69 | return Resource.regex_file.sub('', Resource.regex_spaces.sub(' ', Resource.regex_sort.sub('', name)).strip()) 70 | 71 | 72 | class Video(Resource): 73 | """视频资源类 74 | 75 | 属性 76 | type:默认值是 'Video'; 77 | """ 78 | 79 | type = 'Video' 80 | ext = '.mp4' 81 | 82 | 83 | class Document(Resource): 84 | """文档资源类 85 | 86 | 属性 87 | type:默认值是 'Video'; 88 | """ 89 | 90 | type = 'Document' 91 | 92 | 93 | class RichText(Resource): 94 | """富文本资源类 95 | 96 | 属性 97 | type:默认值是 'Rich'; 98 | """ 99 | 100 | type = 'Rich' 101 | 102 | 103 | class Attachment(Resource): 104 | """视频资源类 105 | 106 | 属性 107 | type:默认值是 'Attachment'; 108 | """ 109 | 110 | type = 'Attachment' 111 | 112 | 113 | class ClassicFile(object): 114 | """典型文件(UTF-8 编码的文件)类 115 | 116 | 属性 117 | _f:文件指针; 118 | file:文件名或文件路径。 119 | """ 120 | 121 | def __init__(self, file): 122 | """传入一个文件名或路径,然后打开文件,并保存文件指针和文件名""" 123 | 124 | self._f = open(file, 'w', encoding='utf_8') 125 | self.file = file 126 | 127 | def __del__(self): 128 | """关闭文件,并将文件号和文件名都清空""" 129 | 130 | self._f.close() 131 | del self._f 132 | del self.file 133 | 134 | def write_string(self, string): 135 | """向对象中打开的文件写入字符串,会自动加入换行""" 136 | 137 | self._f.write(string + '\n') 138 | 139 | 140 | class Playlist(ClassicFile): 141 | """ 播放列表类 """ 142 | 143 | def __init__(self, file, path_type): 144 | super().__init__(file) 145 | self.path_type = path_type 146 | 147 | def switch_path(self, path): 
148 | """ 根据路径类别生成路径项 """ 149 | path = os.path.normpath(path) 150 | if self.path_type == 'AP': 151 | path = os.path.abspath(path) 152 | elif self.path_type == 'RP': 153 | path = os.path.relpath(path, start=os.path.dirname(self.file)) 154 | return path 155 | 156 | def write(self, video): 157 | """传入一个 Video 类的对象,将该对象的信息写入播放列表""" 158 | 159 | path = os.path.join("Videos", video.file_name + video.ext) 160 | path = self.switch_path(path) 161 | self.write_string(path) 162 | 163 | 164 | class M3u(Playlist): 165 | """ m3u 播放列表类 """ 166 | 167 | def __init__(self, path_type='RP'): 168 | super().__init__('Playlist.m3u', path_type) 169 | 170 | 171 | class Dpl(Playlist): 172 | """ Potplayer 播放列表类 173 | 174 | 属性 175 | _count:已经写入的播放列表的文件数; 176 | """ 177 | 178 | def __init__(self, path_type='RP'): 179 | super().__init__('Playlist.dpl', path_type) 180 | self.write_string('DAUMPLAYLIST\n') 181 | self._count = 0 182 | 183 | def write(self, video): 184 | """传入一个 Video 类的对象,将该对象的信息写入播放列表""" 185 | 186 | self._count += 1 187 | path = os.path.join("Videos", video.file_name + video.ext) 188 | path = self.switch_path(path) 189 | self.write_string('{}*file*{}'.format(self._count, path)) 190 | self.write_string('{}*title*{} {}\n'.format(self._count, 191 | '.'.join(video.id.split('.')[:-1]), video.name)) 192 | 193 | 194 | class Subtitle(ClassicFile): 195 | """ 播放列表类 """ 196 | 197 | def __init__(self, path): 198 | super().__init__(path) 199 | self._count = 0 200 | 201 | @staticmethod 202 | def time_format(seconds): 203 | ms = int(1000 * (seconds - int(seconds))) 204 | seconds = int(seconds) 205 | minutes, sec = seconds // 60, seconds % 60 206 | hour, min = minutes // 60, minutes % 60 207 | return "{:02}:{:02}:{:02},{}".format(hour, min, sec, ms) 208 | 209 | 210 | def write(self, content, from_time, to_time): 211 | self._count += 1 212 | self.write_string(str(self._count)) 213 | self.write_string( 214 | "{} --> {}".format(self.time_format(from_time), self.time_format(to_time))) 215 | 
self.write_string(content + "\n") 216 | 217 | 218 | class Renamer(ClassicFile): 219 | """重命名批处理文件类""" 220 | 221 | ext = 'bat' if SYS == 'Windows' else 'sh' 222 | 223 | def __init__(self, file): 224 | """初始化文件,并写入调用 UTF-8 代码页的命令""" 225 | 226 | file = file.format(ext=Renamer.ext) 227 | super().__init__(file) 228 | if SYS == 'Windows': 229 | self.write_string('CHCP 65001\n') 230 | 231 | def write(self, origin_name, file_name, ext='.mp4'): 232 | """传入一个文件的原始名字(URL 中的文件名)和一个新的文件名""" 233 | 234 | if SYS == 'Windows': 235 | self.write_string('REN "%s" "%s%s"' % 236 | (origin_name, file_name, ext)) 237 | else: 238 | self.write_string('mv "%s" "%s%s"' % (origin_name, file_name, ext)) 239 | 240 | 241 | class Outline(ClassicFile): 242 | """课程大纲类 243 | 244 | 属性 245 | res_type:通过一个符号代表一种文件类型。 246 | """ 247 | 248 | res_type = {'#': '【视频】', '!': '【附件】', '*': '【文档】', 249 | '+': '【富文本】', '&': '【字幕】', '': ''} 250 | 251 | def __init__(self): 252 | """创建 Outline.txt 文件""" 253 | 254 | super().__init__('Outline.txt') 255 | 256 | def write(self, string, counter, level=2, sign=''): 257 | """传入一个字符串,一个计数器,一个级别(从 0 开始)和一个符号,然后写入大纲。首先会打印出相关信息。""" 258 | 259 | print('%s%s%s' % (' ' * level, Outline.res_type[sign], string)) 260 | name = '%s%s {%s}%s' % (' ' * level, string, counter[level], sign) 261 | self.write_string(name) 262 | 263 | 264 | class WorkingDir(object): 265 | """工作目录类 266 | 267 | 用于切换下载目录和创建目录等。 268 | 269 | 属性 270 | base_dir:工作目录的根目录,任何时候都基于这个目录; 271 | path:相对于根目录的路径。 272 | """ 273 | 274 | def __init__(self, *base_dirs): 275 | """传递一些字符串,创建一个目录,并切换到这个目录""" 276 | 277 | base_dir = os.path.join(*base_dirs) 278 | if not os.path.isdir(base_dir): 279 | os.makedirs(base_dir) 280 | os.chdir(base_dir) 281 | self.base_dir = os.getcwd() 282 | self.path = '' 283 | 284 | def change(self, *relative): 285 | """切换工作目录(假),可以接受连续多个目录名,如果不存在该目录就创建它 286 | 287 | 切换的功能需要配合 file() 才能实现。 288 | """ 289 | 290 | self.path = os.path.join(self.base_dir, *relative) 291 | if not os.path.isdir(self.path): 292 | 
os.makedirs(self.path) 293 | 294 | def file(self, file_name): 295 | """根据文件名返回一个完整的路径,会根据 path 生成一个路径""" 296 | 297 | return os.path.join(self.path, file_name) 298 | 299 | def exist(self, file_name): 300 | """判断当前路径(雾)是否存在一个文件""" 301 | 302 | return os.path.exists(os.path.join(self.path, file_name)) 303 | 304 | def need_download(self, file_name, overwrite=False): 305 | """判断当前文件是否需要下载,并且打印输出""" 306 | 307 | need = overwrite or not self.exist(file_name) 308 | sign = ">" if need else "!" 309 | res_print(file_name, sign=sign) 310 | return need 311 | 312 | 313 | class Counter(object): 314 | """计数器类 315 | 316 | 属性 317 | counter:计数器的列表。 318 | """ 319 | 320 | def __init__(self, num_level=3): 321 | """初始化一个列表""" 322 | 323 | self.counter = [0] * num_level 324 | self.num_level = num_level 325 | 326 | def add(self, level): 327 | """给第 level 级别的计数器 +1""" 328 | 329 | for i in range(level + 1, self.num_level): 330 | self.counter[i] = 0 331 | self.counter[level] += 1 332 | 333 | def __str__(self): 334 | """返回一个完整的计数器""" 335 | 336 | return '.'.join(map(str, self.counter)) 337 | 338 | def __getitem__(self, index): 339 | """返回到第 level 级别为止的计数器""" 340 | 341 | return '.'.join(map(str, self.counter[:index + 1])) 342 | 343 | def reset(self): 344 | """将第 2 级别的计数置为 0""" 345 | 346 | self.counter[-1] = 0 347 | 348 | 349 | def res_print(file_name, sign=">"): 350 | """打印一个将要输出的文件""" 351 | 352 | print('------{}'.format(sign), file_name) 353 | 354 | 355 | def course_dir(course_name, institution): 356 | """通过课程名和机构名返回一个完整的目录名字""" 357 | 358 | return Resource.regex_file.sub('', '%s - %s' % (course_name, institution)) 359 | 360 | def file_input(file, origin_text="", message=""): 361 | """ 调用编辑器,以文件的形式获取输入 """ 362 | 363 | with open(file, 'w', encoding='utf8') as f: 364 | f.write(origin_text) 365 | 366 | if SYS == 'Windows': 367 | os.startfile(file) 368 | elif SYS == 'Linux': 369 | subprocess.run('gedit "%s"' % 370 | file, shell=True, stdout=subprocess.PIPE) 371 | elif SYS == 'Darwin': 372 | 
def size_format(size, ndigits=2):
    """Format a byte count as a human-readable string.

    Args:
        size: number of bytes; may be negative (the sign is preserved).
        ndigits: number of decimal places to keep.

    Returns:
        A string such as ``"1.50 kB"`` (binary, 1024-based units).
    """
    flag = '-' if size < 0 else ''
    size = abs(size)
    units = ["Bytes", "kB", "MB", "GB", "TB", "PB", "EB", "ZB", "YB", "BB"]
    # Walk from the largest unit down and stop at the first one that fits.
    # Fall back to Bytes so size == 0 still gets a unit (the original
    # returned "0.00 " with an empty unit and a trailing space).
    unit, unit_size = "Bytes", 1
    for idx in range(len(units) - 1, -1, -1):
        threshold = 2 ** (idx * 10)
        if size >= threshold:
            unit, unit_size = units[idx], threshold
            break
    return "{}{:.{}f} {}".format(flag, size / unit_size, ndigits, unit)


def get_playlist(playlist_type, path_type):
    """Return a playlist writer for *playlist_type*, or None.

    Args:
        playlist_type: 'dpl', 'm3u', or 'no' for no playlist.
        path_type: passed through to the playlist class (path style).

    Returns:
        A Dpl/M3u instance, or None when no playlist is wanted.
    """
    if playlist_type == 'dpl':
        return Dpl(path_type=path_type)
    if playlist_type == 'm3u':
        return M3u(path_type=path_type)
    # 'no' — and, as a robustness fix, any unrecognized value — means no
    # playlist. The original raised UnboundLocalError for unknown types.
    return None
def get_video(video):
    """Resolve a video's download URL and register it for download.

    Args:
        video: a Video resource whose ``meta`` holds the platform video id
            and whose ``file_name`` is the sanitized target name.

    Side effects: appends the URL to Videos.txt, adds an entry to the
    rename script, and records the (url, file name) pair in VIDEOS.
    """

    file_name = video.file_name
    if not WORK_DIR.need_download(file_name + '.mp4', CONFIG["overwrite"]):
        return
    res = CANDY.get('http://xuetangx.com/videoid2source/' + video.meta).text
    # Parse once instead of re-parsing inside the fallback branch.
    sources = json.loads(res)['sources']
    try:
        # Prefer the high-quality stream; fall back to the low-quality one.
        # Narrowed from a bare `except:` so real errors are not swallowed.
        video_url = sources['quality20'][0]
    except (KeyError, IndexError):
        video_url = sources['quality10'][0]
    FILES['videos'].write_string(video_url)
    FILES['renamer'].write(
        re.search(r'(\w+-[12]0.mp4)', video_url).group(1), file_name)
    VIDEOS.append((video_url, file_name + ".mp4"))
def get_subtitles(available, transcript, file_name):
    """Download the subtitle file(s) of one video as .srt.

    Args:
        available: relative URL listing the available subtitle translations.
        transcript: relative base URL of the transcript endpoint.
        file_name: base name (without extension) for the .srt file(s).
    """

    subtitle_available_url = BASE_URL + available
    try:
        subtitle_available = CANDY.get(subtitle_available_url).json()
    except json.decoder.JSONDecodeError:
        # No (parsable) subtitle info for this video — nothing to do.
        return
    WORK_DIR.change('Videos')
    base_subtitle_url = BASE_URL + transcript + '/'
    multi_subtitle = len(subtitle_available) > 1
    # BUG FIX: the original used rstrip('available_translations'), which
    # strips a *character set* (not the suffix) and could eat extra trailing
    # characters of the URL. Strip the exact suffix instead.
    download_base = subtitle_available_url
    if download_base.endswith('available_translations'):
        download_base = download_base[:-len('available_translations')]
    download_url = download_base + 'download'
    for subtitle_desc in subtitle_available:
        # NOTE(review): this response is discarded; presumably the request
        # selects the language server-side for the subsequent download —
        # confirm against the API before removing it.
        CANDY.get(base_subtitle_url + subtitle_desc)
        if multi_subtitle:
            sub_file_name = (file_name + '_' +
                             subtitle_desc.replace('_xuetangx', '') + '.srt')
        else:
            sub_file_name = file_name + '.srt'
        subtitle = CANDY.get(download_url).content
        with open(WORK_DIR.file(sub_file_name), 'wb') as subtitle_file:
            subtitle_file.write(subtitle)
def get_summary(url):
    """Parse the course URL and fetch the course directory name.

    Args:
        url: course page URL, e.g. https://next.xuetangx.com/course/<sign>/<cid>

    Returns:
        (cid, sign, dir_name) — also cached in CONFIG['cid'] / CONFIG['sign'].
    """

    # BUG FIX: the group definitions lost their names ("(?P.+?)" is invalid
    # regex syntax); restored to named groups, which .group("sign", "cid")
    # below requires.
    sign, cid = re.match(r"https?://next.xuetangx.com/course/"
                         r"(?P<sign>.+?)/(?P<cid>.+)", url).group("sign", "cid")

    res = CANDY.get("https://next.xuetangx.com/api/v1/lms/learn/product/info?cid=%s&sign=%s" % (cid, sign))
    course_name = res.json()['data']['classroom_name']
    # The institution name is hard to obtain reliably; use the platform name.
    dir_name = course_dir(course_name, "学堂在线")

    print(dir_name)
    CONFIG['sign'] = sign
    CONFIG['cid'] = cid
    return cid, sign, dir_name
def get_resource(cid, sign):
    """Fetch the course tree and dispatch every supported resource.

    Walks chapters -> sections -> leaf items, writes the outline, then
    hands videos and PDFs to parse_resource.

    Args:
        cid: course/classroom id.
        sign: course sign extracted from the URL.
    """

    outline = Outline()
    counter = Counter()

    video_list = []
    pdf_list = []

    res = CANDY.get("https://next.xuetangx.com/api/v1/lms/learn/course/chapter?cid=%s&sign=%s" % (cid, sign),
                    headers={"xtbz": "xt"})
    for chapter in res.json()['data']['course_chapter']:
        counter.add(0)
        outline.write(chapter['name'], counter, 0)

        for section in chapter['section_leaf_list']:
            counter.add(1)
            outline.write(section['name'], counter, 1)

            # Discussions / quizzes (leaf types 4 and 6) are skipped for now.
            for item in section.get('leaf_list', []):
                counter.add(2)
                item_name = item['name']
                item_id, item_info_id = item['id'], item['leafinfo_id']
                item_type = item['leaf_type']
                if item_type == 0:  # video
                    outline.write(item_name, counter, 2, sign='#')
                    video_list.append(Video(counter, item_name, (item_id, item_info_id)))
                elif item_type == 3:  # document (PDF)
                    # BUG FIX: the original used rstrip('.pdf'), which strips
                    # the character set {'.', 'p', 'd', 'f'} and mangles names
                    # such as "stopword.pdf" -> "stopwor". Strip the exact
                    # extension instead.
                    if item_name.endswith('.pdf'):
                        item_name = item_name[:-len('.pdf')]
                    outline.write(item_name, counter, 2, sign='*')
                    if CONFIG['doc']:
                        pdf_list.append(Document(counter, item_name, (item_id, item_info_id)))

    if video_list:
        # NOTE(review): Names.txt is resolved against the directory current
        # *before* switching to Videos/ — confirm this is intended
        # (moocs/xuetangx.py switches directories first).
        rename = WORK_DIR.file('Names.txt') if CONFIG['rename'] else False
        WORK_DIR.change('Videos')
        playlist = get_playlist(CONFIG["playlist_type"], CONFIG["playlist_path_type"])
        if playlist is not None:
            parse_res_list(video_list, rename, parse_resource, playlist.write)
        else:
            parse_res_list(video_list, rename, parse_resource)
    if pdf_list:
        WORK_DIR.change('PDFs')
        parse_res_list(pdf_list, None, parse_resource)
#!/usr/bin/env sh
# Build the VuePress documentation and force-push it to GitHub Pages.

ACCESS_TOKEN=$1
USERNAME=SigureMo # your GitHub username
REPO=course-crawler # if unset, publish to the <USERNAME>.github.io repository
BRANCH=gh-pages # if unset, publish to the master branch
CNAME="" # the custom domain you want to publish to, if any

if [ $ACCESS_TOKEN ]
then TOKEN_PREFIX="${ACCESS_TOKEN}@"
else TOKEN_PREFIX=""
fi

if [ $BRANCH ]
then BRANCH_POSTFIX=":${BRANCH}"
else BRANCH_POSTFIX=""
fi

if [ $REPO ]
then REMOTE=https://${TOKEN_PREFIX}github.com/${USERNAME}/${REPO}.git
else REMOTE=https://${TOKEN_PREFIX}github.com/${USERNAME}/${USERNAME}.github.io.git
fi

# Abort on the first failing command.
set -e

# Generate the static site.
npm run docs:build

# Enter the build output directory.
cd docs/.vuepress/dist

# When publishing to a custom domain, drop a CNAME file into the site root.
if [ $CNAME ]
then echo $CNAME > CNAME
fi

# Initialize a throwaway repository and publish the build.
git init
git config user.name "GitHub Actions"
git config user.email "support@github.com"
git add -A
time=$(date "+%Y-%m-%d %H:%M:%S")
git commit -m "rebuild @${time}"
git push -f $REMOTE master${BRANCH_POSTFIX}

cd -
self.process_file = open("process.out", "w") 23 | assert self.is_installed(), "请配置正确的 aria2 路径" 24 | if not self.is_connected(): 25 | self.process = self.init_rpc() 26 | # 防止操作过快导致 aria2 没来得及开启 27 | time.sleep(1) 28 | 29 | def __del__(self): 30 | """ 析构时确保 aria2 关闭 """ 31 | if self.is_connected(): 32 | self.shutdown() 33 | self.process_file.close() 34 | try: 35 | os.remove(self.process_file.name) 36 | except: 37 | print("process.out 自动删除失败……") 38 | 39 | def rpc_api(method): 40 | """ RPC 装饰器 """ 41 | def rpc_method(func): 42 | def new_func(self, *args): 43 | data = { 44 | 'jsonrpc': '2.0', 45 | 'id': 'qwer', 46 | 'method': method, 47 | 'params': list(filter(lambda arg: arg is not None, args)), 48 | } 49 | res = requests.post( 50 | self.rpc_url, data=json.dumps(data), timeout=2) 51 | return res.json()["result"] 52 | return new_func 53 | return rpc_method 54 | 55 | @rpc_api(method="aria2.addUri") 56 | def add_uri(self, uris, options=None, position=None): 57 | """ 添加 URI 任务 """ 58 | pass 59 | 60 | @rpc_api(method="aria2.getGlobalStat") 61 | def get_global_stat(self): 62 | """ 获取全局统计信息 """ 63 | pass 64 | 65 | @rpc_api(method="aria2.shutdown") 66 | def shutdown(self): 67 | """ 关闭 aria2 """ 68 | pass 69 | 70 | @rpc_api(method="aria2.tellStatus") 71 | def tell_status(self, gid, keys=None): 72 | """ 获取某一下载资源的状态信息 """ 73 | pass 74 | 75 | def init_rpc(self): 76 | """ 启动 aria2 RPC """ 77 | cmd = self.aria2_path + \ 78 | ' --enable-rpc' \ 79 | ' --rpc-listen-port %d' \ 80 | ' --continue' \ 81 | ' --max-concurrent-downloads=20' \ 82 | ' --max-connection-per-server=10' \ 83 | ' --rpc-max-request-size=1024M' % self.port 84 | 85 | return subprocess.Popen(cmd, shell=True, stdout=self.process_file) 86 | 87 | def is_connected(self): 88 | """ 是否可以连接 aria2 """ 89 | try: 90 | requests.post(self.rpc_url) 91 | return True 92 | except requests.exceptions.ConnectionError: 93 | return False 94 | 95 | def is_installed(self): 96 | """ 是否已经下载 aria2 """ 97 | try: 98 | return 
class Aria2File():
    """One file being downloaded through an Aria2 RPC instance.

    The payload is downloaded to ``<path>.t`` first and moved to its final
    path by ``rename()`` once aria2 reports the task complete.

    Attributes (set in __init__):
        aria2       -- the Aria2 RPC wrapper used for all status queries
        path        -- final file path
        tmp_path    -- temporary download path (final path + ".t")
        aria2_file  -- aria2's resume control file (tmp_path + ".aria2")
        gid         -- aria2 task id returned by addUri
        renamed     -- True once the file was moved to its final path
    """

    def __init__(self, aria2, url, file_name, dir, overwrite=False):
        """Queue *url* for download into *dir*/*file_name* via aria2.

        When *overwrite* is set, leftover temporary and control files are
        removed first so aria2 does not resume a stale download.
        """
        self.aria2 = aria2
        self.path = os.path.join(dir, file_name)
        self.tmp_path = self.path + ".t"
        self.aria2_file = self.tmp_path + ".aria2"
        if overwrite:
            if os.path.exists(self.tmp_path):
                os.remove(self.tmp_path)
            if os.path.exists(self.aria2_file):
                os.remove(self.aria2_file)
        self.gid = aria2.add_uri([url], {"dir": dir, "out": file_name+".t"})
        self.renamed = False

    def get_length(self):
        """Return the file's total size in bytes as reported by aria2."""
        return int(self.aria2.tell_status(self.gid)["totalLength"])

    def get_complete_length(self):
        """Return the number of bytes downloaded so far."""
        return int(self.aria2.tell_status(self.gid)["completedLength"])

    def get_status(self):
        """Return the aria2 task status string (e.g. "complete")."""
        return self.aria2.tell_status(self.gid)["status"]

    def get_speed(self):
        """Return the current download speed in bytes per second."""
        return int(self.aria2.tell_status(self.gid)["downloadSpeed"])

    def exists(self):
        """Return True if a file already exists at the final path."""
        return os.path.exists(self.path)

    def rename(self):
        """Move the finished download from the temporary to the final path.

        An existing file at the final path is replaced.
        """
        if os.path.exists(self.path):
            os.remove(self.path)
        os.rename(self.tmp_path, self.path)
        self.renamed = True
self.headers.update(Crawler.header) 16 | 17 | def set_cookies(self, cookies): 18 | """传入一个字典,用于设置 cookies""" 19 | 20 | requests.utils.add_dict_to_cookiejar(self.cookies, cookies) 21 | 22 | def download_bin(self, url, file_path, stream=True, chunk_size=1024, **kw): 23 | """下载二进制文件""" 24 | 25 | res = self.get(url, stream=stream, **kw) 26 | tmp_path = file_path + ".t" 27 | try: 28 | with open(tmp_path, "wb") as f: 29 | if stream: 30 | for chunk in res.iter_content(chunk_size=chunk_size): 31 | if not chunk: 32 | break 33 | f.write(chunk) 34 | else: 35 | f.write(res.content) 36 | except: 37 | os.remove(tmp_path) 38 | print("[warn] {} failed to download".format(file_path)) 39 | if os.path.exists(file_path): 40 | os.remove(file_path) 41 | os.rename(tmp_path, file_path) 42 | 43 | def download_text(self, url, file_path, **kw): 44 | """下载文本,以 UTF-8 编码保存文件""" 45 | 46 | res = self.get(url, **kw) 47 | res.encoding = res.apparent_encoding 48 | with open(file_path, 'w', encoding='utf_8') as f: 49 | f.write(res.text) 50 | --------------------------------------------------------------------------------