├── .editorconfig
├── .github
├── ISSUE_TEMPLATE.md
└── workflows
│ └── nodejs.yml
├── .gitignore
├── Dockerfile
├── LICENSE
├── README.md
├── docker-entrypoint.sh
├── docs
├── .editorconfig
├── .vuepress
│ ├── components
│ │ └── bilibili-player.vue
│ └── config.js
├── README.md
├── advance
│ ├── cli.md
│ └── patch.md
├── courses
│ ├── cnmooc.md
│ ├── icourse163.md
│ ├── icourses.md
│ ├── livedu.md
│ ├── open_163.md
│ ├── study_163.md
│ ├── study_mooc.md
│ └── xuetangx.md
├── guide
│ ├── basic.md
│ ├── faq.md
│ ├── getting-started.md
│ ├── known-issues.md
│ └── notice.md
└── images
│ ├── get_cookies.png
│ └── icourse163_01.png
├── mooc.py
├── moocs
├── __init__.py
├── cnmooc.py
├── icourse163.py
├── icourses.py
├── icourses_share.py
├── livedu.py
├── open_163.py
├── study_163.py
├── study_mooc.py
├── utils.py
├── xuetangx.py
└── xuetangx_next.py
├── package.json
├── requirements.txt
├── scripts
└── deploy.sh
└── utils
├── aria2.py
└── crawler.py
/.editorconfig:
--------------------------------------------------------------------------------
1 | # EditorConfig
2 | # https://editorconfig.org/
3 |
4 | root = true
5 |
6 | [*]
7 | indent_style = space
8 | indent_size = 2
9 | end_of_line = lf
10 | charset = utf-8
11 | trim_trailing_whitespace = true
12 | insert_final_newline = true
13 |
14 | [*.py]
15 | indent_size = 4
16 |
17 | [*.md]
18 | indent_size = 3
19 |
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE.md:
--------------------------------------------------------------------------------
1 | ## (请在这里填写错误简述)
2 |
3 | 网站:中国大学MOOC(网易云课堂 MOOC、学堂在线)
4 |
5 | 课程地址:(请在这里填写课程地址)
6 |
7 | 问题描述:(请在这里填写问题描述)
8 |
--------------------------------------------------------------------------------
/.github/workflows/nodejs.yml:
--------------------------------------------------------------------------------
1 | name: Node CI
2 |
3 | on:
4 | push:
5 | branches:
6 | - master
7 |
8 | jobs:
9 | build-and-deploy:
10 | runs-on: ubuntu-latest
11 | steps:
12 | - uses: actions/checkout@master
13 | - name: git-lfs
14 | run: |
15 | git lfs install
16 | git lfs pull
17 | - uses: actions/setup-node@master
18 | - name: deploy
19 | run: |
20 | npm install yarn
21 | yarn
22 | yarn deploy $ACCESS_TOKEN
23 | env:
24 | ACCESS_TOKEN: ${{ secrets.ACCESS_TOKEN }}
25 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | share/python-wheels/
24 | *.egg-info/
25 | .installed.cfg
26 | *.egg
27 | MANIFEST
28 |
29 | # PyInstaller
30 | # Usually these files are written by a python script from a template
31 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
32 | *.manifest
33 | *.spec
34 |
35 | # Installer logs
36 | pip-log.txt
37 | pip-delete-this-directory.txt
38 |
39 | # Unit test / coverage reports
40 | htmlcov/
41 | .tox/
42 | .nox/
43 | .coverage
44 | .coverage.*
45 | .cache
46 | nosetests.xml
47 | coverage.xml
48 | *.cover
49 | .hypothesis/
50 | .pytest_cache/
51 |
52 | # Translations
53 | *.mo
54 | *.pot
55 |
56 | # Django stuff:
57 | *.log
58 | local_settings.py
59 | db.sqlite3
60 |
61 | # Flask stuff:
62 | instance/
63 | .webassets-cache
64 |
65 | # Scrapy stuff:
66 | .scrapy
67 |
68 | # Sphinx documentation
69 | docs/_build/
70 |
71 | # PyBuilder
72 | target/
73 |
74 | # Jupyter Notebook
75 | .ipynb_checkpoints
76 |
77 | # IPython
78 | profile_default/
79 | ipython_config.py
80 |
81 | # pyenv
82 | .python-version
83 |
84 | # celery beat schedule file
85 | celerybeat-schedule
86 |
87 | # SageMath parsed files
88 | *.sage.py
89 |
90 | # Environments
91 | .env
92 | .venv
93 | env/
94 | venv/
95 | ENV/
96 | env.bak/
97 | venv.bak/
98 |
99 | # Spyder project settings
100 | .spyderproject
101 | .spyproject
102 |
103 | # Rope project settings
104 | .ropeproject
105 |
106 | # mkdocs documentation
107 | /site
108 |
109 | # mypy
110 | .mypy_cache/
111 | .dmypy.json
112 | dmypy.json
113 |
114 | # Pyre type checker
115 | .pyre/
116 |
117 | ### Node ###
118 | # Logs
119 | logs
120 | *.log
121 | npm-debug.log*
122 | yarn-debug.log*
123 | yarn-error.log*
124 | lerna-debug.log*
125 |
126 | # Diagnostic reports (https://nodejs.org/api/report.html)
127 | report.[0-9]*.[0-9]*.[0-9]*.[0-9]*.json
128 |
129 | # Runtime data
130 | pids
131 | *.pid
132 | *.seed
133 | *.pid.lock
134 |
135 | # Directory for instrumented libs generated by jscoverage/JSCover
136 | lib-cov
137 |
138 | # Coverage directory used by tools like istanbul
139 | coverage
140 | *.lcov
141 |
142 | # nyc test coverage
143 | .nyc_output
144 |
145 | # Grunt intermediate storage (https://gruntjs.com/creating-plugins#storing-task-files)
146 | .grunt
147 |
148 | # Bower dependency directory (https://bower.io/)
149 | bower_components
150 |
151 | # node-waf configuration
152 | .lock-wscript
153 |
154 | # Compiled binary addons (https://nodejs.org/api/addons.html)
155 | build/Release
156 |
157 | # Dependency directories
158 | node_modules/
159 | jspm_packages/
160 |
161 | # TypeScript v1 declaration files
162 | typings/
163 |
164 | # TypeScript cache
165 | *.tsbuildinfo
166 |
167 | # Optional npm cache directory
168 | .npm
169 |
170 | # Optional eslint cache
171 | .eslintcache
172 |
173 | # Optional REPL history
174 | .node_repl_history
175 |
176 | # Output of 'npm pack'
177 | *.tgz
178 |
179 | # Yarn Integrity file
180 | .yarn-integrity
181 |
182 | # dotenv environment variables file
183 | .env
184 | .env.test
185 |
186 | # parcel-bundler cache (https://parceljs.org/)
187 | .cache
188 |
189 | # next.js build output
190 | .next
191 |
192 | # nuxt.js build output
193 | .nuxt
194 |
195 | # vuepress build output
196 | .vuepress/dist
197 |
198 | # Serverless directories
199 | .serverless/
200 |
201 | # FuseBox cache
202 | .fusebox/
203 |
204 | # DynamoDB Local files
205 | .dynamodb/
206 |
207 | # End of https://www.gitignore.io/api/node
208 |
209 | # Node.js
210 | yarn.lock
211 | package.json
212 | .huskyrc
213 | .editorconfig
214 | commitlint.config.js
215 |
216 | # draft
217 | draft/
218 |
219 | # IDEs/editors
220 | .vscode/
221 | .idea/
222 |
223 | # Yarn
224 | yarn.lock
225 |
226 | # course crawler
227 | __pycache__/
228 | *.pyc
229 | /* - */
230 | /*.json
231 |
232 | # Others
233 | .ipynb_checkpoints
234 | .idea
235 | .DS_Store
236 |
--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM python:alpine
2 |
3 | WORKDIR /app
4 |
5 |
6 | RUN apk add --update --no-cache --virtual build_images g++ gcc libxslt-dev git && \
7 | git clone https://github.com/Foair/course-crawler.git /app && \
8 | pip install requests BeautifulSoup4 lxml -i http://pypi.douban.com/simple/ --trusted-host pypi.douban.com && \
9 | apk del build_images && \
10 | rm -rf /app/README.md /app/LICENSE
11 |
12 | COPY ./docker-entrypoint.sh /app
13 |
14 | RUN chmod 777 ./docker-entrypoint.sh
15 |
16 | ENTRYPOINT ["./docker-entrypoint.sh"]
17 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2018 Foair
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | ## Course Crawler
2 |
3 | 
4 |
5 | 一个基于 Python 3 的 MOOC 课程下载工具,可以获取多个慕课网站的课件,方便离线观看
6 |
7 | ### 支持列表
8 |
9 | - [中国大学MOOC](https://www.icourse163.org/)
10 | - [网易云课堂](http://study.163.com/)
11 | - [普通课程](http://study.163.com/)
12 | - [MOOC 课程](http://mooc.study.163.com/)
13 | - [网易公开课](https://open.163.com/)
14 | - [好大学在线](https://www.cnmooc.org/)
15 | - [爱课程](http://www.icourses.cn/)
16 | - [视频公开课](http://www.icourses.cn/cuoc/)
17 | - [资源共享课](http://www.icourses.cn/mooc/)
18 | - [学堂在线](http://www.xuetangx.com/)
19 | - [北京高校优质课程研究会](http://www.livedu.com.cn/)
20 |
21 | 详细信息和用法请见 [https://www.sigure.xyz/course-crawler/](https://www.sigure.xyz/course-crawler/)。
22 |
23 | ### 声明
24 |
25 | 仅限个人学习和研究使用,切勿用于其他用途。强烈建议到 MOOC 网站进行学习,本程序只是提供一个备选方案。
26 |
27 | 本程序主体功能只是下载课件和附件,无任何手段获得付费课程,也没有以任何方式向任何人收取费用。
28 |
29 | 如果将程序用于商业用途或其他非法用途,一切后果由用户自负。
30 |
31 | 如果您发现有侵犯到您的合法权益,请与我联系删除相关代码,同时我对无意冒犯到您致以深深的歉意。
32 |
33 | ### 许可协议
34 |
35 | 请遵照 MIT 许可使用该程序。
36 |
--------------------------------------------------------------------------------
/docker-entrypoint.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 |
3 | python mooc.py "$@" -d "/video"
4 |
--------------------------------------------------------------------------------
/docs/.editorconfig:
--------------------------------------------------------------------------------
1 | # EditorConfig
2 | # https://editorconfig.org/
3 |
4 | root = true
5 |
6 | [*]
7 | indent_style = space
8 | indent_size = 2
9 | end_of_line = lf
10 | charset = utf-8
11 | trim_trailing_whitespace = true
12 | insert_final_newline = true
13 |
14 | [*.py]
15 | indent_size = 4
16 |
17 | [*.sh]
18 | indent_size = 4
19 |
20 | [*.md]
21 | indent_size = 3
22 |
--------------------------------------------------------------------------------
/docs/.vuepress/components/bilibili-player.vue:
--------------------------------------------------------------------------------
1 |
2 |
3 |
12 |
13 |
14 |
15 |
34 |
35 |
50 |
--------------------------------------------------------------------------------
/docs/.vuepress/config.js:
--------------------------------------------------------------------------------
1 | module.exports = {
2 | title: "Course Crawler",
3 | description: "基于 Python 3 的 MOOC 课程下载工具",
4 | base: "/course-crawler/",
5 |
6 | // 插件
7 | plugins: [
8 | // 页面滚动时自动激活侧边栏链接
9 | "@vuepress/active-header-links"
10 | ],
11 |
12 | // 主题配置
13 | themeConfig: {
14 | nav: [
15 | { text: "指南", link: "/" },
16 | { text: "分类", link: "/courses/icourse163" },
17 | { text: "进阶", link: "/advance/cli" }
18 | ],
19 | sidebarDepth: 1,
20 | sidebar: {
21 | "/advance/": ["cli", "patch"],
22 | "/courses/": [
23 | "icourse163",
24 | "study_163",
25 | "study_mooc",
26 | "open_163",
27 | "icourses",
28 | "xuetangx",
29 | "cnmooc",
30 | "livedu"
31 | ],
32 | "/": [
33 | "",
34 | "guide/getting-started",
35 | "guide/basic",
36 | "guide/faq",
37 | "guide/known-issues",
38 | "guide/notice"
39 | ]
40 | },
41 |
42 | // algolia: {
43 | // apiKey: "20560f10044e76d7f16908746c3adeb1",
44 | // indexName: "siguremo_course-crawler"
45 | // },
46 |
47 | lastUpdated: "Last Updated", // string | boolean
48 |
49 | // 假定是 GitHub. 同时也可以是一个完整的 GitLab URL
50 | repo: "SigureMo/course-crawler",
51 | // 自定义仓库链接文字。默认从 `themeConfig.repo` 中自动推断为
52 | // "GitHub"/"GitLab"/"Bitbucket" 其中之一,或是 "Source"。
53 | repoLabel: "GitHub",
54 |
55 | // 以下为可选的编辑链接选项
56 |
57 | // 假如你的文档仓库和项目本身不在一个仓库:
58 | docsRepo: "SigureMo/course-crawler",
59 | // 假如文档不是放在仓库的根目录下:
60 | docsDir: "docs/",
61 | // 假如文档放在一个特定的分支下:
62 | // docsBranch: "docs",
63 | // 默认是 false, 设置为 true 来启用
64 | editLinks: true,
65 | // 默认为 "Edit this page"
66 | editLinkText: "在GitHub上编辑此页!",
67 | // Service Worker 的配置
68 | serviceWorker: {
69 | updatePopup: true
70 | }
71 | }
72 | };
73 |
--------------------------------------------------------------------------------
/docs/README.md:
--------------------------------------------------------------------------------
1 | # 介绍
2 |
3 | 
4 |
5 | 一个基于 Python 3 的 MOOC 课程内容获取工具,方便离线观看。
6 |
7 | [下载最新程序](https://github.com/SigureMo/course-crawler/archive/master.zip) 或 [前往 GitHub](https://github.com/SigureMo/course-crawler)
8 |
9 | ## 支持列表
10 |
11 | - [中国大学 MOOC](https://www.icourse163.org/) 是国内优质的中文 MOOC 学习平台,由爱课程网携手·网易云课堂·打造。**大多数的名校都有一定数量课程**,如北京大学、浙江大学、哈尔滨工业大学等
12 | - [网易云课堂](http://study.163.com/)
13 | - [普通课程](http://study.163.com/) 涵盖方面较广,更注重于**职场、生活技能**,很多需要付费
14 | - [MOOC 课程](http://mooc.study.163.com/) 有一部分中国大学 MOOC 的内容,此外还有一些微专业内容,但是很多需要付费,推荐 [顶尖中文大学计算机专业课程体系](https://study.163.com/curricula/cs.htm) 与 [深度学习工程师微专业](https://mooc.study.163.com/smartSpec/detail/1001319001.htm)
15 | - [网易公开课](https://open.163.com/) 是网易推出的“全球名校视频公开课项目”,收录了哈佛大学等**世界级名校**的公开课课程以及可汗学院,TED 等教育性组织的精彩视频,内容较经典,但是也相对比较陈旧
16 | - [好大学在线](https://www.cnmooc.org/) 是上海交通大学拥有的中国顶尖慕课平台。主要是 **上海交通大学** 等大学或机构的课程
17 | - [爱课程](http://www.icourses.cn/) 的资源比较多,但总体相对陈旧
18 | - [视频公开课](http://www.icourses.cn/cuoc/)
19 | - [资源共享课](http://www.icourses.cn/mooc/)
20 | - [学堂在线](http://www.xuetangx.com/) 是清华大学发起的精品中文慕课平台。主要是 **清华大学** 的课程
21 | - [北京高校优质课程研究会](http://www.livedu.com.cn/) 是北京市教委组织的**北京各高校**课程平台
22 |
--------------------------------------------------------------------------------
/docs/advance/cli.md:
--------------------------------------------------------------------------------
1 | # 命令行参数
2 |
3 |
4 |
5 | ## 显示帮助信息
6 |
7 | > `-h` `--help` 用于显示帮助信息。
8 |
9 | 输入 `python mooc.py -h` 或 `python mooc.py --help`。
10 |
11 | ## 指定下载目录
12 |
13 | > `-d <dir>` `--dir=<dir>` 用于指定下载目录为 `<dir>`。
14 |
15 | 课程文件夹将创建在 `<dir>` 中。默认创建在当前目录,即 `-d ""`。
16 |
17 | 示例
18 |
19 | ```bash
20 | python mooc.py -d "G:\MOOCs" https://www.icourse163.org/course/TONGJI-53004
21 | ```
22 |
23 | ::: tip
24 | `<dir>` 不能以 \ 结尾;当 `<dir>` 存在空格的时候,必须使用 `"` 将路径包裹起来。
25 | :::
26 |
27 | ## 重新录入 Cookies
28 |
29 | > `-c` `--restore-cookies` 用于在程序运行时录入新的 Cookies,以覆盖旧的 Cookies
30 |
31 | 由于 Cookies 经常存在过期的情况,手动去删除会很麻烦,这时只需要运行时加上这样一个参数就可以将旧的 Cookies 覆盖掉
32 |
33 | ## 指定视频清晰度
34 |
35 | > `-r <quality>` `--quality <quality>` 用于指定视频清晰度为 `<quality>`
36 |
37 | `<quality>` 可选列表为 `shd` `hd` `sd` ,分别对应超高清、高清、标清,默认为超高清
38 |
39 | 示例
40 |
41 | ```bash
42 | python mooc.py -r hd https://www.icourse163.org/course/TONGJI-53004
43 | ```
44 |
45 | ::: tip
46 | 在支持清晰度调节的课程中,如果指定的清晰度不存在,则先自动降低清晰度,若仍无匹配的清晰度,则再升高清晰度,比如指定为 hd ,则会以 hd sd shd 序列对清晰度进行匹配
47 | :::
48 |
49 | ## 强制覆盖已下载文件
50 |
51 | > `-w`, `--overwrite` 用于启用强制覆盖已经下载过的文件
52 |
53 | 示例
54 |
55 | ```bash
56 | python mooc.py https://www.icourse163.org/course/TONGJI-53004 -w
57 | ```
58 |
59 | ## aria2 的调用
60 |
61 | 为了方便后续视频的下载,增加了直接调用 `aria2` 进行下载的支持
62 |
63 | ::: tip aria2 相关下载:
64 |
65 | - [aria2](https://github.com/aria2/aria2/releases)
66 | - [aria2 webui](https://github.com/ziahamza/webui-aria2/archive/master.zip)
67 | - [AriaNg(一个比较好看的 webui)](https://github.com/mayswind/AriaNg/releases)
68 |
69 | :::
70 |
71 | > `--aria2` 用于启用 `aria2` 直接下载视频
72 |
73 | 当配置好 aria2 路径后,在课件解析完成时程序不退出,直接调用 `aria2` 下载视频
74 |
75 | ::: tip
76 |
77 | 请事先确保 `aria2c` 已经是可执行程序,即已经添加到环境变量
78 |
79 | :::
80 |
81 | 示例
82 |
83 | ```bash
84 | python mooc.py --aria2 https://www.icourse163.org/course/TONGJI-53004
85 | ```
86 |
87 | ## 播放列表设置
88 |
89 | 由于不同播放器对播放列表格式的要求并不相同,通过修改参数可以获得更通用的播放列表
90 |
91 | ::: tip 一些推荐的播放器
92 |
93 | - Windows
94 | - PotPlayer
95 | - Linux
96 | - SMPlayer
97 | - MacOS
98 | - IINA
99 |
100 | :::
101 |
102 | ### 播放列表类型
103 |
104 | > `--playlist-type=<type>` 用于指定播放列表类型
105 |
106 | 可选列表 `dpl` `m3u` `no` ,默认为 `dpl` ,若指定 `no` 则不生成播放列表
107 |
108 | ::: tip
109 |
110 | 默认生成的 `Playlist.dpl` 仅仅对 PotPlayer 有效,如果无法使用 PotPlayer (比如 Linux 下),请生成更通用的 `m3u` 格式
111 |
112 | :::
113 |
114 | 示例
115 |
116 | ```bash
117 | python mooc.py --playlist-type=m3u https://www.icourse163.org/course/TONGJI-53004
118 | ```
119 |
120 | ### 播放列表路径类型
121 |
122 | > `--abs-path` 用于指定播放列表内的路径为绝对路径
123 |
124 | ::: tip
125 |
126 | 有些播放器并不支持相对路径的播放列表,如果你的播放器无法打开该文件,请尝试生成绝对路径的播放列表
127 |
128 | :::
129 |
130 | 示例
131 |
132 | ```bash
133 | python mooc.py --playlist-type=m3u --abs-path https://www.icourse163.org/course/TONGJI-53004
134 | ```
135 |
136 | ::: warning
137 |
138 | 绝对路径的播放列表会在课程文件夹移动后失效,如果开启该选项,请不要在课程下载后进行移动
139 |
140 | :::
141 |
142 | ## 不下载 ...
143 |
144 | ### 不下载文档
145 |
146 | > `--no-doc` 用于阻止下载 PDF、Word、PowerPoint 等文档。
147 |
148 | 默认会下载所有文档。
149 |
150 | 当指定了这个选项之后,不会下载任何文档(包括 PPT 和书籍等)。
151 |
152 | 示例
153 |
154 | ```bash
155 | python mooc.py https://www.icourse163.org/course/TONGJI-53004 --no-doc
156 | ```
157 |
158 | ### 不下载字幕
159 |
160 | > `--no-sub` 用于阻止下载字幕。
161 |
162 | ### 不下载富文本
163 |
164 | > `--no-text` 用于阻止下载富文本。
165 |
166 | ### 不下载附件
167 |
168 | > `--no-file` 用于阻止下载附件。
169 |
170 | ### 不下载播放列表
171 |
172 | > `--playlist-type=no` 用于阻止下载播放列表。详情见 [播放列表类型](#播放列表类型)
173 |
174 | ## 修正视频/文档名
175 |
176 | > `--inter` 用于修改文件名。
177 |
178 | 会调出文件编辑器,编辑好视频的名字之后保存。默认没有启用。
179 |
180 | ::: tip
181 | 请严格按照原来文本长度进行设置,否则可能会发生没有标题的情况。
182 | :::
183 |
--------------------------------------------------------------------------------
/docs/advance/patch.md:
--------------------------------------------------------------------------------
1 | # 修改默认值
2 |
3 |
4 |
5 | ## 修改默认获取目录
6 |
7 | 如果不想每次都指定获取目录的话,可以修改 `mooc.py`,找到如下行:
8 |
9 | ```python
10 | parser.add_argument('-d', default=r'G:\MOOCs', help='下载目录')
11 | ```
12 |
13 | 将 `G:\MOOCs` 替换为想要的文件夹即可。
14 |
15 | ## 默认启用某个选项
16 |
17 | 修改 `mooc.py`,将选项所在 `store_false` 或 `store_true` 切换一下就行了。
18 |
19 | 示例
20 |
21 | 如果我想默认不下载 PDF,那么将 `--no-pdf` 所在的那一行的 `store_false` 改了就行了,改成这样
22 |
23 | ```python
24 | parser.add_argument('--no-pdf', action='store_true', help='不下载 PDF 文档')
25 | ```
26 |
27 | 这样默认就不会下载 PDF,而如果在命令中使用了 `--no-pdf` 就会下载 PDF 了。
28 |
--------------------------------------------------------------------------------
/docs/courses/cnmooc.md:
--------------------------------------------------------------------------------
1 | # 好大学在线
2 |
3 | ## 简介
4 |
5 | [好大学在线](https://www.cnmooc.org/) 是上海交通大学拥有的中国顶尖慕课平台。主要是 **上海交通大学** 等大学或机构的课程。
6 |
7 | ## 地址格式
8 |
9 | 课程的地址必须类似以下这种格式
10 |
11 | ```
12 | https://www.cnmooc.org/portal/course/4386/9729.mooc
13 | ```
14 |
15 | ## 碎碎念
16 |
17 | 要想获得课程必须保证以下两个条件均满足:
18 |
19 | - 已经在客户端或 Web 端手动加入课程;
20 | - 当前课程已经在开课时间内。
21 |
22 | 同·中国大学 MOOC·一样,可以通过切换「开课班级」参加以前的课程。
23 |
24 | 如果当前课程还未开课,可以切换到以前的班次,并加入,这样就可以获得视频等资源。
25 |
--------------------------------------------------------------------------------
/docs/courses/icourse163.md:
--------------------------------------------------------------------------------
1 | # 中国大学 MOOC
2 |
3 | ## 简介
4 |
5 | [中国大学 MOOC](https://www.icourse163.org/) 是国内优质的中文 MOOC 学习平台,由爱课程网携手·网易云课堂·打造。**大多数的名校都有一定数量课程**,如北京大学、浙江大学、哈尔滨工业大学等
6 |
7 | ## 地址格式
8 |
9 | 课程的地址必须类似以下两种格式
10 |
11 | ```
12 | https://www.icourse163.org/course/TONGJI-53004
13 | https://www.icourse163.org/course/TONGJI-53004?tid=1001770008
14 | ```
15 |
16 | ::: tip
17 |
18 | - 上面的 `course` 替换为 `learn` 也是支持的
19 | - `SPOC` 课程也是支持的,比如 `https://www.icourse163.org/spoc/course/WHUT-1002745006?tid=1002931006`
20 | :::
21 |
22 | ## 开课次数
23 |
24 | 课程的地址包含了两部分信息,以 `https://www.icourse163.org/course/TONGJI-53004?tid=1001770008` 为例,`53004` 是课程号,唯一标志了同济大学开设的高等数学(一)这门课程,而 `1001770008` 代表了某学期的该课程的课程号,如果地址中不出现 `?tid=xxx` 字段,则默认为最新一次开课,所以我们可以通过控制最后的 `tid` 以达到下载不同学期的课件,而不同学期的地址我们可以在课程主页获取
25 |
26 | 
27 |
28 | 切换开课学期后便可在浏览器地址栏看到对应的学期课程地址
29 |
30 | ## 身份验证
31 |
32 | 中 M 的视频接口很不稳定,在这一年内进行了多次的变更,当前有两种内置的方案
33 |
34 | - 一种是在程序要求输入 Cookies 的时候直接回车注入空的 Cookies 以调用旧接口,但不保证该接口以后会不会删掉
35 | - 另一种输入完整的 Cookies ,这样会调用新的接口,但是最近(19 年 10 月),该接口只会返回新视频的 m3u8 播放列表,如果遇到该问题,请使用旧接口进行下载,问题详细描述见 [issue37](https://github.com/Foair/course-crawler/issues/37),如果该方案也无法解决,请临时使用 [mooc-dl](https://github.com/SigureMo/mooc-dl) 或者自行寻求其他解决方案
36 |
37 | ## 碎碎念
38 |
39 | 「老师已关闭该学期,无法查看」暂时无所畏惧。
40 |
41 | 找不到开课页面的话,可以先进入课程的公告页面,然后点击课程名。
42 |
43 | 如果你下载的是最新学期的课程,请**确定最新学期已经开课**,未开课的学期是无法下载的,不过你可以尝试下载前几个学期的课程。
44 |
--------------------------------------------------------------------------------
/docs/courses/icourses.md:
--------------------------------------------------------------------------------
1 | # 爱课程
2 |
3 | ## 简介
4 |
5 | [爱课程](http://www.icourses.cn/) 的资源比较多,但总体相对陈旧
6 |
7 | ## 地址格式
8 |
9 | - 「资源共享课」
10 |
11 | ```
12 | http://www.icourses.cn/sCourse/course_6076.html
13 | http://www.icourses.cn/web/sword/portal/shareDetails?cId=6076#/course/chapter
14 | ```
15 |
16 | - 「视频公开课」
17 |
18 | ```
19 | http://www.icourses.cn/web/sword/portal/videoDetail?courseId=1013d845-1344-1000-b974-22f745f72788#/?resId=10195dd1-1344-1000-bbd7-22f745f72788
20 | ```
21 |
22 | ::: tip
23 | 只要是以如下地址开始都可以,不用在意是在哪一个视频。
24 |
25 | ```
26 | http://www.icourses.cn/web/sword/portal/videoDetail
27 | ```
28 |
29 | :::
30 |
--------------------------------------------------------------------------------
/docs/courses/livedu.md:
--------------------------------------------------------------------------------
1 | # 北京高校优质课程研究会
2 |
3 | ## 简介
4 |
5 | [北京高校优质课程研究会](http://www.livedu.com.cn/) 是北京市教委组织的**北京各高校**课程平台
6 |
7 | ## 地址格式
8 |
9 | 课程的地址必须类似以下这种格式
10 |
11 | ```
12 | http://www.livedu.com.cn/ispace4.0/moocxjkc/toKcView.do?kcid=253
13 | ```
14 |
15 | ## 碎碎念
16 |
17 | 下载前请确定你已经完成选课,否则也是无法解析的
18 |
19 | 另外,由于是从 HTML 中解析数据,速度极慢
20 |
--------------------------------------------------------------------------------
/docs/courses/open_163.md:
--------------------------------------------------------------------------------
1 | # 网易公开课
2 |
3 | ## 简介
4 |
5 | [网易公开课](https://open.163.com/) 是网易推出的“全球名校视频公开课项目”,收录了哈佛大学等**世界级名校**的公开课课程以及可汗学院,TED 等教育性组织的精彩视频,内容较经典,但是也相对比较陈旧
6 |
7 | ## 地址格式
8 |
9 | 课程的地址必须类似以下两种格式
10 |
11 | ```
12 | http://open.163.com/special/opencourse/cs50.html
13 | http://open.163.com/movie/2010/3/U/R/M6U6LS8CV_M6U6MHDUR.html
14 | ```
15 |
16 | ## 碎碎念
17 |
18 | 网易公开课也是不需要 Cookies 的
19 |
--------------------------------------------------------------------------------
/docs/courses/study_163.md:
--------------------------------------------------------------------------------
1 | # 网易云课堂
2 |
3 | ## 简介
4 |
5 | [网易云课堂](http://study.163.com/) 涵盖方面较广,更注重于**职场、生活技能**,很多需要付费
6 |
7 | ## 地址格式
8 |
9 | 课程的地址必须类似以下三种格式
10 |
11 | ```
12 | https://study.163.com/course/courseLearn.htm?courseId=1004570029#/learn/video?lessonId=1052094278&courseId=1004570029
13 | https://study.163.com/course/courseMain.htm?courseId=1004570029
14 | https://study.163.com/course/introduction/1004570029.htm
15 | ```
16 |
17 | ## 碎碎念
18 |
19 | 网易云课堂免费课程当前并不需要身份认证
20 |
21 | 当然,没有身份认证的话也是**不可能支持下载付费视频的**,暂时也不打算做相关支持
22 |
23 | 本文档仅针对网易云课堂普通课程,普通课程与 MOOC 课程相差很大, MOOC 课程更类似于中国大学 MOOC ,如需查看其文档,请移步 [网易云课堂 MOOC](study_mooc.md)
24 |
--------------------------------------------------------------------------------
/docs/courses/study_mooc.md:
--------------------------------------------------------------------------------
1 | # 网易云课堂 MOOC
2 |
3 | ## 简介
4 |
5 | [网易云课堂 MOOC 课程](http://mooc.study.163.com/) 有一部分中国大学 MOOC 的内容,此外还有一些微专业内容,但是很多需要付费,推荐 [顶尖中文大学计算机专业课程体系](https://study.163.com/curricula/cs.htm) 与 [深度学习工程师微专业](https://mooc.study.163.com/smartSpec/detail/1001319001.htm)
6 |
7 | ## 地址格式
8 |
9 | 课程的地址必须类似以下两种格式
10 |
11 | ```
12 | http://mooc.study.163.com/course/2001281002#/info
13 | http://mooc.study.163.com/course/2001281002
14 | ```
15 |
16 | ::: tip
17 |
18 | - 上面的 `course` 替换为 `learn` 也是支持的
19 | :::
20 |
21 | ## 碎碎念
22 |
23 | 与[中国大学 MOOC](./icourse163.md) 大体上相同,但它对身份的验证比较苛刻,你**本身无法访问到的内容程序也是无法帮你获取的,也就是说它并不能帮你获取你未参加的已关闭学期的内容**
24 |
25 | Cookies 极易失效,可在运行时添加参数 `-c` 注入新的 Cookies
26 |
--------------------------------------------------------------------------------
/docs/courses/xuetangx.md:
--------------------------------------------------------------------------------
1 | # 学堂在线
2 |
3 | ## 简介
4 |
5 | [学堂在线](http://www.xuetangx.com/) 是清华大学发起的精品中文慕课平台。主要是 **清华大学** 的课程
6 |
7 | ## 地址格式
8 |
9 | 课程的地址必须类似以下这种格式
10 |
11 | ```
12 | https://next.xuetangx.com/course/HNU08071000999/1076493
13 | ```
14 |
15 | ## 碎碎念
16 |
17 | 学堂在线于 19 年 10 月左右进行了大更新,域名改为了 `next.xuetangx.com` ,如果你还能找到类似下面这种 `www.xuetangx.com` 下的旧版本课程的话,现在也是支持下载的
18 |
19 | ```
20 | http://www.xuetangx.com/courses/course-v1:TsinghuaX+00740043_2x_2015_T2+sp/about
21 | ```
22 |
--------------------------------------------------------------------------------
/docs/guide/basic.md:
--------------------------------------------------------------------------------
1 | # 深入了解
2 |
3 | ## 课程目录结构
4 |
5 | ```
6 |
7 | |-- Outline.txt
8 | |-- Playlist.dpl
9 | |-- Files/
10 | |-- PDFs/
11 | |-- Texts/
12 | `-- Videos/
13 | |-- Rename.bat
14 | `-- Videos.txt
15 | ```
16 |
17 | ### 课程大纲
18 |
19 | `Outline.txt` 是课程的大纲,它的内容类似
20 |
21 | ```
22 | 6.1 空间直角坐标系及向量 {1}
23 | 6.1.1 空间直角坐标系的基本概念 {1.1}
24 | 6.1.1 空间直角坐标系的基本概念(视频) {1.1.1}#
25 | 6.1.1 空间直角坐标系的基本概念(PPT) {1.1.1}+
26 | 6.1.1 空间直角坐标系的基本概念(PPT) 空间直角坐标系的基本概念.rar {1.1.1}!
27 | ...
28 | ```
29 |
30 | 每个级别依次增加 2 个空格的缩进,`{}` 之间的是程序生成的编号,用来唯一标识一个资源(比如视频、富文本等等)。
31 |
32 | `{1.1.1}` 说明该视频文件以 `1.1.1` 开头,可以在 `Videos/` 中找到。如此可以方便地找到视频。
33 |
34 | 有些后面可能有奇怪的符号,比如 `{1.1.1}+`的后面有个 `+`。下面是符号的说明:
35 |
36 | - #: 视频,可以下载到 `Videos/`
37 | - \*:课件,一般是 PDF 文件,位于 `PDFs/`
38 | - +:富文本,一般是 HTML 文件,位于 `Texts/`
39 | - !:附件,位于 `Files/`
40 | - &:字幕,位于 `Videos/`
41 |
42 | ### 视频地址
43 |
44 | `Videos.txt` 是视频的链接,它的内容类似
45 |
46 | ```
47 | http://v.stu.126.net/mooc-video/nos/mp4/2017/02/21/1005820377_aa6e1b0d92314cdfaf6dcad3351b3533_shd.mp4?ak=99ed7479ee303d1b1361b0ee5a4abcee11069a7277fd2bfbd983de77f6586b3ab4d3781458cdbd61bf0041fae59dee85cb91769ba5850a28845217d0bc9bfb580015e48ffc49c659b128bfe612dda086d65894b8ef217f1626539e3c9eb40879c29b730d22bdcadb1b4f67996129275fa4c38c6336120510aea1ae1790819de86e0fa3e09eeabea1b068b3d9b9b6597acf0c219eb000a69c12ce9d568813365b3e099fcdb77c69ca7cd6141d92c122af
48 | http://v.stu.126.net/mooc-video/nos/mp4/2017/02/21/1005822368_a91783c5f05a49e29960d24f1dc06f15_shd.mp4?ak=99ed7479ee303d1b1361b0ee5a4abcee11069a7277fd2bfbd983de77f6586b3a33090c48273cc5e338f1d269a2b016013857294759d07b499e26c45d788128b30015e48ffc49c659b128bfe612dda086d65894b8ef217f1626539e3c9eb40879c29b730d22bdcadb1b4f67996129275fa4c38c6336120510aea1ae1790819de86e0fa3e09eeabea1b068b3d9b9b6597acf0c219eb000a69c12ce9d568813365b3e099fcdb77c69ca7cd6141d92c122af
49 | ...
50 | ```
51 |
52 | 复制到下载工具下载,比如 [aria2](https://github.com/aria2/aria2/releases)、迅雷 等,也可以直接在浏览器中打开。
53 |
54 | ### 视频文件名
55 |
56 | `Rename.bat` (或 `Rename.sh`)用于将视频重命名,它的内容类似
57 |
58 | ```
59 | CHCP 65001
60 |
61 | REN "1005820377_aa6e1b0d92314cdfaf6dcad3351b3533_shd.mp4" "1.1.1 空间直角坐标系的基本概念(视频).mp4"
62 | REN "1005822368_a91783c5f05a49e29960d24f1dc06f15_shd.mp4" "1.2.1 向量的坐标表示(视频).mp4"
63 | REN "1005817378_500b5301360f49c18c6f8d3406959cf5_shd.mp4" "1.3.1 向量的模、方向余弦、投影(视频).mp4"
64 | REN "1005821395_ff485bb1e65145ec90bf04a259eb6b0e_shd.mp4" "2.1.1 向量的数量积(视频).mp4"
65 | REN "1005821396_9180e5908bc847548a8db625af9b1ad7_shd.mp4" "2.2.1 向量的数量积(续)(视频).mp4"
66 | REN "1005817386_18d7ede415ec4cb5befa71a9d790ce0f_shd.mp4" "2.3.1 向量的向量积(视频).mp4"
67 | REN "1005822373_8bf3846066e045cda306bd7d27e38786_shd.mp4" "2.4.1 向量的向量积(续)(视频).mp4"
68 | REN "1005899086_7780acc4ac074ed89b6301e41349a2c1_shd.mp4" "3.1.1 平面方程(视频).mp4"
69 | ...
70 | ```
71 |
72 | 下载下来的视频文件名是一团糟的,比如
73 |
74 | ```
75 | 1005820377_aa6e1b0d92314cdfaf6dcad3351b3533_shd.mp4
76 | ```
77 |
78 | 运行该文件,视频的文件名就清晰整齐了,也会按照章节次序排列。
79 |
80 | ::: tip
81 |
82 | - `Windows` 下,当视频和这个文件在同一个文件夹时直接**双击**该文件即可运行
83 | - `*nix` 需要在终端运行 `sh Rename.sh`
84 |
85 | :::
86 |
87 | ### 播放列表
88 |
89 | 打开 `Playlist.dpl` 即可播放 `Videos/` 中的视频。
90 |
91 | 由于文件系统的限制,特殊字符比如 `"` `/` `\` 都不允许出现在文件名中,所以文件名中的特殊字符是被删除的。假如原视频的标题是「有 3/4 的概率会下雨」,就会变成 `有 34 的概率会下雨`,就很奇怪吧。而播放的列表就可以解决这个问题,在播放列表中会显示 `有 3/4 的概率会下雨`。
92 |
93 | ## 说明
94 |
95 | 学堂在线暂时只有 `Books`,没有 `PDFs`,因为如果提供 `PPT` 的话,在讲义那一栏就有链接可以下载。
96 |
--------------------------------------------------------------------------------
/docs/guide/faq.md:
--------------------------------------------------------------------------------
1 | # FAQ
2 |
3 | ::: danger Q1:
4 |
5 | 我的登录信息输错了(失效了),怎么重新填写?
6 |
7 | :::
8 |
9 | ::: tip A1:
10 |
11 | 重新启动程序,启动时添加参数 `-c` 并输入新的 Cookies
12 |
13 | :::
14 |
15 | ---
16 |
17 | ::: danger Q2:
18 |
19 | 我遇到了一个课程无法成功获取,最快捷的反馈方式是?
20 |
21 | :::
22 |
23 | ::: tip A2:
24 |
25 | 依次进行如下检查:
26 |
27 | - Cookie 是否失效,如果失效请使用参数 `-c` 并重新输入
28 | - 当前账号是否加入了该课程,并对该课程**有访问权限**(比如该学期是否是开启状态,课程是否是付费才能观看)
29 | - [Github issues](https://github.com/Foair/course-crawler/issues) 中是否有相似问题与解决方案
30 |
31 | 如果仍然无法解决,请在 Github 提出 [issue](https://github.com/Foair/course-crawler/issues/new) ,或者[邮件联系我](mailto:sigure_mo@163.com),我会尽快处理
32 |
33 | :::
34 |
35 | ---
36 |
37 | ::: danger Q3:
38 |
39 | 我想看原版文档
40 |
41 | :::
42 |
43 | ::: tip A3:
44 |
45 | 请前往 [Foair 的文档](https://mooc.xoy.io/) 查看
46 |
47 | :::
48 |
49 | ---
50 |
51 | ::: danger Q4:
52 |
53 | 如何参与文档的修改?
54 |
55 | :::
56 |
57 | ::: tip A4:
58 |
59 | 点击文档左下角的“在 GitHub 上编辑此页” 即可~
60 |
61 | :::
62 |
--------------------------------------------------------------------------------
/docs/guide/getting-started.md:
--------------------------------------------------------------------------------
1 | # 快速开始
2 |
3 |
4 |
5 | ## 准备工作
6 |
7 | 在下载之前,你需要保证你已经安装 `python3.5` 及其以上版本,并且安装完成依赖
8 |
9 | 需要的依赖如下
10 |
11 | - `requests`
12 | - `BeautifulSoup4`
13 | - `lxml`
14 | - `pycryptodome`
15 |
16 | ```bash
17 | pip install requests BeautifulSoup4 lxml pycryptodome
18 | ```
19 |
20 | ## 下载程序源码
21 |
22 | 前往项目主页下载程序,或者直接点击[这里](https://github.com/SigureMo/course-crawler/archive/master.zip),之后解压
23 |
24 | 当然,已经安装 `git` 的同学可以直接 `clone`
25 |
26 | ```bash
27 | git clone https://github.com/SigureMo/course-crawler.git
28 | ```
29 |
30 | ## 运行程序
31 |
32 | 在刚刚下载的项目根目录下打开命令行(“终端”、“命令提示符”、“PowerShell”都行,`Win10` 在项目根目录按住 `shift` 右键就有相应的选项,后面统称命令行)
33 |
34 | 在命令行中输入 `python mooc.py <url>` ,即可将课程课件下载到当前文件夹
35 |
36 | 比如,中国大学 MOOC 课程 `《高等数学(一)》 - 同济大学`
37 |
38 | ```bash
39 | python mooc.py https://www.icourse163.org/course/TONGJI-53004
40 | ```
41 |
42 | ::: tip
43 | 这里的 `<url>` 为课程主页的地址,网址的具体要求及课程下载的额外要求详见[分类](../courses/icourse163.md)
44 | :::
45 |
46 | ## 身份验证
47 |
48 | 很多课程并不是直接就能下载的,需要验证下你的身份,这大多都可以通过输入 Cookies 解决
49 |
50 | 当你下载的课程需要输入 Cookies 时,用浏览器打开课程主页,然后按下 `F12` 打开开发者工具
51 |
52 | 切换到 `Network` 选项卡,刷新页面,在左侧选择第一个抓到的包,在右侧 `Headers` 中找到 `cookie` (也可能是 `Cookie`),复制粘贴到程序命令行中
53 |
54 | 
55 |
56 | ::: tip
57 |
58 | 如果你和我一样懒的话,可以直接三击 cookies 快速将整个 cookies 及前面的 `cookie:` 一起选中,直接复制粘贴到程序中,也是可以的,反正我是懒得从左上滑到右下啦,所以特意做了这个小“优化”~
59 |
60 | :::
61 |
62 | ## 等待 ...
63 |
64 | 等待程序运行,程序首先会从课程主页获取课件列表及解析所需相关信息,之后逐个课件进行解析下载
65 |
66 | ## 下载视频
67 |
68 | 特别地,由于视频资源相对来说花费时间较多,所以视频资源并不是在解析时直接进行下载,而是解析出 `url` 至 `<dir>/Videos/Videos.txt` ,之后需要你自行使用下载工具进行下载(比如 `aria2` ,或者迅雷等)
69 |
70 | 下载后将视频移动到 `<dir>/Videos/` 内,之后双击 `Rename.bat` 即可修正视频名
71 |
72 | ::: tip
73 |
74 | - 这里的 `<dir>` 指课程根目录
75 | - Linux 下的使用以及 `Rename` 文件详情请见[视频文件名](basic.html#视频文件名)
76 |
77 | :::
78 |
79 | ## 视频的播放
80 |
81 | 使用 PotPlayer 打开 `Playlist.dpl` 即可播放视频
82 |
83 | ::: tip
84 |
85 | 如果你并不想使用 PotPlayer ,请修改[播放列表设置](../advance/cli.html#播放列表设置)
86 |
87 | :::
88 |
--------------------------------------------------------------------------------
/docs/guide/known-issues.md:
--------------------------------------------------------------------------------
1 | # 已知问题
2 |
3 | ::: warning Q1:
4 |
5 | 可能会出现被远程主机强制关闭一个连接。
6 |
7 | :::
8 |
9 | ::: tip A1:
10 |
11 | 解决方法:等待一段时间然后重新尝试。
12 |
13 | :::
14 |
15 | ---
16 |
17 | ::: warning Q2:
18 |
19 | 网易云课堂(MOOC) 的 Cookie 很容易失效。
20 |
21 | :::
22 |
23 | ::: tip A2:
24 |
25 | 解决方法:更加频繁地修改 Cookie。
26 |
27 | :::
28 |
29 | ---
30 |
31 | ::: warning Q3:
32 |
33 | Windows 下不能自动删除 `process.out`。
34 |
35 | :::
36 |
37 | ::: tip A3:
38 |
39 | 解决方法:手动删除 :joy:。
40 |
41 | :::
42 |
--------------------------------------------------------------------------------
/docs/guide/notice.md:
--------------------------------------------------------------------------------
1 | # 告示板
2 |
## Course Crawler
4 |
5 | 仅限个人学习和研究使用,切勿用于其他用途。强烈建议到 MOOC 网站进行学习,本程序只是提供一个备选方案。
6 |
7 | 本程序主体功能只是下载课件和附件,无任何手段获得付费课程,也没有以任何方式向任何人收取费用。
8 |
9 | 如果将程序用于商业用途或其他非法用途,一切后果由用户自负。
10 |
11 | 如果您发现有侵犯到您的合法权益,请与我联系删除相关程序,同时我对无意冒犯到您致以深深的歉意。
12 |
13 | 许可协议:MIT
14 |
15 | ## 本文档
16 |
17 | 许可协议:CC0
18 |
19 | ## 与原作联系
20 |
21 | SigureMo/course-crawler 基于 Foair 的 Course Crawler ,修复部分 bug ,并且增加部分新功能,本文档亦然
22 |
23 | 本程序的所有“完善”工作均离不开 Foair 原有的框架,在我刚刚接触到这个项目的时候,Foair 给了我莫大的鼓励与支持,这对我之后的 Coding 风格产生了极大的影响
24 |
25 | ## 推广
26 |
27 | - [bilili-dl](https://github.com/SigureMo/bilili-dl) B 站视频下载器,支持普通视频以及番剧的下载,B 站也是有很多不错的课程的
28 | - [mooc-dl](https://github.com/SigureMo/mooc-dl) 中国大学 MOOC 爬虫,使用手机端接口,可作为本项目 icourse163 的备用接口
29 |
30 | ## 感谢
31 |
32 | - vuepress [https://github.com/vuejs/vuepress](https://github.com/vuejs/vuepress)
33 | - [Foair/course-crawler](https://github.com/Foair/course-crawler)
34 | - [https://mooc.xoy.io/](https://mooc.xoy.io/)
35 |
36 | 以及你们的支持,有你们, Course Crawler 才能更加完善~
37 |
--------------------------------------------------------------------------------
/docs/images/get_cookies.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SigureMo/course-crawler/5828d61ff69ddc344c573ec06e198f137aa9164b/docs/images/get_cookies.png
--------------------------------------------------------------------------------
/docs/images/icourse163_01.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SigureMo/course-crawler/5828d61ff69ddc344c573ec06e198f137aa9164b/docs/images/icourse163_01.png
--------------------------------------------------------------------------------
/mooc.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """MOOC 课程下载"""
3 |
4 | import os
5 | import sys
6 | import re
7 | import argparse
8 |
9 | from moocs.utils import aria2_download, store_cookies
10 |
11 |
def main():
    """Parse command-line arguments and dispatch the download to the matching MOOC module."""

    parser = argparse.ArgumentParser(description='Course Crawler')
    parser.add_argument('url', help='课程地址')
    parser.add_argument('-c', '--restore-cookies', action='store_true',
                        help='执行任务的时候重新输入 cookies')
    parser.add_argument('-d', '--dir', default=r'', help='下载目录')
    # choices + type=str.lower make argparse reject an invalid quality with a
    # clear error message instead of letting resolutions.index() raise an
    # opaque ValueError further below.
    parser.add_argument('-r', '--quality', default='shd', type=str.lower,
                        choices=['shd', 'hd', 'sd'], help='视频清晰度')
    parser.add_argument('-w', '--overwrite',
                        action='store_true', help='强制覆盖重新下载')
    parser.add_argument('--inter', action='store_true', help='交互式修改文件名')
    parser.add_argument('--no-doc', action='store_false',
                        help='不下载 PDF、Word 等文档')
    parser.add_argument('--no-sub', action='store_false', help='不下载字幕')
    parser.add_argument('--no-file', action='store_false', help='不下载附件')
    parser.add_argument('--no-text', action='store_false', help='不下载富文本')
    parser.add_argument("--playlist-type", default="dpl",
                        choices=["dpl", "m3u", "no"], help="播放列表类型,支持 dpl 和 m3u,输入 no 不生成播放列表")
    parser.add_argument("--abs-path", action='store_true',
                        help="播放列表路径使用绝对路径,默认为相对路径")
    parser.add_argument('--aria2', action='store_true', help='自动调用aria2下载视频')

    args = parser.parse_args()
    resolutions = ['shd', 'hd', 'sd']
    playlist_path_type = 'AP' if args.abs_path else 'RP'

    # NOTE: the --no-* flags use store_false, so e.g. args.no_doc is True
    # unless --no-doc was given; the config keys therefore mean "download it".
    config = {'doc': args.no_doc, 'sub': args.no_sub, 'file': args.no_file, 'text': args.no_text,
              'rename': args.inter, 'dir': args.dir, 'resolution': resolutions.index(args.quality),
              'overwrite': args.overwrite, 'playlist_type': args.playlist_type, 'playlist_path_type': playlist_path_type,
              'aria2': args.aria2}

    # Dispatch on the course URL. Literal dots in domains are escaped so '.'
    # is not treated as a regex wildcard.
    if re.match(r'https?://www\.icourse163\.org/(spoc/)?(course|learn)/', args.url):
        from moocs import icourse163 as mooc
    elif re.match(r'https?://www\.xuetangx\.com/courses/.+/about', args.url):
        from moocs import xuetangx as mooc
    elif re.match(r'https?://next\.xuetangx\.com/course/.+', args.url):
        from moocs import xuetangx_next as mooc
    elif re.match(r'https?://mooc\.study\.163\.com/(course|learn)/', args.url):
        from moocs import study_mooc as mooc
    elif re.match(r'https?://study\.163\.com/course/', args.url):
        from moocs import study_163 as mooc
    elif re.match(r'https?://open\.163\.com/(special|movie)/', args.url):
        from moocs import open_163 as mooc
    elif re.match(r'https?://www\.cnmooc\.org/portal/course/', args.url):
        from moocs import cnmooc as mooc
    elif re.match(r'https?://www\.icourses\.cn/web/sword/portal/videoDetail', args.url):
        from moocs import icourses as mooc
    elif re.match(r'https?://www\.icourses\.cn/sCourse/course_\d+\.html', args.url) or \
            re.match(r'https?://www\.icourses\.cn/web/sword/portal/shareDetails\?cId=', args.url):
        from moocs import icourses_share as mooc
    elif re.match(r'https?://www\.livedu\.com\.cn/ispace4\.0/moocxjkc/toKcView\.do\?kcid=', args.url):
        from moocs import livedu as mooc
    else:
        print('课程地址有误!')
        sys.exit(1)

    # Sites that require authentication load (or re-enter) cookies first.
    if mooc.need_cookies:
        cookies = store_cookies(mooc.name, restore=args.restore_cookies)
    else:
        cookies = None

    mooc.start(args.url, config, cookies)

    # Hand the (url, file name) pairs collected during parsing over to aria2.
    if config['aria2']:
        workdir = mooc.exports["workdir"]
        workdir.change('Videos')
        videos = mooc.exports["videos"]
        aria2_download(videos, workdir.path, overwrite=config["overwrite"])
82 |
83 |
84 | if __name__ == '__main__':
85 | main()
86 |
--------------------------------------------------------------------------------
/moocs/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SigureMo/course-crawler/5828d61ff69ddc344c573ec06e198f137aa9164b/moocs/__init__.py
--------------------------------------------------------------------------------
/moocs/cnmooc.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """好大学在线"""
3 |
4 | from bs4 import BeautifulSoup
5 |
6 | from moocs.utils import *
7 | from utils.crawler import Crawler
8 |
9 | name = "cnmooc"
10 | need_cookies = True
11 | CANDY = Crawler()
12 | CONFIG = {}
13 | FILES = {}
14 | VIDEOS = []
15 | exports = {}
16 | __all__ = ["name", "need_cookies", "start", "exports"]
17 |
18 |
def get_summary(url):
    """Fetch the course home page and build the course directory name."""

    page = CANDY.get(url).text
    soup = BeautifulSoup(page, 'lxml')

    course_title = soup.find(class_='view-title substr').get_text(strip=True)
    school = soup.find(class_='person-attach substr').get_text(strip=True)

    dir_name = course_dir(course_title, school)
    print(dir_name)
    return dir_name
30 |
31 |
def get_resource(course_nav):
    """Walk the unit-navigation page; return (video_list, document_list)."""

    counter = Counter()
    outline = Outline()
    video_list = []
    document_list = []

    res = CANDY.get(course_nav).text
    soup = BeautifulSoup(res, 'lxml')
    nav = soup.find(id='unitNavigation')
    chapters = nav.find_all(class_='view-chapter')
    for chapter in chapters:
        chapter_name = chapter.find(
            class_='chapter-text substr').get_text(strip=True)
        counter.add(0)
        outline.write(chapter_name, counter, 0)

        lectures = chapter.find_all(class_='view-lecture')
        for lecture in lectures:
            actions = lecture.find(class_='lecture-title')
            lecture_name = actions.get_text(strip=True)
            counter.add(1)
            outline.write(lecture_name, counter, 1)
            # unitid = actions.a['unitid']
            # print(unitid)
            group = actions.div.find_all('a')
            # The icon class distinguishes the resource type:
            # 'icon-play' -> video, 'icon-doc' -> downloadable document.
            videos = list(
                filter(lambda action: 'icon-play' in action.i['class'][0], group))
            docs = list(
                filter(lambda action: 'icon-doc' in action.i['class'][0], group))
            for video in videos:
                counter.add(2)
                outline.write(video['title'], counter, 2, sign='#')
                # A lecture with several videos gets a numeric suffix so the
                # generated file names stay unique.
                if len(videos) == 1:
                    extra_num = ''
                else:
                    extra_num = '-%s' % str(counter)[-1:]
                video_list.append(
                    Video(counter, lecture_name + extra_num, video['itemid']))
            counter.reset()
            for doc in docs:
                counter.add(2)
                outline.write(doc['title'], counter, 2, sign='*')
                document_list.append(
                    Document(counter, lecture_name, doc['itemid']))
    return video_list, document_list
82 |
83 |
def parse_resource(video):
    """Resolve a video's real URL (and subtitles) and queue it for download."""

    # The playback page yields the node id...
    res = CANDY.post('https://www.cnmooc.org/study/play.mooc',
                     data={'itemId': video.meta, 'itemType': '10', 'testPaperId': ''}).text
    soup = BeautifulSoup(res, 'lxml')
    node_id = soup.find(id='nodeId')['value']

    # ...which the detail endpoint turns into the actual media metadata.
    res = CANDY.post('https://www.cnmooc.org/item/detail.mooc',
                     data={'nodeId': node_id, 'itemId': video.meta}).json()
    if WORK_DIR.need_download(video.file_name+".mp4", CONFIG["overwrite"]):
        url = res['node']['flvUrl']
        FILES['videos'].write_string(url)
        FILES['renamer'].write(url.split('/')[-1], video.file_name)
        VIDEOS.append((url, video.file_name+".mp4"))

    if CONFIG['sub']:
        # nodeExts carries one subtitle track per language; the language code
        # is appended to the name only when there is more than one track.
        exts = res['node']['nodeExts']
        for ext in exts:
            file_name = '%s%s.srt' % (video.file_name, '' if len(
                exts) == 1 else '_' + ext['languageCode'])
            if WORK_DIR.need_download(file_name, CONFIG["overwrite"]):
                CANDY.download_bin('https://static.cnmooc.org' +
                                   ext['node']['rsUrl'], WORK_DIR.file(file_name))
108 |
109 |
def get_doc(doc_list):
    """Download lecture documents (slides etc.) into the Docs directory."""

    WORK_DIR.change('Docs')
    for doc in doc_list:
        post_data = {'itemId': doc.meta, 'itemType': '20', 'testPaperId': ''}
        res = CANDY.post(
            'https://www.cnmooc.org/study/play.mooc', data=post_data).text
        # The document URL is embedded in an isSlideShow("...") JS call.
        # BUG FIX: with the optional group (.+)? an empty isSlideShow("")
        # matched with group(1) == None, and the subsequent url.split()
        # raised an uncaught TypeError; guard both "no match" and "empty".
        match = re.search(r'isSlideShow\("(.+)?"\);', res)
        if match is None or match.group(1) is None:
            continue
        url = match.group(1)
        ext = url.split('.')[-1]
        file_name = doc.file_name
        if WORK_DIR.need_download(file_name + '.' + ext, CONFIG["overwrite"]):
            CANDY.download_bin('https://static.cnmooc.org' + url,
                               WORK_DIR.file(file_name + '.' + ext))
126 |
127 |
def start(url, config, cookies=None):
    """Entry point called by mooc.py.

    url     -- course home page URL
    config  -- global download options (dir, overwrite, doc, rename, ...)
    cookies -- cookie dict for the authenticated session
    """

    global WORK_DIR
    CONFIG.update(config)

    CANDY.set_cookies(cookies)

    course_info = get_summary(url)
    WORK_DIR = WorkingDir(CONFIG['dir'], course_info)
    WORK_DIR.change('Videos')

    # Rename script and the plain list of video URLs for external downloaders.
    FILES['renamer'] = Renamer(WORK_DIR.file('Rename.{ext}'))
    FILES['videos'] = ClassicFile(WORK_DIR.file('Videos.txt'))

    # The unit-navigation page is keyed by the last path segment of the URL.
    course = 'https://www.cnmooc.org/portal/session/unitNavigation/'
    course_nav = course + url.split('/')[-1]
    resource = get_resource(course_nav)

    rename = WORK_DIR.file('Names.txt') if CONFIG['rename'] else False

    playlist = get_playlist(CONFIG["playlist_type"], CONFIG["playlist_path_type"])
    if playlist:
        parse_res_list(resource[0], rename, playlist.write, parse_resource)
    else:
        parse_res_list(resource[0], rename, parse_resource)

    if CONFIG['doc']:
        get_doc(resource[1])

    # Shared with mooc.py for the optional aria2 download stage.
    exports.update({
        "workdir": WORK_DIR,
        "spider": CANDY,
        "videos": VIDEOS
    })
163 |
--------------------------------------------------------------------------------
/moocs/icourse163.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """中国大学MOOC"""
3 |
4 | import json
5 | import time
6 | import sys
7 |
8 | from moocs.utils import *
9 | from utils.crawler import Crawler
10 |
11 | name = "icourse163"
12 | need_cookies = True
13 | CANDY = Crawler()
14 | CONFIG = {}
15 | FILES = {}
16 | VIDEOS = []
17 | exports = {}
18 | __all__ = ["name", "need_cookies", "start", "exports"]
19 |
20 |
def get_summary(url):
    """Fetch the course home page; return (term_id, directory name)."""

    home_url = url.replace('learn/', 'course/')
    page = CANDY.get(home_url).text

    term_id = re.search(r'termId : "(\d+)"', page).group(1)
    names = re.findall(r'name:"(.+)"', page)

    dir_name = course_dir(*names[:2])
    print(dir_name)

    CONFIG['term_id'] = term_id
    return term_id, dir_name
35 |
36 |
def parse_resource(resource):
    """Resolve one resource (Video / Document / Rich text) and download it.

    The DWR endpoint returns a JS blob from which URLs are regex-extracted.
    """

    post_data = {'callCount': '1', 'scriptSessionId': '${scriptSessionId}190',
                 'httpSessionId': '5531d06316b34b9486a6891710115ebc', 'c0-scriptName': 'CourseBean',
                 'c0-methodName': 'getLessonUnitLearnVo', 'c0-id': '0', 'c0-param0': 'number:' + resource.meta[0],
                 'c0-param1': 'number:' + resource.meta[1], 'c0-param2': 'number:0',
                 'c0-param3': 'number:' + resource.meta[2], 'batchId': str(int(time.time()) * 1000)}
    res = CANDY.post('https://www.icourse163.org/dwr/call/plaincall/CourseBean.getLessonUnitLearnVo.dwr',
                     data=post_data).text

    file_name = resource.file_name
    if resource.type == 'Video':
        if CONFIG["hasToken"]:
            # Token flow: exchange the CSRF token for a video signature, then
            # query the VOD API for the quality/format variants.
            video_token = CANDY.post('https://www.icourse163.org/web/j/resourceRpcBean.getResourceToken.rpc?csrfKey='+CONFIG['token'], data={
                'bizId': resource.meta[2],
                'bizType': 1,
                'contentType': 1,
            }).json()['result']['videoSignDto']['signature']
            data = CANDY.post('https://vod.study.163.com/eds/api/v1/vod/video', data={
                'videoId': resource.meta[0],
                'signature': video_token,
                'clientType': '1'
            }).json()

            # Quality codes: 3 = super HD, 2 = HD, 1 = SD; start from the
            # requested quality and fall back to lower ones.
            resolutions = [3, 2, 1]
            url, ext = '', ''
            for sp in resolutions[CONFIG['resolution']:]:
                # TODO: 增加视频格式选择
                for video in data['result']['videos']:
                    if video['quality'] == sp and video['format'] == 'mp4':
                        url = video['videoUrl']
                        ext = '.mp4'
                        break
                else:
                    continue
                break
            assert ext, "近期中国大学 MOOC 接口变动,请临时使用 https://github.com/SigureMo/mooc-dl"

            if WORK_DIR.need_download(file_name + ext, CONFIG["overwrite"]):
                FILES['renamer'].write(
                    re.search(r'(\w+\.mp4)', url).group(1), file_name, ext)
                FILES['video'].write_string(url)
                VIDEOS.append((url, file_name+ext))
                resource.ext = ext
        else:
            # Legacy flow: URLs are embedded directly in the DWR response.
            resolutions = ['Shd', 'Hd', 'Sd']
            url, ext = '', ''
            for sp in resolutions[CONFIG['resolution']:]:
                # TODO: 增加视频格式选择
                # video_info = re.search(r'%sUrl="(?P<url>.*?(?P<ext>\.((m3u8)|(mp4)|(flv))).*?)"' % sp, res)
                # BUG FIX: restored the named groups <ext>/<url>; without them
                # the pattern is an invalid regex and .group('url', 'ext')
                # could never work.
                video_info = re.search(r'(?P<ext>mp4)%sUrl="(?P<url>.*?\.(?P=ext).*?)"' % sp, res)
                if video_info:
                    url, ext = video_info.group('url', 'ext')
                    ext = '.' + ext
                    break
            assert ext, "近期中国大学 MOOC 接口变动,请临时使用 https://github.com/SigureMo/mooc-dl"

            url = url.replace('v.stu.126.net', 'jdvodrvfb210d.vod.126.net')
            # Probe the CDN host; fall back to an alternate host on failure.
            if CANDY.head(url, allow_redirects=True, timeout=20).status_code != 200:
                url = url.replace('mooc-video', 'jdvodrvfb210d')
            if WORK_DIR.need_download(file_name + ext, CONFIG["overwrite"]):
                FILES['renamer'].write(re.search(r'(\w+\.((m3u8)|(mp4)|(flv)))', url).group(1), file_name, ext)
                FILES['video'].write_string(url)
                VIDEOS.append((url, file_name+ext))
                resource.ext = ext

        if not CONFIG['sub']:
            return
        # Subtitles: the language code is appended to the file name only when
        # more than one track exists.
        subtitles = re.findall(r'name="(.+)";.*url="(.*?)"', res)
        for subtitle in subtitles:
            if len(subtitles) == 1:
                sub_name = file_name + '.srt'
            else:
                subtitle_lang = subtitle[0].encode(
                    'utf_8').decode('unicode_escape')
                sub_name = file_name + '_' + subtitle_lang + '.srt'
            if not WORK_DIR.need_download(sub_name, CONFIG["overwrite"]):
                continue
            CANDY.download_bin(subtitle[1], WORK_DIR.file(sub_name))

    elif resource.type == 'Document':
        if not WORK_DIR.need_download(file_name + '.pdf', CONFIG["overwrite"]):
            return
        pdf_url = re.search(r'textOrigUrl:"(.*?)"', res).group(1)
        CANDY.download_bin(pdf_url, WORK_DIR.file(file_name + '.pdf'))

    elif resource.type == 'Rich':
        if not WORK_DIR.need_download(file_name + '.html', CONFIG["overwrite"]):
            return
        text = re.search(r'htmlContent:"(.*)",id',
                         res.encode('utf_8').decode('unicode_escape'), re.S).group(1)
        with open(WORK_DIR.file(file_name + '.html'), 'w', encoding='utf_8') as file:
            file.write(text)
131 |
132 |
def get_resource(term_id):
    """Parse the term's DWR dump and download/queue every resource.

    Chapters, lessons and per-lesson resources (videos, PDFs, rich texts and
    their attachments) are regex-extracted from the unicode-unescaped
    DWR response.
    """

    outline = Outline()
    counter = Counter()

    video_list = []
    pdf_list = []
    rich_text_list = []

    post_data = {'callCount': '1', 'scriptSessionId': '${scriptSessionId}190', 'c0-scriptName': 'CourseBean',
                 'c0-methodName': 'getMocTermDto', 'c0-id': '0', 'c0-param0': 'number:' + term_id,
                 'c0-param1': 'number:0', 'c0-param2': 'boolean:true', 'batchId': str(int(time.time()) * 1000)}
    res = CANDY.post('https://www.icourse163.org/dwr/call/plaincall/CourseBean.getMocTermDto.dwr',
                     data=post_data).text.encode('utf_8').decode('unicode_escape')

    chapters = re.findall(r'homeworks=\w+;.+id=(\d+).+name="([\s\S]+?)";', res)
    for chapter in chapters:
        counter.add(0)
        outline.write(chapter[1], counter, 0)

        lessons = re.findall(
            r'chapterId=' + chapter[0] + r'.+contentId=null.+contentType=1.+id=(\d+).+name="([\s\S]+?)"', res)
        for lesson in lessons:
            counter.add(1)
            outline.write(lesson[1], counter, 1)

            # contentType=1: videos
            videos = re.findall(r'contentId=(\d+).+contentType=(1).+id=(\d+).+lessonId=' +
                                lesson[0] + r'.+name="([\s\S]+?)"', res)
            for video in videos:
                counter.add(2)
                outline.write(video[3], counter, 2, sign='#')
                video_list.append(Video(counter, video[3], video))
            counter.reset()

            # contentType=3: PDF documents
            pdfs = re.findall(r'contentId=(\d+).+contentType=(3).+id=(\d+).+lessonId=' +
                              lesson[0] + r'.+name="([\s\S]+?)"', res)
            for pdf in pdfs:
                counter.add(2)
                outline.write(pdf[3], counter, 2, sign='*')
                if CONFIG['doc']:
                    pdf_list.append(Document(counter, pdf[3], pdf))
            counter.reset()

            # contentType=4: rich texts (jsonContent may carry an attachment)
            # BUG FIX: the name pattern was "([\s\S]]+?)" -- the stray ']'
            # required every rich-text name to contain ']' and broke matching;
            # all sibling patterns use "([\s\S]+?)".
            rich_text = re.findall(r'contentId=(\d+).+contentType=(4).+id=(\d+).+jsonContent=(.+?);.+lessonId=' +
                                   lesson[0] + r'.+name="([\s\S]+?)"', res)
            for text in rich_text:
                counter.add(2)
                outline.write(text[4], counter, 2, sign='+')
                if CONFIG['text']:
                    rich_text_list.append(RichText(counter, text[4], text))
                if CONFIG['file']:
                    if text[3] != 'null' and text[3] != '""':
                        params = {'nosKey': re.search('nosKey":"(.+?)"', text[3]).group(1),
                                  'fileName': re.search('"fileName":"(.+?)"', text[3]).group(1)}
                        file_name = Resource.file_to_save(params['fileName'])
                        outline.write(file_name, counter, 2, sign='!')

                        WORK_DIR.change('Files')
                        file_name = '%s %s' % (counter, file_name)
                        if WORK_DIR.need_download(file_name, CONFIG["overwrite"]):
                            CANDY.download_bin('https://www.icourse163.org/course/attachment.htm',
                                               WORK_DIR.file(file_name), params=params)
            counter.reset()

    if video_list:
        rename = WORK_DIR.file('Names.txt') if CONFIG['rename'] else False
        WORK_DIR.change('Videos')
        playlist = get_playlist(CONFIG["playlist_type"], CONFIG["playlist_path_type"])
        if playlist is not None:
            parse_res_list(video_list, rename, parse_resource, playlist.write)
        else:
            parse_res_list(video_list, rename, parse_resource)
    if pdf_list:
        WORK_DIR.change('PDFs')
        parse_res_list(pdf_list, None, parse_resource)
    if rich_text_list:
        WORK_DIR.change('Texts')
        parse_res_list(rich_text_list, None, parse_resource)
212 |
213 |
def start(url, config, cookies=None):
    """Entry point called by mooc.py.

    cookies defaults to None for signature consistency with the other
    moocs modules (need_cookies is True, so mooc.py always supplies a dict).
    """

    global WORK_DIR
    CANDY.set_cookies(cookies)
    CONFIG.update(config)

    # The NTESSTUDYSI cookie doubles as the CSRF key for the token-based
    # VOD API; without it parse_resource() uses the legacy DWR parsing.
    if cookies and cookies.get('NTESSTUDYSI'):
        CONFIG['hasToken'] = True
        CONFIG['token'] = cookies.get('NTESSTUDYSI')
    else:
        CONFIG['hasToken'] = False

    term_id, dir_name = get_summary(url)
    WORK_DIR = WorkingDir(CONFIG['dir'], dir_name)
    WORK_DIR.change('Videos')
    FILES['renamer'] = Renamer(WORK_DIR.file('Rename.{ext}'))
    FILES['video'] = ClassicFile(WORK_DIR.file('Videos.txt'))

    get_resource(term_id)

    # Shared with mooc.py for the optional aria2 download stage.
    exports.update({
        "workdir": WORK_DIR,
        "spider": CANDY,
        "videos": VIDEOS
    })
240 |
--------------------------------------------------------------------------------
/moocs/icourses.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """爱课程"""
3 |
4 | from moocs.utils import *
5 | from bs4 import BeautifulSoup
6 | import re
7 | import json
8 | from utils.crawler import Crawler
9 |
10 | name = "icourses"
11 | need_cookies = False
12 | CANDY = Crawler()
13 | CONFIG = {}
14 | FILES = {}
15 | VIDEOS = []
16 | exports = {}
17 | __all__ = ["name", "need_cookies", "start", "exports"]
18 |
19 |
def get_content(url):
    """Scrape the course page; return (directory name, resource JSON list)."""

    page = CANDY.get(url).text
    soup = BeautifulSoup(page, 'lxml')

    # The second-to-last <script> embeds the resource array as JS source.
    script = soup.find_all('script')[-2].string
    source_json = re.search(r'_sourceArrStr = (.*);', script)
    res_info = json.loads(source_json.group(1))

    school = soup.find(class_='teacher-infor-from').string
    title = soup.find(class_='coursetitle pull-left').a.string
    dir_name = course_dir(title, school)

    print(dir_name)
    return dir_name, res_info
33 |
34 |
def parse_res(js):
    """Build Video objects (and outline entries) from the parsed JSON list."""
    outline = Outline()
    # Zero-pad lesson numbers so file names sort correctly.
    width = len(str(len(js)))
    video_list = []
    for index, lesson in enumerate(js, start=1):
        number = str(index).zfill(width)
        title = lesson['title']
        url = lesson['fullLinkUrl']
        outline.write_string('%s {%s}#' % (title, number))
        video_list.append(Video(number, title, url))

    return video_list
51 |
52 |
def parse_video(video):
    """Record a video's URL into the download/rename lists (skip if present)."""

    target = video.file_name + ".mp4"
    if not WORK_DIR.need_download(target, CONFIG["overwrite"]):
        return
    FILES['videos'].write_string(video.meta)
    FILES['renamer'].write(video.meta.split('/')[-1], video.file_name)
    VIDEOS.append((video.meta, target))
60 |
61 |
def start(url, config, cookies=None):
    """Entry point called by mooc.py (cookies unused: need_cookies is False)."""

    global WORK_DIR
    CONFIG.update(config)

    # course_info: (directory name, resource JSON list)
    course_info = get_content(url)
    WORK_DIR = WorkingDir(CONFIG['dir'], course_info[0])

    WORK_DIR.change('Videos')
    FILES['renamer'] = Renamer(WORK_DIR.file('Rename.{ext}'))
    FILES['videos'] = ClassicFile(WORK_DIR.file('Videos.txt'))
    playlist = get_playlist(CONFIG["playlist_type"], CONFIG["playlist_path_type"])
    if playlist:
        FILES['playlist'] = playlist

    video_list = parse_res(course_info[1])

    rename = WORK_DIR.file('Names.txt') if CONFIG['rename'] else False

    if playlist:
        parse_res_list(video_list, rename,
                       FILES['playlist'].write, parse_video)
    else:
        parse_res_list(video_list, rename, parse_video)

    # Shared with mooc.py for the optional aria2 download stage.
    exports.update({
        "workdir": WORK_DIR,
        "spider": CANDY,
        "videos": VIDEOS
    })
93 |
--------------------------------------------------------------------------------
/moocs/icourses_share.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """爱课程 资源共享课"""
3 | import re
4 |
5 | from bs4 import BeautifulSoup
6 |
7 | from moocs.utils import *
8 | from utils.crawler import Crawler
9 |
10 | name = "icourses_share"
11 | need_cookies = False
12 | CANDY = Crawler()
13 | CONFIG = {}
14 | FILES = {}
15 | VIDEOS = []
16 | exports = {}
17 | __all__ = ["name", "need_cookies", "start", "exports"]
18 |
19 |
def get_summary(url):
    """Extract the course id and build the course directory name.

    Accepts both the shareDetails?cId=... URL form and the
    sCourse/course_<id>.html form; the former is normalised to the latter
    before the page is fetched. Returns (course_id, dir_name).
    """
    # Match once and reuse instead of running the same regex twice.
    share_match = re.match(
        r'https?://www\.icourses\.cn/web/sword/portal/shareDetails\?cId=(\d+)', url)
    if share_match:
        course_id = share_match.group(1)
        url = 'http://www.icourses.cn/sCourse/course_{}.html'.format(course_id)
    else:
        course_id = re.match(
            r'https?://www\.icourses\.cn/sCourse/course_(\d+)\.html', url).group(1)
    res = CANDY.get(url)
    res.encoding = 'utf8'
    soup = BeautifulSoup(res.text, 'lxml')
    name = soup.find('div', class_='course-introduction-infor').find(
        'div', class_='course-title').p.string

    dir_name = course_dir(name, '爱课程资源共享课')

    print(dir_name)

    return course_id, dir_name
40 |
41 |
def parse_resource(resource):
    """Resolve a resource's URL and register/download it by type."""

    file_name = resource.file_name
    if resource.type == 'Video':
        # fullResUrl is the SD stream; fullResUrl2 (when present) is HD.
        video_urls = {}
        video_urls['sd'] = resource.meta['fullResUrl']
        if resource.meta.get('fullResUrl2'):
            video_urls['hd'] = resource.meta['fullResUrl2']

        # Walk from the requested quality downwards; 'sd' always exists,
        # so the loop is guaranteed to bind url.
        resolutions = ['shd', 'hd', 'sd']
        for sp in resolutions[CONFIG['resolution']:]:
            if video_urls.get(sp):
                url = video_urls[sp]
                break

        if WORK_DIR.need_download(file_name+".mp4", CONFIG["overwrite"]):
            FILES['renamer'].write(
                re.search(r'(\w+\.mp4)', url).group(1), file_name)
            FILES['video'].write_string(url)
            VIDEOS.append((url, file_name+".mp4"))
            #resource.ext = ext

        if not CONFIG['sub']:
            return
        # No subtitles have been observed on this platform so far.

    elif resource.type == 'Document':
        pdf_url = resource.meta['fullResUrl']
        if WORK_DIR.need_download(file_name+".pdf", CONFIG["overwrite"]):
            CANDY.download_bin(pdf_url, WORK_DIR.file(file_name + '.pdf'))
73 |
74 |
def get_resource(course_id):
    """Fetch the chapter/lesson structure and collect video & document resources.

    Also best-effort downloads the per-chapter introduction material
    (key points, teaching design, assessment, textbook contents).
    """

    outline = Outline()
    counter = Counter()

    video_list = []
    pdf_list = []

    res = CANDY.get(
        'http://www.icourses.cn/web/sword/portal/shareChapter?cid={}'.format(course_id))
    soup = BeautifulSoup(res.text, 'lxml')
    chapters = soup.find('ul', id='chapters').children
    for chapter in chapters:
        if chapter.name is None:  # skip bare text nodes between elements
            continue
        counter.add(0)
        chapter_id = chapter.attrs['data-id']
        chapter_name = chapter.find(
            'a', class_='chapter-title-text').string.replace('\n\t\t\t\t\t\t\t', ' ')
        outline.write(chapter_name, counter, 0)

        # Chapter introduction -- best effort: any of these links may be
        # absent, in which case the whole section is skipped.
        try:
            important = chapter.find(
                'a', attrs={'title': '重点难点'}).attrs['data-url']
            instructional_design = chapter.find(
                'a', attrs={'title': '教学设计'}).attrs['data-url']
            exam_id = chapter.find(
                'a', attrs={'title': '评价考核'}).attrs['data-id']
            exam_contents = CANDY.post(
                'http://www.icourses.cn/web//sword/common/getTextBody', data={'id': exam_id}).text
            textbook_id = chapter.find(
                'a', attrs={'title': '教材内容'}).attrs['data-id']
            textbook_contents = CANDY.post(
                'http://www.icourses.cn/web//sword/common/getTextBody', data={'id': textbook_id}).text
            WORK_DIR.change('Introduction')
            outline.write('重点难点', counter, 2, sign='*')
            # BUG FIX: '%' must format the file-name string itself, not the
            # path returned by WORK_DIR.file() -- the old code only worked by
            # accident (the '%s' survived into the path) and would break if
            # the directory contained '%' characters.
            CANDY.download_bin(important, WORK_DIR.file(
                '%s 重点难点.html' % counter))
            outline.write('教学设计', counter, 2, sign='*')
            CANDY.download_bin(instructional_design,
                               WORK_DIR.file('%s 教学设计.html' % counter))
            outline.write('评价考核', counter, 2, sign='+')
            with open(WORK_DIR.file('%s 评价考核.html' % counter), 'w', encoding='utf_8') as file:
                file.write(exam_contents)
            outline.write('教材内容', counter, 2, sign='+')
            with open(WORK_DIR.file('%s 教材内容.html' % counter), 'w', encoding='utf_8') as file:
                file.write(textbook_contents)
        except Exception:  # deliberately best-effort; lets Ctrl-C through
            pass

        lessons = chapter.find('ul', class_='chapter-body-l').contents
        for lesson in lessons:
            if len(lessons) == 1:
                # Single-lesson chapter: the chapter itself acts as the lesson.
                counter.add(1)
                lesson_id = chapter_id
                lesson_name = chapter_name
            else:
                if lesson.name is None:
                    continue
                counter.add(1)
                lesson_info = lesson.find(
                    'a', class_='chapter-body-content-text')
                lesson_id = lesson_info.attrs['data-secid']
                lesson_name = lesson_info.text.replace('\n', '')
            rej = CANDY.post(
                'http://www.icourses.cn/web//sword/portal/getRess', data={'sectionId': lesson_id}).json()

            outline.write(lesson_name, counter, 1)

            for resource in rej['model']['listRes']:
                if resource['mediaType'] == 'mp4':
                    counter.add(2)
                    outline.write(resource['title'], counter, 2, sign='#')
                    video_list.append(
                        Video(counter, resource['title'], resource))
            counter.reset()

            for resource in rej['model']['listRes']:
                if resource['mediaType'] in ['pdf', 'ppt']:
                    counter.add(2)
                    outline.write(resource['title'], counter, 2, sign='*')
                    if CONFIG['doc']:
                        pdf_list.append(
                            Document(counter, resource['title'], resource))
            counter.reset()

    if video_list:
        rename = WORK_DIR.file('Names.txt') if CONFIG['rename'] else False
        WORK_DIR.change('Videos')
        playlist = get_playlist(CONFIG["playlist_type"], CONFIG["playlist_path_type"])
        if playlist:
            parse_res_list(video_list, rename, playlist.write, parse_resource)
        else:
            parse_res_list(video_list, rename, parse_resource)
    if pdf_list:
        WORK_DIR.change('PDFs')
        parse_res_list(pdf_list, None, parse_resource)
174 |
175 |
def start(url, config, cookies=None):
    """Entry point called by mooc.py (cookies unused: need_cookies is False)."""

    # initialise global settings
    global WORK_DIR
    CONFIG.update(config)

    # course info: (course_id, directory name)
    course_id, dir_name = get_summary(url)

    # create the course directory tree
    WORK_DIR = WorkingDir(CONFIG['dir'], dir_name)
    WORK_DIR.change('Videos')
    FILES['renamer'] = Renamer(WORK_DIR.file('Rename.{ext}'))
    FILES['video'] = ClassicFile(WORK_DIR.file('Videos.txt'))

    # collect all resources
    get_resource(course_id)

    # shared with mooc.py for the optional aria2 download stage
    exports.update({
        "workdir": WORK_DIR,
        "spider": CANDY,
        "videos": VIDEOS,
    })
201 |
--------------------------------------------------------------------------------
/moocs/livedu.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """北京高校优质课程研究会"""
3 |
4 | import time
5 |
6 | from bs4 import BeautifulSoup
7 |
8 | from moocs.utils import *
9 | from utils.crawler import Crawler
10 |
11 | name = "livedu"
12 | need_cookies = True
13 | CANDY = Crawler()
14 | CONFIG = {}
15 | FILES = {}
16 | VIDEOS = []
17 | exports = {}
18 | __all__ = ["name", "need_cookies", "start", "exports"]
19 |
20 |
def get_summary(url):
    """Fetch the study page and the course home page; return (course_id, dir name).

    Side effects: caches course_id, the parsed study-page soup and the
    chapter name list in CONFIG for later use by get_resource().
    """

    # BUG FIX: restored the named group that .group('course_id') relies on;
    # without it the pattern is an invalid regex.
    course_id = re.search(r'kcid=(?P<course_id>\d+)', url).group('course_id')
    data = {
        'kcid': course_id,
        'kcdm': course_id,
    }
    res = CANDY.post(CONFIG['study_page'], data=data)
    study_soup = BeautifulSoup(res.text, 'html.parser')
    name = study_soup.find(
        'dl', class_='content-a-title').find('dt').find('span').string

    home_text = CANDY.get(url).text
    home_soup = BeautifulSoup(home_text, 'html.parser')
    chapter_names = []
    # Two page layouts exist. Chapters are collected with insert(0, ...) so
    # the list ends up reversed, ready to be pop()ed in document order.
    if home_soup.find('div', class_='vice-main-kcap'):
        for chapter_lable in home_soup.find('div', class_='vice-main-kcap')\
                .find('ul')\
                .children:
            try:
                chapter_names.insert(
                    0, chapter_lable.find('div').find('span').string)
            except Exception:
                # text nodes between list items have no .find(); skip them
                pass
    else:
        for chapter_lable in home_soup.find('div', id='accordion')\
                .find_all('h3'):
            chapter_names.insert(0, chapter_lable.text)

    dir_name = course_dir(name, '北京高校优质课程研究会')

    print(dir_name)

    CONFIG['course_id'] = course_id
    CONFIG['study_soup'] = study_soup
    CONFIG['chapter_names'] = chapter_names
    return course_id, dir_name
59 |
60 |
def parse_resource(resource):
    """Dispatch one resource by type: queue video, download PDF, save HTML."""

    file_name = resource.file_name
    if resource.type == 'Video':
        # resource.meta holds the direct mp4 URL
        ext = '.mp4'
        if WORK_DIR.need_download(file_name+ext, CONFIG["overwrite"]):
            resource.ext = ext
            FILES['renamer'].write(
                re.search(r'(\w+\.mp4)', resource.meta).group(1), file_name, ext)
            FILES['video'].write_string(resource.meta)
            VIDEOS.append((resource.meta, file_name+ext))

    elif resource.type == 'Document':
        # resource.meta holds the document URL
        if not WORK_DIR.need_download(file_name+".pdf", CONFIG["overwrite"]):
            return
        CANDY.download_bin(resource.meta, WORK_DIR.file(file_name + '.pdf'))

    elif resource.type == 'Rich':
        # resource.meta holds ready-made HTML content
        if not WORK_DIR.need_download(file_name+".html", CONFIG["overwrite"]):
            return
        with open(WORK_DIR.file(file_name + '.html'), 'w', encoding='utf_8') as file:
            file.write(resource.meta)
84 |
85 |
def get_resource(course_id):
    """Walk chapters and lessons of the course and collect all resources.

    Uses the study-page soup cached in CONFIG by get_summary, writes the
    outline, then dispatches the video/pdf/test lists to parse_resource.
    """

    outline = Outline()
    counter = Counter()

    video_list = []
    pdf_list = []
    test_list = []

    study_soup = CONFIG['study_soup']
    chapter_names = CONFIG['chapter_names']
    study_div = study_soup.find('div', class_='ation-a-main')
    left_div = study_div.find('div', class_='xx-main-left')
    info_div = left_div.find('div', class_='xx-left-main')
    chapters = info_div.find_all('dl')
    for chapter in chapters:
        counter.add(0)
        # chapter_names was collected in reverse order, so pop() yields
        # chapters front-to-back.
        chapter_name = chapter_names.pop()
        outline.write(chapter_name, counter, 0)

        lessons = chapter.find_all('dd')
        for lesson in lessons:
            counter.add(1)
            lesson_info = lesson.find('a')
            # FIX: the named group had lost its name ("(?P.+)"), which makes
            # the pattern invalid at compile time.
            lesson_id = re.search(r"xsxx\('(?P<lesson_id>.+)'\)",
                                  lesson_info.attrs.get('onclick')).group('lesson_id')

            data = {
                'kcdm': course_id,
                'zjdm': lesson_id,
            }
            res = CANDY.post(CONFIG['study_page'], data=data)
            soup = BeautifulSoup(res.text, 'html.parser')
            study_div = soup.find('div', class_='ation-a-main')
            right_div = study_div.find('div', class_='xx-main-right')
            study_box = right_div.find('div', class_='xx-main-box')
            lesson_name = study_box.find('h4').contents[1]
            outline.write(lesson_name, counter, 1)
            resource_div = study_box.find('div', class_='study-L-text')

            # GET video url
            video_div = resource_div.find('div', id='videoBj_1')
            if video_div:
                video_url = video_div.find('input', id='sp').attrs.get('value')
                video_name = 'Video:{}'.format(lesson_name)
                outline.write(video_name, counter, 2, sign='#')
                video_list.append(Video(counter, video_name, video_url))

            # GET pdf url
            pdf_iframe = resource_div.find(
                'iframe', attrs={'name': 'pdfContainer'})
            if pdf_iframe:
                pdf_div = pdf_iframe.parent
                pdf_name = pdf_div.find('span').string.replace('.pdf', '')
                # FIX: same missing group name as above.
                pdf_url = re.search(
                    r'cclj=(?P<pdf_url>http.+\.pdf)', pdf_iframe.attrs.get('src')).group('pdf_url')
                outline.write(pdf_name, counter, 2, sign='*')
                if CONFIG['doc']:
                    pdf_list.append(Document(counter, pdf_name, pdf_url))

            # GET test text
            test_div = study_box.find('div', class_='zy-a-list')
            if test_div:
                test_name = 'Test:{}'.format(lesson_name)
                outline.write(test_name, counter, 2, sign='+')
                if CONFIG['text']:
                    test_list.append(
                        RichText(counter, test_name, str(test_div)))

    if video_list:
        rename = WORK_DIR.file('Names.txt') if CONFIG['rename'] else False
        WORK_DIR.change('Videos')
        playlist = get_playlist(CONFIG["playlist_type"], CONFIG["playlist_path_type"])
        if playlist:
            parse_res_list(video_list, rename, playlist.write, parse_resource)
        else:
            parse_res_list(video_list, rename, parse_resource)
    if pdf_list:
        WORK_DIR.change('PDFs')
        parse_res_list(pdf_list, None, parse_resource)
    if test_list:
        WORK_DIR.change('Texts')
        parse_res_list(test_list, None, parse_resource)
171 |
172 |
def start(url, config, cookies=None):
    """Entry point called by the dispatcher."""

    global WORK_DIR

    # Bootstrap the crawler and configuration.
    CANDY.set_cookies(cookies)
    CONFIG.update(config)
    CONFIG['study_page'] = 'http://www.livedu.com.cn/ispace4.0/moocxsxx/queryAllZjByKcdm.do'

    # Course metadata.
    course_id, dir_name = get_summary(url)

    # Create the course directory tree and the bookkeeping files.
    WORK_DIR = WorkingDir(CONFIG['dir'], dir_name)
    WORK_DIR.change('Videos')
    FILES['renamer'] = Renamer(WORK_DIR.file('Rename.{ext}'))
    FILES['video'] = ClassicFile(WORK_DIR.file('Videos.txt'))

    # Collect resources.
    get_resource(course_id)

    exports.update({
        "workdir": WORK_DIR,
        "spider": CANDY,
        "videos": VIDEOS,
    })
200 |
--------------------------------------------------------------------------------
/moocs/open_163.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """网易公开课"""
3 |
4 | import time
5 |
6 | from bs4 import BeautifulSoup
7 | from Crypto.Cipher import AES
8 |
9 | from moocs.utils import *
10 | from utils.crawler import Crawler
11 |
12 | name = "open_163"
13 | need_cookies = False
14 | CANDY = Crawler()
15 | CONFIG = {}
16 | FILES = {}
17 | VIDEOS = []
18 | exports = {}
19 | __all__ = ["name", "need_cookies", "start", "exports"]
20 |
21 |
def get_summary(url):
    """Collect (link, name) pairs for every lecture plus the directory name.

    Handles both the course home page (open.163.com/special/...) and a
    lecture page (used when a course has no home page).

    Returns (links, dir_name) and caches links into CONFIG.
    """

    res = CANDY.get(url).text
    soup = BeautifulSoup(res, 'html.parser')
    links = []
    # FIX: escape the dots so the pattern matches the literal host name
    # instead of any character.
    if re.match(r'https?://open\.163\.com/special/', url):
        # Parse lecture links from the course home page.
        names = soup.find_all('div', class_='g-container')[1]
        organization = names.find('a').string.strip()
        course = names.find('span', class_='pos').string.strip()
        list1 = soup.find('table', id='list2')
        tds = list1.find_all('td', class_="u-ctitle")

        for td in tds:
            a = td.find('a')
            links.append((a.get('href'), a.string))

    else:
        # Parse lecture links from a lecture page (some courses have no
        # course home page).
        names = soup.find('p', class_='bread').find_all('a', class_='f-c9')
        organization = names[0].string.strip()
        course = names[1].string.strip()
        listrow = soup.find('div', class_='listrow')
        for item in listrow.find_all('div', class_='item'):
            p = item.find('p', class_='f-thide')
            if p.find('a'):
                a = p.find('a')
                links.append((a.get('href'), a.string))
            else:
                # The current lecture carries no link of its own.
                links.append((url, p.string.split(']')[-1]))

    dir_name = course_dir(course, organization)

    print(dir_name)

    CONFIG['links'] = links
    return links, dir_name
60 |
61 |
def parse_resource(resource):
    """Resolve a video's real URL (and subtitles) and record it for download.

    Fetches the per-video XML descriptor, decrypts the candidate URLs and
    picks the best available resolution/format according to CONFIG.
    """

    def open_decrypt(hex_string, t):
        """Decrypt a hex-encoded URL with the site's fixed AES-ECB key #t."""
        CRYKey = {1: b"4fxGZqoGmesXqg2o", 2: b"3fxVNqoPmesAqg2o"}
        aes = AES.new(CRYKey[t], AES.MODE_ECB)
        # \x06/\x08 are padding bytes left over after ECB decryption.
        return str(aes.decrypt(bytes.fromhex(hex_string)), encoding='gbk', errors="ignore").replace('\x08', '').replace('\x06', '')

    def update_hex_urls(node, hex_urls):
        """Collect {resolution: {format: hex_url}} entries from an XML node."""
        for child in node.children:
            sp = child.name
            if not hex_urls.get(sp):
                hex_urls[sp] = {}
            for hex_url_tag in child.children:
                hex_urls[sp][hex_url_tag.name] = hex_url_tag.string

    link = resource.meta
    file_name = resource.file_name
    video_info = link.replace('.html', '').split('/')[-1]
    # The last two characters of the id are used as path segments on the
    # media server (presumably sharding — inferred from the URL layout;
    # confirm against live data).
    xml_url = 'http://live.ws.126.net/movie/' + \
        video_info[-2] + '/' + video_info[-1] + '/2_' + video_info + '.xml'
    res = CANDY.get(xml_url)
    res.encoding = 'gbk'

    # Parse the XML descriptor.
    soup = BeautifulSoup(res.text, 'lxml')
    name = soup.find('title').string
    encrypt = int(soup.find('encrypt').string)
    hex_urls = {}
    update_hex_urls(soup.find('flvurl'), hex_urls)
    update_hex_urls(soup.find('flvurlorigin'), hex_urls)
    update_hex_urls(soup.find('playurl'), hex_urls)
    update_hex_urls(soup.find('playurl_origin'), hex_urls)
    subs = {}
    for sub in soup.find('subs'):
        subs[sub.find('name').string] = sub.find('url').string

    formats = ['mp4', 'flv']
    resolutions = ['shd', 'hd', 'sd']
    # Rotate so the preferred resolution comes first, then fall back.
    resolutions = resolutions[CONFIG['resolution']:] + \
        list(reversed(resolutions[:CONFIG['resolution']]))
    modes = ((sp, ext) for sp in resolutions for ext in formats)
    for sp, ext in modes:
        if hex_urls.get(sp):
            if hex_urls[sp].get(ext):
                hex_url = hex_urls[sp][ext]
                video_url = open_decrypt(hex_url, encrypt)
                ext = video_url.split('.')[-1]  # Fix the extension: some "mp4" entries decrypt to flv URLs.
                if ext in formats:
                    ext = '.' + ext
                    resource.ext = ext
                    break

    # NOTE(review): if no candidate URL matched above, video_url/ext are
    # unbound here and the next line raises NameError — confirm intent.
    if WORK_DIR.need_download(file_name+ext, CONFIG["overwrite"]):
        FILES['renamer'].write(re.search(r'(\w+\%s)' %
                                         ext, video_url).group(1), file_name, ext)
        FILES['video'].write_string(video_url)
        VIDEOS.append((video_url, file_name+ext))

    if not CONFIG['sub']:
        return
    for subtitle_lang, subtitle_url in subs.items():
        if len(subs) == 1:
            # A single subtitle gets no language suffix.
            sub_name = file_name + '.srt'
        else:
            sub_name = file_name + '_' + subtitle_lang + '.srt'
        if WORK_DIR.need_download(sub_name, CONFIG["overwrite"]):
            CANDY.download_bin(subtitle_url, WORK_DIR.file(sub_name))
132 |
133 |
def get_resource(links):
    """Build the video list from (link, name) pairs and dispatch parsing."""

    outline = Outline()
    counter = Counter(1)

    videos = []
    for link, title in links:
        counter.add(0)
        outline.write(title, counter, 0, sign='#')
        videos.append(Video(counter, title, link))

    if not videos:
        return

    rename = WORK_DIR.file('Names.txt') if CONFIG['rename'] else False
    WORK_DIR.change('Videos')
    playlist = get_playlist(CONFIG["playlist_type"], CONFIG["playlist_path_type"])
    if playlist:
        # The playlist entry is written after parsing so the corrected
        # extension is used.
        parse_res_list(videos, rename, parse_resource, playlist.write)
    else:
        parse_res_list(videos, rename, parse_resource)
155 |
156 |
def start(url, config, cookies=None):
    """Entry point called by the dispatcher (no login required here)."""

    global WORK_DIR

    CONFIG.update(config)

    # Course metadata.
    links, dir_name = get_summary(url)

    # Create the course directory tree and the bookkeeping files.
    WORK_DIR = WorkingDir(CONFIG['dir'], dir_name)
    WORK_DIR.change('Videos')
    FILES['renamer'] = Renamer(WORK_DIR.file('Rename.{ext}'))
    FILES['video'] = ClassicFile(WORK_DIR.file('Videos.txt'))

    # Collect resources.
    get_resource(links)

    exports.update({
        "workdir": WORK_DIR,
        "spider": CANDY,
        "videos": VIDEOS,
    })
182 |
--------------------------------------------------------------------------------
/moocs/study_163.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """网易云课堂"""
3 |
4 | import time
5 | from urllib import parse
6 |
7 | import requests
8 |
9 | from moocs.utils import *
10 | from utils.crawler import Crawler
11 |
12 | name = "study_163"
13 | need_cookies = False
14 | CANDY = Crawler()
15 | CONFIG = {}
16 | FILES = {}
17 | VIDEOS = []
18 | exports = {}
19 | __all__ = ["name", "need_cookies", "start", "exports"]
20 |
21 |
def get_summary(url):
    """Extract the course id and name and derive the directory name.

    Returns (course_id, dir_name) and caches course_id into CONFIG.
    """

    res = requests.get(url).text

    # FIX: run the courseId search once instead of twice.
    match = re.search(r'courseId=(\d+)', url)
    if match:
        course_id = match.group(1)
    else:
        course_id = re.search(r'introduction/(\d+)\.htm', url).group(1)
    name = re.search(r'(.+) - 网易云课堂', res).group(1)

    dir_name = course_dir(name, '网易云课堂')

    print(dir_name)

    CONFIG['course_id'] = course_id
    return course_id, dir_name
39 |
40 |
def parse_resource(resource):
    """Resolve a resource's download URL via the DWR API and record/fetch it."""

    file_name = resource.file_name
    if resource.type == 'Video':
        # DWR remote call: LessonLearnBean.getVideoLearnInfo(lessonId, courseId).
        post_data = {'callCount': '1', 'scriptSessionId': '${scriptSessionId}190',
                     'httpSessionId': 'b1a6d411df364e51833ac11570fc3f07', 'c0-scriptName': 'LessonLearnBean',
                     'c0-methodName': 'getVideoLearnInfo', 'c0-id': '0', 'c0-param0': 'string:' + resource.meta[1],
                     'c0-param1': 'string:' + CONFIG['course_id'],
                     'batchId': str(int(time.time() * 1000))}
        res = CANDY.post('https://study.163.com/dwr/call/plaincall/LessonLearnBean.getVideoLearnInfo.dwr',
                         data=post_data).text.encode('utf_8').decode('unicode_escape')
        video_info = re.search(
            r'signature="(\w+)";.+videoId=(\d+);[\s\S]+name:"(.+?)",', res).group(1, 2, 3)
        data = CANDY.post('https://vod.study.163.com/eds/api/v1/vod/video', data={
            'videoId': video_info[1],
            'signature': video_info[0],
            'clientType': '1'
        }).json()

        # Try qualities from the best allowed downwards (3 highest ... 1 lowest).
        resolutions = [3, 2, 1]
        for sp in resolutions[CONFIG['resolution']:]:
            # TODO: add video format selection
            for video in data['result']['videos']:
                if video['quality'] == sp and video['format'] == 'mp4':
                    url = video['videoUrl']
                    ext = '.mp4'
                    break
            else:
                continue
            break
        # NOTE(review): if no mp4 matched any quality, url/ext are unbound
        # here and the next line raises NameError — confirm intent.
        if WORK_DIR.need_download(file_name + ext, CONFIG["overwrite"]):
            FILES['renamer'].write(
                re.search(r'(\w+\.mp4)', url).group(1), file_name, ext)
            FILES['video'].write_string(url)
            VIDEOS.append((url, file_name+ext))
        resource.ext = ext

        if not CONFIG['sub']:
            return
        # No subtitle source found yet; the API likely exposes them at
        # data['result']['srtCaptions'].

    elif resource.type == 'Document':
        if not WORK_DIR.need_download(file_name+".pdf", CONFIG["overwrite"]):
            return
        # DWR remote call: LessonLearnBean.getTextLearnInfo(lessonId, courseId).
        post_data = {
            'callCount': '1',
            'scriptSessionId': '${scriptSessionId}190',
            'httpSessionId': 'c4927103a1c042ee95faed758d0db8f8',
            'c0-scriptName': 'LessonLearnBean',
            'c0-methodName': 'getTextLearnInfo',
            'c0-id': '0',
            'c0-param0': 'string:' + resource.meta[1],
            'c0-param1': 'string:' + CONFIG['course_id'],
            'batchId': str(int(time.time() * 1000)),
        }
        res = CANDY.post('https://study.163.com/dwr/call/plaincall/LessonLearnBean.getTextLearnInfo.dwr',
                         data=post_data).text.encode('utf_8').decode('unicode_escape')
        pdf_url = re.search(r'pdfUrl:"(http://.+?)",', res).group(1)
        CANDY.download_bin(pdf_url, WORK_DIR.file(file_name + '.pdf'))
    else:
        # Attached reference file: meta = (id, name, suffix, url).
        if not WORK_DIR.need_download(file_name+resource.meta[2], CONFIG["overwrite"]):
            return
        CANDY.download_bin(resource.meta[3], WORK_DIR.file(
            file_name + resource.meta[2]))
106 |
107 |
def get_resource(course_id):
    """Walk the course plan (chapters -> lessons) and collect all resources.

    Builds the video/pdf/attachment lists from the DWR plan dump, writes the
    outline, then dispatches each list to parse_resource.
    """

    outline = Outline()
    counter = Counter()

    video_list = []
    pdf_list = []
    file_list = []

    # DWR remote call: PlanNewBean.getPlanCourseDetail(courseId, 0, null).
    post_data = {
        'callCount': '1',
        'scriptSessionId': '${scriptSessionId}190',
        'httpSessionId': '89a04ce41c7d42759b0a62efe392e153',
        'c0-scriptName': 'PlanNewBean',
        'c0-methodName': 'getPlanCourseDetail',
        'c0-id': '0',
        'c0-param0': 'string:' + course_id,
        'c0-param1': 'number:0',
        'c0-param2': 'null:null',
        'batchId': str(int(time.time() * 1000)),
    }
    res = CANDY.post('https://study.163.com/dwr/call/plaincall/PlanNewBean.getPlanCourseDetail.dwr',
                     data=post_data).text.encode('utf_8').decode('unicode_escape')

    chapters = re.findall(r'courseId=\d+;.+id=(\d+);.+name="(.+)";', res)
    for chapter in chapters:
        counter.add(0)
        outline.write(chapter[1], counter, 0)

        lessons = re.findall(
            r'chapterId=%s;.+?hasReferences=(\w+);.+?id=(\d+).+?lessonName="(.*?)";.+?type=(\d+);' % chapter[0], res, re.DOTALL)
        for lesson in lessons:
            counter.add(1)
            outline.write(lesson[2], counter, 1)

            # Video lesson (type 2 or 50)
            if lesson[3] == '2' or lesson[3] == '50':
                counter.add(2)
                outline.write(lesson[2], counter, 2, sign='#')
                video_list.append(Video(counter, lesson[2], lesson))
                counter.reset()

            # PDF lesson (type 3)
            elif lesson[3] == '3':
                counter.add(2)
                outline.write(lesson[2], counter, 2, sign='*')
                pdf_list.append(Document(counter, lesson[2], lesson))

            # Attached reference files
            files = []
            # FIX: hasReferences is the literal string 'true'/'false' from the
            # server; compare it directly instead of eval()-ing a capitalized
            # copy of untrusted input.
            if lesson[0].lower() == 'true':
                post_data = {'callCount': '1', 'scriptSessionId': '${scriptSessionId}190',
                             'httpSessionId': 'b1a6d411df364e51833ac11570fc3f07', 'c0-scriptName': 'LessonReferenceBean',
                             'c0-methodName': 'getLessonReferenceVoByLessonId', 'c0-id': '0', 'c0-param0': 'number:' + lesson[1],
                             'batchId': str(int(time.time() * 1000))}
                ref_info = CANDY.post('https://study.163.com/dwr/call/plaincall/LessonReferenceBean.getLessonReferenceVoByLessonId.dwr',
                                      data=post_data).text.encode('utf_8').decode('unicode_escape')
                refs = re.findall(
                    r'id=(\d+);.+name="(.+)";.+suffix="(\.\w+)";.+url="(.+?)";', ref_info)

                for ref in refs:
                    # File names arrive URL-encoded.
                    ref = (ref[0], parse.unquote(ref[1]), ref[2], ref[3])
                    files.append(ref)

            for file in files:
                counter.add(2)
                outline.write(file[1], counter, 2, sign='!')
                if CONFIG['file']:
                    file_list.append(Resource(counter, file[1], file))
                counter.reset()

    if video_list:
        rename = WORK_DIR.file('Names.txt') if CONFIG['rename'] else False
        WORK_DIR.change('Videos')
        playlist = get_playlist(CONFIG["playlist_type"], CONFIG["playlist_path_type"])
        if playlist:
            parse_res_list(video_list, rename, playlist.write, parse_resource)
        else:
            parse_res_list(video_list, rename, parse_resource)
    if pdf_list:
        WORK_DIR.change('PDFs')
        parse_res_list(pdf_list, None, parse_resource)
    if file_list:
        WORK_DIR.change('Files')
        parse_res_list(file_list, None, parse_resource)
194 |
195 |
def start(url, config, cookies=None):
    """Entry point called by the dispatcher."""

    global WORK_DIR

    CONFIG.update(config)

    # Course metadata.
    course_id, dir_name = get_summary(url)

    # Create the course directory tree and the bookkeeping files.
    WORK_DIR = WorkingDir(CONFIG['dir'], dir_name)
    WORK_DIR.change('Videos')
    FILES['renamer'] = Renamer(WORK_DIR.file('Rename.{ext}'))
    FILES['video'] = ClassicFile(WORK_DIR.file('Videos.txt'))

    # Collect resources.
    get_resource(course_id)

    exports.update({
        "workdir": WORK_DIR,
        "spider": CANDY,
        "videos": VIDEOS,
    })
221 |
--------------------------------------------------------------------------------
/moocs/study_mooc.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """网易云课堂 MOOC"""
3 |
4 | import time
5 |
6 | from moocs.utils import *
7 | from utils.crawler import Crawler
8 |
9 | name = "study_mooc"
10 | need_cookies = True
11 | CANDY = Crawler()
12 | CONFIG = {}
13 | FILES = {}
14 | VIDEOS = []
15 | exports = {}
16 | __all__ = ["name", "need_cookies", "start", "exports"]
17 |
18 |
def get_summary(url):
    """Fetch the term id and course/school names from the course page.

    Returns (term_id, dir_name) and caches term_id into CONFIG.
    """

    # The learn/ URL redirects differently; always read the course/ page.
    page = CANDY.get(url.replace('learn/', 'course/')).text

    term_id = re.search(r'termId : "(\d+)"', page).group(1)
    titles = re.findall(r'name:"(.+)"', page)

    dir_name = course_dir(titles[0], titles[1])
    print(dir_name)

    CONFIG['term_id'] = term_id
    return term_id, dir_name
34 |
35 |
def get_announce(term_id):
    """Save all course announcements for the term to Announcements.html."""

    # DWR remote call: CourseBean.getAllAnnouncementByTerm(termId, 1).
    post_data = {'callCount': '1', 'scriptSessionId': '${scriptSessionId}190',
                 'httpSessionId': 'dba4977be78d42a78a6e2c2dd2b9bb42', 'c0-scriptName': 'CourseBean',
                 'c0-methodName': 'getAllAnnouncementByTerm', 'c0-id': '0', 'c0-param0': 'number:' + term_id,
                 'c0-param1': 'number:1', 'batchId': str(int(time.time() * 1000))}
    res = CANDY.post('https://mooc.study.163.com/dwr/call/plaincall/CourseBean.getAllAnnouncementByTerm.dwr',
                     data=post_data).text
    announcements = re.findall(
        r'content="(.*?[^\\])".*title="(.*?[^\\])"', res)

    with open('Announcements.html', 'w', encoding='utf-8') as announce_file:
        for announcement in announcements:
            # Announcement body (undo DWR's unicode escaping).
            announce_content = announcement[0].encode(
                'utf-8').decode('unicode_escape')

            # Announcement title
            announce_title = announcement[1].encode(
                'utf-8').decode('unicode_escape')
            # FIX: the write statement contained a broken string literal; the
            # title is wrapped in a heading so it stands out in the HTML.
            # NOTE(review): the original markup was lost — confirm tag choice.
            announce_file.write('<h1>' + announce_title +
                                '</h1>\n' + announce_content + '\n')
59 |
60 |
def parse_resource(resource):
    """Resolve a resource's real URL via the DWR API and record/fetch it."""

    # DWR remote call: CourseBean.getLessonUnitLearnVo with the term id and
    # the three ids stored in resource.meta (contentId, id, lessonId —
    # presumably; inferred from how get_resource fills meta; confirm).
    post_data = {'callCount': '1', 'scriptSessionId': '${scriptSessionId}190',
                 'httpSessionId': 'b8efd4c73fd1434896507b83de631f0f', 'c0-scriptName': 'CourseBean',
                 'c0-methodName': 'getLessonUnitLearnVo', 'c0-id': '0', 'c0-param0': 'number:' + CONFIG['term_id'],
                 'c0-param1': 'number:' + resource.meta[0], 'c0-param2': 'number:' + resource.meta[1],
                 'c0-param3': 'number:0', 'c0-param4': 'number:' + resource.meta[2],
                 'batchId': str(int(time.time() * 1000))}
    res = CANDY.post('https://mooc.study.163.com/dwr/call/plaincall/CourseBean.getLessonUnitLearnVo.dwr',
                     data=post_data).text

    file_name = resource.file_name
    if resource.type == 'Video':
        signature = re.search(r'signature="(.+?)"', res).group(1)
        data = CANDY.post('https://vod.study.163.com/eds/api/v1/vod/video', data={
            'videoId': resource.meta[0],
            'signature': signature,
            'clientType': '1'
        }).json()

        # Try qualities from the best allowed downwards (3 highest ... 1 lowest).
        resolutions = [3, 2, 1]
        for sp in resolutions[CONFIG['resolution']:]:
            # TODO: add video format selection
            for video in data['result']['videos']:
                if video['quality'] == sp and video['format'] == 'mp4':
                    url = video['videoUrl']
                    ext = '.mp4'
                    break
            else:
                continue
            break
        # NOTE(review): if no mp4 matched any quality, url/ext are unbound
        # here and the next line raises NameError — confirm intent.
        if WORK_DIR.need_download(file_name + ext, CONFIG["overwrite"]):
            FILES['renamer'].write(
                re.search(r'(\w+\.mp4)', url).group(1), file_name, ext)
            FILES['video'].write_string(url)
            VIDEOS.append((url, file_name+ext))
        resource.ext = ext

        if not CONFIG['sub']:
            return
        subtitles = re.findall(r'name="(.+)";.*url="(.*?)"', res)
        WORK_DIR.change('Videos')
        for subtitle in subtitles:
            if len(subtitles) == 1:
                # A single subtitle gets no language suffix.
                sub_name = file_name + '.srt'
            else:
                subtitle_lang = subtitle[0].encode(
                    'utf_8').decode('unicode_escape')
                sub_name = file_name + '_' + subtitle_lang + '.srt'
            if WORK_DIR.need_download(sub_name, CONFIG["overwrite"]):
                CANDY.download_bin(subtitle[1], WORK_DIR.file(sub_name))

    elif resource.type == 'Document':
        if not WORK_DIR.need_download(file_name + '.pdf', CONFIG["overwrite"]):
            return
        pdf_url = re.search(r'textOrigUrl:"(.*?)"', res).group(1)
        CANDY.download_bin(pdf_url, WORK_DIR.file(file_name + '.pdf'))

    elif resource.type == 'Rich':
        if not WORK_DIR.need_download(file_name + '.html', CONFIG["overwrite"]):
            return
        text = re.search(r'htmlContent:"(.*)",id',
                         res.encode('utf_8').decode('unicode_escape'), re.S).group(1)
        with open(WORK_DIR.file(file_name + '.html'), 'w', encoding='utf_8') as file:
            file.write(text)
127 |
128 |
def get_resource(term_id):
    """Walk the term outline and collect video/pdf/rich-text resources.

    Parses the DWR term dump with regexes (chapter -> lesson -> unit),
    writes the outline, downloads attachments inline, then dispatches the
    collected lists to parse_resource.
    """

    outline = Outline()
    counter = Counter()

    video_list = []
    pdf_list = []
    rich_text_list = []

    # DWR remote call: CourseBean.getLastLearnedMocTermDto(termId).
    post_data = {'callCount': '1', 'scriptSessionId': '${scriptSessionId}190',
                 'httpSessionId': 'b8efd4c73fd1434896507b83de631f0f', 'c0-scriptName': 'CourseBean',
                 'c0-methodName': 'getLastLearnedMocTermDto', 'c0-id': '0', 'c0-param0': 'number:' + term_id,
                 'batchId': str(int(time.time() * 1000))}
    res = CANDY.post('https://mooc.study.163.com/dwr/call/plaincall/CourseBean.getLastLearnedMocTermDto.dwr',
                     data=post_data).text.encode('utf_8').decode('unicode_escape')

    chapters = re.findall(r'homeworks=\w+;.+id=(\d+).+name="(.+)";', res)
    for chapter in chapters:
        counter.add(0)
        outline.write(chapter[1], counter, 0)

        lessons = re.findall(
            r'chapterId=' + chapter[0] + r'.+contentType=1.+id=(\d+).+name="(.+)".+test', res)
        for lesson in lessons:
            counter.add(1)
            outline.write(lesson[1], counter, 1)

            # Units with contentType=1 are videos.
            videos = re.findall(r'contentId=(\d+).+contentType=(1).+id=(\d+).+lessonId=' +
                                lesson[0] + r'.+name="(.+)"', res)
            for video in videos:
                counter.add(2)
                outline.write(video[3], counter, 2, sign='#')
                video_list.append(Video(counter, video[3], video))
            counter.reset()

            # Units with contentType=3 are PDF documents.
            pdfs = re.findall(r'contentId=(\d+).+contentType=(3).+id=(\d+).+lessonId=' +
                              lesson[0] + r'.+name="(.+)"', res)
            for pdf in pdfs:
                counter.add(2)
                outline.write(pdf[3], counter, 2, sign='*')
                if CONFIG['doc']:
                    pdf_list.append(Document(counter, pdf[3], pdf))
            counter.reset()

            # Units with contentType=4 are rich texts, possibly with an
            # attached file described by jsonContent.
            rich_text = re.findall(r'contentId=(\d+).+contentType=(4).+id=(\d+).+jsonContent=(.+);.+lessonId=' +
                                   lesson[0] + r'.+name="(.+)"', res)
            for text in rich_text:
                counter.add(2)
                outline.write(text[4], counter, 2, sign='+')
                if CONFIG['text']:
                    rich_text_list.append(RichText(counter, text[4], text))
                if CONFIG['file']:
                    if text[3] != 'null' and text[3] != '""':
                        params = {'nosKey': re.search('nosKey":"(.+?)"', text[3]).group(1),
                                  'fileName': re.search('"fileName":"(.+?)"', text[3]).group(1)}
                        file_name = Resource.file_to_save(params['fileName'])
                        outline.write(file_name, counter, 2, sign='!')

                        # Attachments are downloaded immediately rather than
                        # queued like the other resource kinds.
                        WORK_DIR.change('Files')
                        file_name = '%s %s' % (counter, file_name)
                        if WORK_DIR.need_download(file_name, CONFIG["overwrite"]):
                            CANDY.download_bin('https://www.icourse163.org/course/attachment.htm',
                                               WORK_DIR.file(file_name), params=params, cookies={'STUDY_SESS': None})
            counter.reset()

    if video_list:
        rename = WORK_DIR.file('Names.txt') if CONFIG['rename'] else False
        WORK_DIR.change('Videos')
        playlist = get_playlist(CONFIG["playlist_type"], CONFIG["playlist_path_type"])
        if playlist:
            parse_res_list(video_list, rename, playlist.write, parse_resource)
        else:
            parse_res_list(video_list, rename, parse_resource)
    if pdf_list:
        WORK_DIR.change('PDFs')
        parse_res_list(pdf_list, None, parse_resource)
    if rich_text_list:
        WORK_DIR.change('Texts')
        parse_res_list(rich_text_list, None, parse_resource)
209 |
210 |
def start(url, config, cookies=None):
    """Entry point called by the dispatcher.

    Sets up the crawler and configuration, creates the course directory,
    saves the course announcements, collects all resources and fills
    `exports` for the caller.
    """

    global WORK_DIR
    CANDY.set_cookies(cookies)
    CONFIG.update(config)

    # Course metadata (term_id, directory name).
    course_info = get_summary(url)

    # Create the course directory.
    WORK_DIR = WorkingDir(CONFIG['dir'], course_info[1])

    # FIX: removed a leftover debug print of the term id.
    # Course announcements.
    get_announce(course_info[0])

    WORK_DIR.change('Videos')
    FILES['renamer'] = Renamer(WORK_DIR.file('Rename.{ext}'))
    FILES['video'] = ClassicFile(WORK_DIR.file('Videos.txt'))

    # Collect resources.
    get_resource(course_info[0])

    exports.update({
        "workdir": WORK_DIR,
        "spider": CANDY,
        "videos": VIDEOS
    })
241 |
--------------------------------------------------------------------------------
/moocs/utils.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """核心程序组件"""
3 |
4 | import json
5 | import os
6 | import platform
7 | import re
8 | import subprocess
9 | import sys
10 | import time
11 |
12 | from utils.aria2 import Aria2, Aria2File
13 |
14 | SYS = platform.system()
15 |
16 |
class Resource(object):
    """Base class for all downloadable resources.

    A resource carries a hierarchical id (e.g. '2.3.2'), a display name
    cleaned of leading chapter/section numbering, arbitrary site-specific
    metadata, and an optional feature tag. Subclasses override `type`.

    Class attributes:
        regex_sort: matches leading chapter/section numbering to strip.
        regex_file: matches characters illegal in Windows file names.
        regex_spaces: matches runs of whitespace (collapsed to one space).
        type: resource kind label; 'Resource' for the base class.
    """

    regex_sort = re.compile(r'^[第一二三四五六七八九十\d]+[\s\d._\-章课节讲]*[.\s、\-]\s*\d*')
    regex_file = re.compile(r'[\\/:*?"<>|]')
    regex_spaces = re.compile(r'\s+')
    type = 'Resource'

    def __init__(self, identify, name, meta, feature=None):
        """Store id/meta and normalize the display name."""
        cleaned = Resource.regex_sort.sub('', name)
        cleaned = Resource.regex_spaces.sub(' ', cleaned).strip()
        self.id = str(identify)
        self.name = cleaned
        self.meta = meta
        self.feature = feature

    def __str__(self):
        """A resource prints as its display name."""
        return self.name

    @property
    def file_name(self):
        """Id-prefixed file name without extension, e.g. '2.3.2 name'."""
        return '{} {}'.format(self.id, Resource.regex_file.sub('', self.name))

    def operation(self, *funcs):
        """Apply each given callable to this resource, in order."""
        for func in funcs:
            func(self)

    @staticmethod
    def file_to_save(name):
        """Turn an arbitrary name into a safe file name (no id prefix)."""
        cleaned = Resource.regex_sort.sub('', name)
        cleaned = Resource.regex_spaces.sub(' ', cleaned).strip()
        return Resource.regex_file.sub('', cleaned)
70 |
71 |
class Video(Resource):
    """Video resource.

    Attributes:
        type: always 'Video'.
        ext: default file extension '.mp4'; may be overridden per instance.
    """

    type = 'Video'
    ext = '.mp4'
81 |
82 |
class Document(Resource):
    """Document resource (e.g. a PDF).

    Attributes:
        type: always 'Document'.
    """

    type = 'Document'
91 |
92 |
class RichText(Resource):
    """Rich-text resource (saved as HTML).

    Attributes:
        type: always 'Rich'.
    """

    type = 'Rich'
101 |
102 |
class Attachment(Resource):
    """Attachment resource (downloadable reference file).

    Attributes:
        type: always 'Attachment'.
    """

    type = 'Attachment'
111 |
112 |
class ClassicFile(object):
    """A UTF-8 text file opened for writing.

    Attributes:
        _f: the underlying file object.
        file: the file name or path it was opened with.
    """

    def __init__(self, file):
        """Open `file` for writing in UTF-8 and remember its name."""
        self.file = file
        self._f = open(file, 'w', encoding='utf_8')

    def __del__(self):
        """Close the file and drop both attributes."""
        self._f.close()
        del self._f
        del self.file

    def write_string(self, string):
        """Write one line (a trailing newline is appended automatically)."""
        self._f.write('{}\n'.format(string))
138 |
139 |
class Playlist(ClassicFile):
    """Base class for playlist files."""

    def __init__(self, file, path_type):
        """path_type: 'AP' for absolute paths, 'RP' for paths relative to the playlist file."""
        super().__init__(file)
        self.path_type = path_type

    def switch_path(self, path):
        """Convert a path according to the configured path type."""
        normalized = os.path.normpath(path)
        if self.path_type == 'AP':
            return os.path.abspath(normalized)
        if self.path_type == 'RP':
            return os.path.relpath(normalized, start=os.path.dirname(self.file))
        return normalized

    def write(self, video):
        """Record one Video object's target path in the playlist."""
        entry = os.path.join("Videos", video.file_name + video.ext)
        self.write_string(self.switch_path(entry))
162 |
163 |
class M3u(Playlist):
    """M3U playlist writer (creates Playlist.m3u in the current directory)."""

    def __init__(self, path_type='RP'):
        super().__init__('Playlist.m3u', path_type)
169 |
170 |
class Dpl(Playlist):
    """PotPlayer (DAUM) playlist writer.

    Attributes:
        _count: number of entries written so far.
    """

    def __init__(self, path_type='RP'):
        """Create Playlist.dpl and write the DAUM header."""
        super().__init__('Playlist.dpl', path_type)
        self.write_string('DAUMPLAYLIST\n')
        self._count = 0

    def write(self, video):
        """Append one Video entry (file line + title line) to the playlist."""
        self._count += 1
        index = self._count
        entry = self.switch_path(os.path.join("Videos", video.file_name + video.ext))
        self.write_string('{}*file*{}'.format(index, entry))
        # Title shows the parent id (the last id segment dropped) plus name.
        parent_id = '.'.join(video.id.split('.')[:-1])
        self.write_string('{}*title*{} {}\n'.format(index, parent_id, video.name))
192 |
193 |
class Subtitle(ClassicFile):
    """SRT subtitle file writer.

    Attributes:
        _count: number of cues written so far (SRT cues are 1-based).
    """

    def __init__(self, path):
        super().__init__(path)
        self._count = 0

    @staticmethod
    def time_format(seconds):
        """Format a float number of seconds as an SRT timestamp HH:MM:SS,mmm."""
        ms = int(1000 * (seconds - int(seconds)))
        seconds = int(seconds)
        minutes, sec = seconds // 60, seconds % 60
        hour, min = minutes // 60, minutes % 60
        # FIX: milliseconds must be zero-padded to three digits per the SRT
        # format; "{}" rendered 50 ms as ",50" instead of ",050".
        return "{:02}:{:02}:{:02},{:03}".format(hour, min, sec, ms)

    def write(self, content, from_time, to_time):
        """Append one numbered cue spanning from_time to to_time (seconds)."""
        self._count += 1
        self.write_string(str(self._count))
        self.write_string(
            "{} --> {}".format(self.time_format(from_time), self.time_format(to_time)))
        self.write_string(content + "\n")
216 |
217 |
class Renamer(ClassicFile):
    """Batch-rename script writer (.bat on Windows, .sh elsewhere)."""

    ext = 'bat' if SYS == 'Windows' else 'sh'

    def __init__(self, file):
        """Open the script file; on Windows switch the console to UTF-8."""
        super().__init__(file.format(ext=Renamer.ext))
        if SYS == 'Windows':
            self.write_string('CHCP 65001\n')

    def write(self, origin_name, file_name, ext='.mp4'):
        """Write one command renaming the original (URL) file name to
        the desired name plus extension."""
        template = 'REN "%s" "%s%s"' if SYS == 'Windows' else 'mv "%s" "%s%s"'
        self.write_string(template % (origin_name, file_name, ext))
239 |
240 |
class Outline(ClassicFile):
    """Course outline writer (Outline.txt).

    Attributes:
        res_type: maps a one-character sign to a resource-type label.
    """

    res_type = {'#': '【视频】', '!': '【附件】', '*': '【文档】',
                '+': '【富文本】', '&': '【字幕】', '': ''}

    def __init__(self):
        """Create the Outline.txt file."""
        super().__init__('Outline.txt')

    def write(self, string, counter, level=2, sign=''):
        """Record one outline entry at the given 0-based level and
        print it (with its resource-type label) as progress output."""
        indent = ' ' * level
        print('%s%s%s' % (indent, Outline.res_type[sign], string))
        self.write_string('%s%s {%s}%s' % (indent, string,
                                           counter[level], sign))
262 |
263 |
class WorkingDir(object):
    """Download-directory helper.

    Creates directories, "selects" subdirectories and builds file paths.

    Attributes:
        base_dir: root directory; every path is built from it.
        path: the currently selected directory (absolute).
    """

    def __init__(self, *base_dirs):
        """Join the parts into a directory, create it if missing, and
        chdir into it."""
        root = os.path.join(*base_dirs)
        if not os.path.isdir(root):
            os.makedirs(root)
        os.chdir(root)
        self.base_dir = os.getcwd()
        self.path = ''

    def change(self, *relative):
        """Select (and create if missing) a subdirectory of base_dir.

        This is a "fake" chdir: it only records the path; combine with
        file() to obtain real locations.
        """
        self.path = os.path.join(self.base_dir, *relative)
        if not os.path.isdir(self.path):
            os.makedirs(self.path)

    def file(self, file_name):
        """Return the full path of *file_name* under the selected dir."""
        return os.path.join(self.path, file_name)

    def exist(self, file_name):
        """Whether *file_name* exists under the selected dir."""
        return os.path.exists(os.path.join(self.path, file_name))

    def need_download(self, file_name, overwrite=False):
        """Whether the file should be downloaded; also prints its status."""
        need = overwrite or not self.exist(file_name)
        res_print(file_name, sign=">" if need else "!")
        return need
311 |
312 |
class Counter(object):
    """Hierarchical section counter.

    Attributes:
        counter: one integer per level.
    """

    def __init__(self, num_level=3):
        """Create *num_level* counters, all starting at zero."""
        self.num_level = num_level
        self.counter = [0 for _ in range(num_level)]

    def add(self, level):
        """Increment the counter at *level*; all deeper levels restart at 0."""
        for deeper in range(level + 1, self.num_level):
            self.counter[deeper] = 0
        self.counter[level] += 1

    def __str__(self):
        """All levels joined with dots, e.g. ``'2.1.3'``."""
        return '.'.join(str(part) for part in self.counter)

    def __getitem__(self, index):
        """Levels 0..index (inclusive) joined with dots."""
        return '.'.join(str(part) for part in self.counter[:index + 1])

    def reset(self):
        """Zero only the deepest (last) level."""
        self.counter[-1] = 0
347 |
348 |
def res_print(file_name, sign=">"):
    """Print a file that is about to be produced, prefixed with a
    status sign ('>' = will download, '!' = already present)."""
    prefix = '------{}'.format(sign)
    print(prefix, file_name)
353 |
354 |
def course_dir(course_name, institution):
    """Build a safe directory name from course and institution names,
    stripping characters that are illegal in file names."""
    raw = '%s - %s' % (course_name, institution)
    return Resource.regex_file.sub('', raw)
359 |
def file_input(file, origin_text="", message=""):
    """Collect user input by opening *file* in an external editor.

    Writes *origin_text* to the file, opens a platform editor, waits
    for the user to confirm via Enter, then reads the edited content
    back, deletes the file and returns the content.
    """
    with open(file, 'w', encoding='utf8') as f:
        f.write(origin_text)

    if SYS == 'Windows':
        os.startfile(file)
    elif SYS == 'Linux':
        subprocess.run('gedit "%s"' % file, shell=True, stdout=subprocess.PIPE)
    elif SYS == 'Darwin':
        subprocess.run('open -t "%s"' % file, shell=True, stdout=subprocess.PIPE)

    input(message)
    with open(file, 'r', encoding='utf8') as f:
        edited = f.read()
    os.remove(file)
    return edited
379 |
380 |
def parse_res_list(res_list, file, *operator):
    """Run every *operator* callable on each resource, optionally
    letting the user rename the resources first.

    If *file* is truthy it is used as a temp file for file_input() so
    the user can edit one name per line; otherwise renaming is skipped.
    """
    if file:
        names_text = '\n'.join(str(res) for res in res_list)
        edited = file_input(file, origin_text=names_text,
                            message='修改完文件名后按回车继续。')
        for res, new_name in zip(res_list, edited.split('\n')):
            res.name = new_name
            res.operation(*operator)
    else:
        for res in res_list:
            res.operation(*operator)
393 |
394 |
def store_cookies(mooc_type, restore=False):
    """Return the cookie dict for *mooc_type*, prompting if needed.

    Cookies are cached in cookies.json next to the program.  When
    *restore* is true or no cookie is cached for this site, the user
    is prompted and the cache file is rewritten.
    """

    def cookie_input():
        # On macOS a long cookie string can block stdin, so go through a file.
        if SYS == 'Darwin':
            return file_input('cookies_tmp.txt',
                              message='输入 Cookie 后保存,并回到终端回车继续...')
        print('输入 Cookie:')
        return input('> ')

    def cookie_to_json(raw_cookies):
        """Turn a semicolon-separated cookie string into a dict."""
        if not raw_cookies:
            return {}
        # Tolerate a pasted "Cookie:" header prefix.
        if raw_cookies[:7].lower() == 'cookie:':
            raw_cookies = raw_cookies[7:]
        pairs = (cookie.strip().split("=", 1)
                 for cookie in raw_cookies.split(';'))
        return {key: value for key, value in pairs}

    file_path = os.path.join(sys.path[0], "cookies.json")
    if os.path.isfile(file_path):
        with open(file_path, 'r') as cookies_file:
            cookies = json.load(cookies_file)
    else:
        cookies = {}

    if restore or not cookies.get(mooc_type):
        cookies[mooc_type] = cookie_to_json(cookie_input())
        with open(file_path, 'w') as f:
            json.dump(cookies, f, indent=2)

    return cookies[mooc_type]
436 |
437 |
def size_format(size, ndigits=2):
    """Format a byte count as a human-readable string.

    Args:
        size: number of bytes (may be negative).
        ndigits: decimal places to keep.

    Returns:
        e.g. ``"1.50 kB"``.  Sizes below 1 kB — including 0, which
        previously produced ``"0.00 "`` with an empty unit — use
        "Bytes".
    """
    flag = '-' if size < 0 else ''
    size = abs(size)
    units = ["Bytes", "kB", "MB", "GB", "TB", "PB", "EB", "ZB", "YB", "BB"]
    # Default to the smallest unit so size == 0 still gets a label.
    unit, unit_size = units[0], 1
    for idx in range(len(units) - 1, -1, -1):
        if size >= 2 ** (idx * 10):
            unit, unit_size = units[idx], 2 ** (idx * 10)
            break
    return "{}{:.{}f} {}".format(flag, size / unit_size, ndigits, unit)
453 |
454 |
def get_playlist(playlist_type, path_type):
    """Return a playlist object of the requested type.

    Args:
        playlist_type: 'no' (no playlist), 'dpl' or 'm3u'.
        path_type: forwarded to the playlist constructor ('AP' or 'RP').

    Returns:
        A Dpl or M3u instance, or None for 'no'.

    Raises:
        ValueError: for an unknown playlist type (previously this
        surfaced as an UnboundLocalError).
    """
    if playlist_type == 'no':
        return None
    if playlist_type == 'dpl':
        return Dpl(path_type=path_type)
    if playlist_type == 'm3u':
        return M3u(path_type=path_type)
    raise ValueError("unknown playlist type: %r" % playlist_type)
465 |
466 |
def aria2_download(videos, workdir, overwrite=False):
    """Download videos with aria2 and show an overall progress bar.

    Args:
        videos: iterable of (url, file_name) pairs.
        workdir: directory the files are downloaded into.
        overwrite: discard any partial data and re-download.

    Blocks until every download reports "complete".
    """

    aria2 = Aria2()
    files = []

    for url, file_name in videos:
        file = Aria2File(aria2, url, file_name, workdir, overwrite=overwrite)
        files.append(file)

    # Progress display.
    process_bar_length = 50
    total_length = sum([file.get_length() for file in files])
    length_flag = False
    while True:
        # Keep re-summing the total until every task reports a non-zero
        # length (a task reports 0 before its size is known).
        if not length_flag:
            length_flag = True
            total_length = 0
            for file in files:
                length = file.get_length()
                if length == 0:
                    length_flag = False
                total_length += length

        speed = sum([file.get_speed() for file in files])
        completed_length = sum([file.get_complete_length() for file in files])
        # Guard against total_length == 0 (all sizes still unknown).
        len_done = (process_bar_length * completed_length // \
            total_length) if total_length else process_bar_length
        len_undone = process_bar_length - len_done
        log_string = '{}{} {}/{} {:12}'.format(
            "#" * len_done, "_" * len_undone, size_format(completed_length),
            size_format(total_length), size_format(speed)+"/s")
        print(log_string, end="\r")  # \r keeps the bar on one line
        time.sleep(1)

        # Move finished temp files to their final names.
        for file in files:
            if file.get_status() == "complete" and not file.renamed:
                file.rename()
        if all([file.get_status() == "complete" for file in files]):
            break

    print("视频已下载全部完成~")
510 |
--------------------------------------------------------------------------------
/moocs/xuetangx.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """学堂在线"""
3 |
4 | import json
5 | import sys
6 |
7 | from bs4 import BeautifulSoup
8 |
9 | from moocs.utils import *
10 | from utils.crawler import Crawler
11 |
12 | name = "xuetangx"
13 | need_cookies = True
14 | BASE_URL = 'http://www.xuetangx.com'
15 | CANDY = Crawler()
16 | CONFIG = {}
17 | FILES = {}
18 | VIDEOS = []
19 | exports = {}
20 | __all__ = ["name", "need_cookies", "start", "exports"]
21 |
22 |
def get_book(url):
    """Download every PDF e-book found on the course's book shelves."""
    nav_page = CANDY.get(url).text
    shelves = set(re.findall(r'/courses/.+/pdfbook/\d/', nav_page))
    for shelf_count, shelf in enumerate(shelves, 1):
        soup = BeautifulSoup(CANDY.get(BASE_URL + shelf).text, 'lxml')
        WORK_DIR.change('Books', str(shelf_count))
        for book_count, book in enumerate(soup.select('#booknav a'), 1):
            file_name = Resource.file_to_save(book.string) + '.pdf'
            if not WORK_DIR.need_download(file_name, CONFIG["overwrite"]):
                continue
            CANDY.download_bin(BASE_URL + book['rel'][0],
                               WORK_DIR.file(file_name))
37 |
38 |
def get_handout(url):
    """Fetch the course handout from the info page and save it to
    Handouts.html."""

    handouts_html = ClassicFile('Handouts.html')
    res = CANDY.get(url).text
    soup = BeautifulSoup(res, 'lxml')
    handouts = soup.find(class_='handouts')

    # Rewrite site-relative links into absolute ones so the saved file
    # still works offline.
    for link in handouts.select('a[href^="/"]'):
        link['href'] = BASE_URL + link['href']
    # NOTE(review): this template appears to have lost its surrounding
    # HTML tags at some point — confirm against version history.
    handouts_html.write_string('\n\n\n讲义\n\n'
                               '\n\n%s\n' % handouts.prettify())
52 |
53 |
def get_video(video):
    """Resolve a video's real URL and record it for download/rename.

    Prefers the high-quality stream ('quality20') and falls back to
    'quality10' when it is absent or empty.
    """
    file_name = video.file_name
    if not WORK_DIR.need_download(file_name + '.mp4', CONFIG["overwrite"]):
        return
    res = CANDY.get('http://xuetangx.com/videoid2source/' + video.meta).text
    # Parse once; the original parsed the JSON twice and used a bare
    # except that swallowed even decode errors and KeyboardInterrupt.
    sources = json.loads(res)['sources']
    try:
        video_url = sources['quality20'][0]
    except (KeyError, IndexError):
        video_url = sources['quality10'][0]
    FILES['videos'].write_string(video_url)
    FILES['renamer'].write(
        re.search(r'(\w+-[12]0.mp4)', video_url).group(1), file_name)
    VIDEOS.append((video_url, file_name + ".mp4"))
68 |
69 |
def get_content(url):
    """Walk the courseware page tree and collect every video.

    Writes the outline as it goes; *counter* numbers every outline
    entry while *video_counter* numbers videos only, so video ids stay
    contiguous.  Finally hands the collected videos to parse_res_list
    for optional renaming, playlist writing and URL resolution.
    """

    outline = Outline()
    counter = Counter()
    video_counter = Counter()
    video_list = []

    courseware = CANDY.get(url).text
    soup = BeautifulSoup(courseware, 'lxml')

    chapters = soup.find(id='accordion').find_all(class_='chapter')
    for chapter in chapters:
        counter.add(0)
        video_counter.add(0)
        chapter_title = chapter.h3.a.get_text(strip=True)
        outline.write(chapter_title, counter, 0)

        sections = chapter.select('ul a')
        for section_info in sections:
            counter.add(1)
            video_counter.add(1)
            section_url = BASE_URL + section_info['href']
            section_title = section_info.p.string.strip()

            outline.write(section_title, counter, 1)

            section_page = CANDY.get(section_url).text
            soup = BeautifulSoup(section_page, 'lxml')

            # Some pages require the MathPlayer plug-in and carry no
            # sequence list; skip the rest of this chapter then.
            try:
                tabs = soup.find(id='sequence-list').find_all('li')
            except AttributeError:
                break
            for tab_count, tab_info in enumerate(tabs, 1):
                counter.add(2)
                # The title attribute may contain newlines or duplicates,
                # so use data-page-title instead.
                tab_title = tab_info.a.get('data-page-title')

                outline.write(tab_title, counter)

                # Generic titles inherit the section title.
                if tab_title == 'Video' or tab_title == '视频' or tab_title == '':
                    tab_title = section_title

                tab_sequence = tab_info.a.get('aria-controls')

                # The tab content is stored as escaped HTML; parse it again.
                tab_escape = soup.find(id=tab_sequence).string
                tab = BeautifulSoup(tab_escape, 'lxml').div.div

                blocks = tab.find_all('div', class_='xblock')
                for block in blocks:
                    try:
                        # A few blocks carry no data-type attribute.
                        block_type = block['data-type']
                    except KeyError:
                        continue
                    if block_type == 'Video':
                        video_counter.add(2)
                        # NOTE(review): an old comment claimed runs of
                        # whitespace are collapsed here, but only strip()
                        # is applied.
                        video_name = block.h2.string.strip()

                        outline.write(video_name, video_counter,
                                      level=3, sign='#')

                        # Generic video names inherit the tab title.
                        if video_name == 'Video' or video_name == '视频' or video_name == '':
                            video_name = tab_title

                        video_id = block.div['data-ccsource']

                        video = Video(video_counter, video_name, video_id)
                        video_list.append(video)

                        if CONFIG['sub']:
                            get_subtitles(block.div['data-transcript-available-translations-url'],
                                          block.div['data-transcript-translation-url'],
                                          video.file_name)
    if video_list:
        WORK_DIR.change('Videos')
        rename = WORK_DIR.file('Names.txt') if CONFIG['rename'] else False
        playlist = get_playlist(CONFIG["playlist_type"], CONFIG["playlist_path_type"])
        if playlist:
            parse_res_list(video_list, rename, playlist.write, get_video)
        else:
            parse_res_list(video_list, rename, get_video)
155 |
156 |
def get_subtitles(available, transcript, file_name):
    """Download the subtitles of one video in every available language.

    Args:
        available: relative URL listing available translations.
        transcript: relative base URL of the transcripts.
        file_name: base name of the subtitle file(s) to write.
    """
    subtitle_available_url = BASE_URL + available
    try:
        subtitle_available = CANDY.get(subtitle_available_url).json()
    except json.decoder.JSONDecodeError:
        # No subtitles for this video.
        return
    WORK_DIR.change('Videos')
    base_subtitle_url = BASE_URL + transcript + '/'
    multi_subtitle = len(subtitle_available) > 1
    # The original used rstrip('available_translations'), which strips a
    # *character set* and could eat part of the preceding path; remove
    # the literal suffix instead.
    suffix = 'available_translations'
    if subtitle_available_url.endswith(suffix):
        download_url = subtitle_available_url[:-len(suffix)] + 'download'
    else:
        download_url = subtitle_available_url + 'download'
    for subtitle_desc in subtitle_available:
        subtitle_url = base_subtitle_url + subtitle_desc
        # Presumably switches the session to this language so the
        # subsequent download returns it — confirm with the site API.
        CANDY.get(subtitle_url)
        if multi_subtitle:
            sub_file_name = file_name + '_' + \
                subtitle_desc.replace('_xuetangx', '') + '.srt'
        else:
            sub_file_name = file_name + '.srt'
        subtitle = CANDY.get(download_url).content
        with open(WORK_DIR.file(sub_file_name), 'wb') as subtitle_file:
            subtitle_file.write(subtitle)
180 |
181 |
def get_summary(url):
    """Derive and print the course folder name from the about page."""
    soup = BeautifulSoup(CANDY.get(url).text, 'lxml')

    course_name = soup.find(id='title1').string
    institution = soup.find(class_='courseabout_text').a.string

    dir_name = course_dir(course_name, institution)
    print(dir_name)
    return dir_name
194 |
195 |
def start(url, config, cookies=None):
    """Module entry point: verify cookies, then fetch books, handouts
    and videos.

    Args:
        url: course about-page URL (ends with 'about').
        config: crawler configuration dict, merged into CONFIG.
        cookies: cookie dict used for authentication.
    """
    global WORK_DIR
    CONFIG.update(config)

    CANDY.set_cookies(cookies)
    status = CANDY.get('http://www.xuetangx.com/header_ajax')
    if status.json()['login']:
        print('验证成功!')
    else:
        print('Cookie 失效。请获取新的 Cookie ')
        sys.exit(1)

    course_name = get_summary(url)

    WORK_DIR = WorkingDir(CONFIG['dir'], course_name)
    WORK_DIR.change('Videos')
    FILES['renamer'] = Renamer(WORK_DIR.file('Rename.{ext}'))
    FILES['videos'] = ClassicFile(WORK_DIR.file('Videos.txt'))

    # url.rstrip('about') stripped the characters {a, b, o, u, t} as a
    # set rather than the literal suffix; remove the suffix explicitly.
    base_url = url[:-len('about')] if url.endswith('about') else url
    handout = base_url + 'info'
    courseware = base_url + 'courseware'

    if CONFIG['doc']:
        # The handout page is a faster entry point for the books.
        get_book(handout)

    get_handout(handout)
    get_content(courseware)

    exports.update({
        "workdir": WORK_DIR,
        "spider": CANDY,
        "videos": VIDEOS
    })
232 |
--------------------------------------------------------------------------------
/moocs/xuetangx_next.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """学堂在线"""
3 |
4 | from moocs.utils import *
5 | from utils.crawler import Crawler
6 |
7 | name = "xuetangx_next"
8 | need_cookies = True
9 | CANDY = Crawler()
10 | CONFIG = {}
11 | FILES = {}
12 | VIDEOS = []
13 | exports = {}
14 | __all__ = ["name", "need_cookies", "start", "exports"]
15 |
16 |
def get_summary(url):
    """Fetch course info and derive the course folder name.

    Returns:
        (cid, sign, dir_name); sign and cid are also stored in CONFIG.
    """
    # .group("sign", "cid") requires *named* groups, which were missing
    # from the pattern ("(?P.+?)" is not even a valid group).
    match = re.match(r"https?://next.xuetangx.com/course/"
                     r"(?P<sign>.+?)/(?P<cid>.+)", url)
    sign, cid = match.group("sign", "cid")

    res = CANDY.get("https://next.xuetangx.com/api/v1/lms/learn/product/info?cid=%s&sign=%s" % (cid, sign))
    course_name = res.json()['data']['classroom_name']
    # The institution name is hard to obtain; use a fixed label for now.
    dir_name = course_dir(course_name, "学堂在线")

    print(dir_name)
    CONFIG['sign'] = sign
    CONFIG['cid'] = cid
    return cid, sign, dir_name
32 |
33 |
def parse_resource(resource):
    """Resolve one Video/Document resource to its real URL(s) and
    record or download it.

    ``resource.meta`` carries ``(item_id, item_info_id)`` as collected
    by get_resource.
    """
    cid, sign = CONFIG['cid'], CONFIG['sign']
    file_name = resource.file_name
    item_id, item_info_id = resource.meta
    res = CANDY.get("https://next.xuetangx.com/api/v1/lms/learn/leaf_info/%s/%s/?sign=%s" % (cid, item_id, sign),
                    headers={"xtbz": "xt"})
    if resource.type == 'Video':
        ccid = res.json()['data']['content_info']['media']['ccid']

        video_url_res = CANDY.get("https://next.xuetangx.com/api/v1/lms/service/playurl/%s/?appid=10000" % ccid)
        sources = video_url_res.json()['data']['sources']
        # Prefer high quality ('20') over low ('10').
        qualitys = ['20', '10']
        for qa in qualitys:
            if sources.get('quality' + qa):
                # The value is a list; multi-part videos have not been
                # observed, so only element 0 is used.
                video_url = sources['quality' + qa][0]
                break
        # NOTE(review): if neither quality exists, video_url below is
        # unbound and raises NameError — confirm this cannot happen.

        ext = '.mp4'
        if WORK_DIR.need_download(file_name + ext, CONFIG["overwrite"]):
            FILES['renamer'].write(video_url.split('?')[0].split('/')[-1], file_name, ext)
            FILES['video'].write_string(video_url)
            VIDEOS.append((video_url, file_name+ext))
        resource.ext = ext

        if not CONFIG['sub']:
            return
        # Multi-language subtitles are not supported yet (lg=0).
        subtitle_res = CANDY.get("https://next.xuetangx.com/api/v1/lms/service/subtitle_parse/?c_d=%s&lg=0" % ccid)
        if subtitle_res.status_code != 200:
            return
        subtitle_json = subtitle_res.json()
        starts, ends, texts = subtitle_json['start'], subtitle_json['end'], subtitle_json['text']
        subtitle = Subtitle(WORK_DIR.file(file_name + '.srt'))
        assert len(starts) == len(ends) == len(texts)
        # start/end appear to be in milliseconds — converted to seconds
        # for Subtitle.write.
        for i in range(len(starts)):
            subtitle.write(texts[i], starts[i]/1000, ends[i]/1000)

    elif resource.type == 'Document':
        if not WORK_DIR.need_download(file_name + '.pdf', CONFIG["overwrite"]):
            return
        # Multiple files per document have not been observed; only the
        # first download entry is used.
        downloads = res.json()['data']['content_info']['download']
        if downloads:
            pdf_url = downloads[0]['file_url']
            CANDY.download_bin(pdf_url, WORK_DIR.file(file_name + '.pdf'))
81 |
82 |
def get_resource(cid, sign):
    """Walk the course chapter tree and collect videos and documents.

    Writes the outline along the way, then hands the collected lists to
    parse_res_list for optional renaming and downloading.
    """
    outline = Outline()
    counter = Counter()

    video_list = []
    pdf_list = []

    res = CANDY.get("https://next.xuetangx.com/api/v1/lms/learn/course/chapter?cid=%s&sign=%s" % (cid, sign),
                    headers={"xtbz": "xt"})
    for chapter in res.json()['data']['course_chapter']:
        counter.add(0)
        outline.write(chapter['name'], counter, 0)

        for section in chapter['section_leaf_list']:
            counter.add(1)
            outline.write(section['name'], counter, 1)

            # Discussions and quizzes (leaf types 4/6) are ignored for now.
            for item in section.get('leaf_list', []):
                counter.add(2)
                item_id, item_name = item['id'], item['name']
                item_type, item_info_id = item['leaf_type'], item['leafinfo_id']
                if item_type == 0:  # video
                    outline.write(item_name, counter, 2, sign='#')
                    video_list.append(Video(counter, item_name, (item_id, item_info_id)))
                elif item_type == 3:  # document
                    # rstrip('.pdf') stripped the characters {., p, d, f}
                    # as a set (e.g. 'speed.pdf' -> 'spee'); cut the
                    # literal suffix instead.
                    if item_name.endswith('.pdf'):
                        item_name = item_name[:-len('.pdf')]
                    outline.write(item_name, counter, 2, sign='*')
                    if CONFIG['doc']:
                        pdf_list.append(Document(counter, item_name, (item_id, item_info_id)))

    if video_list:
        rename = WORK_DIR.file('Names.txt') if CONFIG['rename'] else False
        WORK_DIR.change('Videos')
        playlist = get_playlist(CONFIG["playlist_type"], CONFIG["playlist_path_type"])
        if playlist is not None:
            parse_res_list(video_list, rename, parse_resource, playlist.write)
        else:
            parse_res_list(video_list, rename, parse_resource)
    if pdf_list:
        WORK_DIR.change('PDFs')
        parse_res_list(pdf_list, None, parse_resource)
131 |
132 |
def start(url, config, cookies=None):
    """Module entry point: configure the session and working directory,
    then fetch every course resource."""
    global WORK_DIR
    CANDY.set_cookies(cookies)
    CONFIG.update(config)

    cid, sign, course_name = get_summary(url)

    WORK_DIR = WorkingDir(CONFIG['dir'], course_name)
    WORK_DIR.change('Videos')
    FILES['renamer'] = Renamer(WORK_DIR.file('Rename.{ext}'))
    FILES['video'] = ClassicFile(WORK_DIR.file('Videos.txt'))

    get_resource(cid, sign)

    exports.update(workdir=WORK_DIR, spider=CANDY, videos=VIDEOS)
154 |
--------------------------------------------------------------------------------
/package.json:
--------------------------------------------------------------------------------
1 | {
2 | "name": "course-crawler",
3 | "description": "一个基于 Python 3 的 MOOC 课程下载工具",
4 | "scripts": {
5 | "docs:dev": "vuepress dev docs",
6 | "docs:build": "vuepress build docs",
7 | "deploy": "bash scripts/deploy.sh"
8 | },
9 | "husky": {
10 | "hooks": {
11 | "pre-commit": "pretty-quick --staged"
12 | }
13 | },
14 | "devDependencies": {
15 | "husky": "^3.0.4",
16 | "prettier": "1.18.2",
17 | "pretty-quick": "^1.11.1",
18 | "vuepress": "^1.2.0"
19 | }
20 | }
21 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | requests==2.22.0
2 | beautifulsoup4==4.8.0
3 | lxml==4.4.1
4 | pycryptodome==3.9.0
5 |
--------------------------------------------------------------------------------
/scripts/deploy.sh:
--------------------------------------------------------------------------------
#!/usr/bin/env sh
# Build the VuePress docs and force-push the result to GitHub Pages.

ACCESS_TOKEN=$1
USERNAME=SigureMo # your GitHub user name
REPO=course-crawler # if empty, publish to <USERNAME>.github.io
BRANCH=gh-pages # if empty, publish to the master branch
CNAME="" # the custom domain to publish under, if any

if [ $ACCESS_TOKEN ]
then TOKEN_PREFIX="${ACCESS_TOKEN}@"
else TOKEN_PREFIX=""
fi

if [ $BRANCH ]
then BRANCH_POSTFIX=":${BRANCH}"
else BRANCH_POSTFIX=""
fi

if [ $REPO ]
then REMOTE=https://${TOKEN_PREFIX}github.com/${USERNAME}/${REPO}.git
else REMOTE=https://${TOKEN_PREFIX}github.com/${USERNAME}/${USERNAME}.github.io.git
fi

# Abort on the first error.
set -e

# Build the static site.
npm run docs:build

# Enter the build output directory.
cd docs/.vuepress/dist

# When publishing to a custom domain, emit the CNAME file.
if [ $CNAME ]
then echo $CNAME > CNAME
fi

# Create a throwaway repository and force-push the build.
git init
git config user.name "GitHub Actions"
git config user.email "support@github.com"
git add -A
time=$(date "+%Y-%m-%d %H:%M:%S")
git commit -m "rebuild @${time}"
git push -f $REMOTE master${BRANCH_POSTFIX}

cd -
48 |
--------------------------------------------------------------------------------
/utils/aria2.py:
--------------------------------------------------------------------------------
1 | import requests
2 | import subprocess
3 | import json
4 | import time
5 | import os
6 |
7 | from urllib.request import urlopen
8 |
9 | rpc_url = "http://localhost:{port}/jsonrpc"
10 |
11 |
class Aria2():
    """aria2 JSON-RPC client.

    Each RPC method below is a thin wrapper; the full interface is at
    http://aria2.github.io/manual/en/html/aria2c.html#rpc-interface
    """

    def __init__(self, aria2_path="aria2c", port=6800):
        # Path to the aria2c executable and the local RPC port to use.
        self.port = port
        self.rpc_url = rpc_url.format(port=port)
        self.aria2_path = aria2_path
        # aria2's stdout is redirected into this file.
        self.process_file = open("process.out", "w")
        assert self.is_installed(), "请配置正确的 aria2 路径"
        if not self.is_connected():
            self.process = self.init_rpc()
            # Give aria2 a moment to come up before the first RPC call.
            time.sleep(1)

    def __del__(self):
        """Make sure aria2 is shut down when this object is destroyed."""
        if self.is_connected():
            self.shutdown()
        self.process_file.close()
        try:
            os.remove(self.process_file.name)
        except:
            # NOTE(review): bare except — any failure to delete is only
            # reported, never raised (safe inside __del__).
            print("process.out 自动删除失败……")

    def rpc_api(method):
        """Decorator factory: replace the decorated stub entirely with a
        JSON-RPC call to *method*.

        The stub's own body is never executed; only its signature
        documents the expected arguments.  None arguments are filtered
        out of the params list.
        """
        def rpc_method(func):
            def new_func(self, *args):
                data = {
                    'jsonrpc': '2.0',
                    'id': 'qwer',  # arbitrary fixed request id
                    'method': method,
                    'params': list(filter(lambda arg: arg is not None, args)),
                }
                res = requests.post(
                    self.rpc_url, data=json.dumps(data), timeout=2)
                return res.json()["result"]
            return new_func
        return rpc_method

    @rpc_api(method="aria2.addUri")
    def add_uri(self, uris, options=None, position=None):
        """Add a download task for the given URIs; returns its gid."""
        pass

    @rpc_api(method="aria2.getGlobalStat")
    def get_global_stat(self):
        """Fetch global download statistics."""
        pass

    @rpc_api(method="aria2.shutdown")
    def shutdown(self):
        """Ask aria2 to shut down."""
        pass

    @rpc_api(method="aria2.tellStatus")
    def tell_status(self, gid, keys=None):
        """Fetch the status record of the download identified by *gid*."""
        pass

    def init_rpc(self):
        """Start the aria2 RPC server as a subprocess."""
        cmd = self.aria2_path + \
            ' --enable-rpc' \
            ' --rpc-listen-port %d' \
            ' --continue' \
            ' --max-concurrent-downloads=20' \
            ' --max-connection-per-server=10' \
            ' --rpc-max-request-size=1024M' % self.port

        return subprocess.Popen(cmd, shell=True, stdout=self.process_file)

    def is_connected(self):
        """Whether the aria2 RPC port answers at all."""
        try:
            requests.post(self.rpc_url)
            return True
        except requests.exceptions.ConnectionError:
            return False

    def is_installed(self):
        """Whether the aria2c executable can be run.

        aria2c exits with code 1 when invoked without arguments.
        """
        try:
            return subprocess.run([self.aria2_path], stdout=subprocess.PIPE,
                                  stderr=subprocess.PIPE).returncode == 1
        except FileNotFoundError:
            return False
102 |
103 |
class Aria2File():
    """One download task managed through an Aria2 instance.

    The file is downloaded to "<name>.t" and moved to its final name by
    rename() once complete, so an interrupted download never leaves a
    file that looks finished.
    """

    def __init__(self, aria2, url, file_name, dir, overwrite=False):
        """Queue *url* for download into *dir*/*file_name*."""
        self.aria2 = aria2
        self.path = os.path.join(dir, file_name)
        self.tmp_path = self.path + ".t"
        self.aria2_file = self.tmp_path + ".aria2"
        if overwrite:
            # Drop any stale partial download and its aria2 control file.
            for leftover in (self.tmp_path, self.aria2_file):
                if os.path.exists(leftover):
                    os.remove(leftover)
        self.gid = aria2.add_uri([url], {"dir": dir, "out": file_name + ".t"})
        self.renamed = False

    def _status(self):
        """Fetch the latest aria2 status record for this task."""
        return self.aria2.tell_status(self.gid)

    def get_length(self):
        """Total size in bytes, as reported by aria2."""
        return int(self._status()["totalLength"])

    def get_complete_length(self):
        """Bytes downloaded so far."""
        return int(self._status()["completedLength"])

    def get_status(self):
        """aria2 status string, e.g. 'active' or 'complete'."""
        return self._status()["status"]

    def get_speed(self):
        """Current download speed in bytes per second."""
        return int(self._status()["downloadSpeed"])

    def exists(self):
        """Whether the finished file already exists."""
        return os.path.exists(self.path)

    def rename(self):
        """Move the temporary file onto its final path."""
        if os.path.exists(self.path):
            os.remove(self.path)
        os.rename(self.tmp_path, self.path)
        self.renamed = True
145 |
--------------------------------------------------------------------------------
/utils/crawler.py:
--------------------------------------------------------------------------------
1 | import os
2 | import re
3 |
4 | import requests
5 |
6 |
class Crawler(requests.Session):
    """requests.Session preconfigured with a browser User-Agent, plus
    binary/text download helpers."""

    header = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36',
    }

    def __init__(self):
        super().__init__()
        self.headers.update(Crawler.header)

    def set_cookies(self, cookies):
        """Install a cookie dict on the session."""
        requests.utils.add_dict_to_cookiejar(self.cookies, cookies)

    def download_bin(self, url, file_path, stream=True, chunk_size=1024, **kw):
        """Download a binary file to *file_path* via a ".t" temp file.

        On failure the partial temp file is removed, a warning is
        printed, and the target file is left untouched.
        """
        res = self.get(url, stream=stream, **kw)
        tmp_path = file_path + ".t"
        try:
            with open(tmp_path, "wb") as f:
                if stream:
                    for chunk in res.iter_content(chunk_size=chunk_size):
                        if not chunk:
                            break
                        f.write(chunk)
                else:
                    f.write(res.content)
        except Exception:
            os.remove(tmp_path)
            print("[warn] {} failed to download".format(file_path))
            # Bug fix: previously execution fell through and tried to
            # rename the temp file that was just removed, raising
            # FileNotFoundError on every failed download.
            return
        if os.path.exists(file_path):
            os.remove(file_path)
        os.rename(tmp_path, file_path)

    def download_text(self, url, file_path, **kw):
        """Download text and save it as UTF-8, using the detected
        encoding to decode the response."""
        res = self.get(url, **kw)
        res.encoding = res.apparent_encoding
        with open(file_path, 'w', encoding='utf_8') as f:
            f.write(res.text)
50 |
--------------------------------------------------------------------------------